diff --git a/.github/workflows/dkg-runner.yml b/.github/workflows/dkg-runner.yml index e16b01b7..0d0fdd23 100644 --- a/.github/workflows/dkg-runner.yml +++ b/.github/workflows/dkg-runner.yml @@ -40,11 +40,14 @@ jobs: fail-fast: false matrix: include: - - name: 4 Charon nodes - id: 4-charon - pluto_nodes: 0 - charon_nodes: 4 - + - name: 2 Charon + 2 Pluto nodes + id: 2-charon-2-pluto + pluto_nodes: 2 + charon_nodes: 2 + - name: 4 Pluto nodes + id: 4-pluto + pluto_nodes: 4 + charon_nodes: 0 steps: - name: Checkout uses: actions/checkout@v6 diff --git a/Cargo.lock b/Cargo.lock index bd23ac86..4b5fce85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5639,6 +5639,7 @@ dependencies = [ "pluto-k1util", "pluto-p2p", "pluto-parsigex", + "pluto-peerinfo", "pluto-testutil", "pluto-tracing", "prost 0.14.3", diff --git a/Cargo.toml b/Cargo.toml index d7fb3251..64346f3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ members = [ "crates/peerinfo", "crates/frost", ] +exclude = ["tools/dkg-stress"] resolver = "3" [workspace.package] diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 8c0e1e61..f9ac0fdd 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -32,12 +32,31 @@ async fn run() -> std::result::Result<(), CliError> { let matches = cmd.get_matches(); let cli = Cli::from_arg_matches(&matches)?; - // Top level cancellation token for graceful shutdown on Ctrl+C + // Top level cancellation token for graceful shutdown on Ctrl+C / SIGTERM. let ct = CancellationToken::new(); tokio::spawn({ let ct = ct.clone(); async move { - let _ = tokio::signal::ctrl_c().await; + #[cfg(unix)] + { + use tokio::signal::unix::{SignalKind, signal}; + let mut sigterm = match signal(SignalKind::terminate()) { + Ok(s) => s, + Err(_) => { + let _ = tokio::signal::ctrl_c().await; + ct.cancel(); + return; + } + }; + tokio::select! 
{ + _ = tokio::signal::ctrl_c() => {} + _ = sigterm.recv() => {} + } + } + #[cfg(not(unix))] + { + let _ = tokio::signal::ctrl_c().await; + } ct.cancel(); } }); diff --git a/crates/cluster/src/definition.rs b/crates/cluster/src/definition.rs index 9f3c30ae..41e8d3ba 100644 --- a/crates/cluster/src/definition.rs +++ b/crates/cluster/src/definition.rs @@ -767,7 +767,7 @@ impl Definition { /// Returns true if the provided definition version supports partial /// deposits. - fn support_partial_deposits(version: &str) -> bool { + pub fn support_partial_deposits(version: &str) -> bool { !matches!( version, V1_0 | V1_1 | V1_2 | V1_3 | V1_4 | V1_5 | V1_6 | V1_7 diff --git a/crates/dkg/Cargo.toml b/crates/dkg/Cargo.toml index 284ff539..0cca0559 100644 --- a/crates/dkg/Cargo.toml +++ b/crates/dkg/Cargo.toml @@ -30,6 +30,7 @@ pluto-eth2api.workspace = true pluto-eth1wrap.workspace = true pluto-eth2util.workspace = true pluto-parsigex.workspace = true +pluto-peerinfo.workspace = true pluto-frost.workspace = true async-trait.workspace = true pluto-tracing.workspace = true diff --git a/crates/dkg/src/bcast/error.rs b/crates/dkg/src/bcast/error.rs index 92faf4f9..6e4f0740 100644 --- a/crates/dkg/src/bcast/error.rs +++ b/crates/dkg/src/bcast/error.rs @@ -155,6 +155,10 @@ pub enum Error { #[error("missing protobuf field: {0}")] MissingField(&'static str), + /// A typed broadcast message failed protocol-specific validation. + #[error("invalid message: {0}")] + InvalidMessage(&'static str), + /// Protobuf encoding failed. 
#[error("protobuf encode failed: {0}")] Encode(#[from] prost::EncodeError), diff --git a/crates/dkg/src/disk.rs b/crates/dkg/src/disk.rs index 93d943ad..ff6b7ed6 100644 --- a/crates/dkg/src/disk.rs +++ b/crates/dkg/src/disk.rs @@ -131,7 +131,7 @@ pub async fn load_definition( if conf.no_verify { warn!( error = %error, - "Ignoring failed cluster definition signatures verification due to --no-verify flag" + "Ignoring failed cluster definition signature verification due to --no-verify flag" ); } else { return Err(DiskError::ClusterDefinitionError(error)); diff --git a/crates/dkg/src/dkg.rs b/crates/dkg/src/dkg.rs index d9f9a5d2..5edc76a5 100644 --- a/crates/dkg/src/dkg.rs +++ b/crates/dkg/src/dkg.rs @@ -1,13 +1,17 @@ -use std::{num::TryFromIntError, path, time::Duration}; +use std::{collections::HashMap, ffi::OsStr, num::TryFromIntError, path, time::Duration}; use bon::Builder; -use libp2p::PeerId; +use futures::StreamExt; +use libp2p::{PeerId, swarm::SwarmEvent}; +use pluto_app::{privkeylock, utils::UtilsError}; +use pluto_core::version; +use tokio::select; use tokio_util::sync::CancellationToken; -use tracing::{info, warn}; +use tracing::{debug, error, info, warn}; -use crate::disk; pub use crate::{ aggregate::{AggregateError, agg_deposit_data, agg_lock_hash_sig, agg_validator_registrations}, + exchanger::{Exchanger, SIG_DEPOSIT_DATA, SIG_LOCK, SIG_VALIDATOR_REG}, publish::{PublishError, write_lock_to_api}, share::Share, signing::{SigningError, sign_deposit_msgs, sign_lock_hash, sign_validator_registrations}, @@ -16,19 +20,24 @@ pub use crate::{ set_registration_signature, }, }; +use crate::{disk, frost, frostp2p, nodesigs}; use pluto_cluster::{ - definition::{Definition, ValidatorAddresses}, + definition::{Definition, DefinitionError, ValidatorAddresses}, distvalidator::DistValidatorError, - lock::Lock, + lock::{Lock, LockError}, operator::Operator, + version::versions::*, }; use pluto_crypto::types::PrivateKey; use pluto_eth1wrap::{EthClient, EthClientError}; use 
pluto_eth2api::spec::phase0; +use pluto_eth2util as eth2util; use pluto_eth2util::keymanager::{self, KeymanagerError}; -use pluto_p2p::{config::P2PConfig, peer::Peer}; +use pluto_p2p::{ + behaviours::pluto::PlutoBehaviourEvent, bootnode::BootnodeError, config::P2PConfig, + k1::key_path, p2p::P2PError, peer::Peer, +}; use pluto_tracing::TracingConfig; -use std::collections::HashMap; use url::Url; const DEFAULT_DATA_DIR: &str = ".charon"; @@ -112,6 +121,102 @@ pub enum DkgError { /// Integer overflow. #[error("integer overflow")] IntegerOverflow, + + /// Test-only configuration is not allowed on mainnet. + #[error("cannot use test flags on mainnet")] + TestConfigOnMainnet, + + /// Failed to create private key lock service. + #[error("failed to create private key lock service: {0}")] + PrivKeyLock(#[from] privkeylock::PrivKeyLockError), + + /// Unsupported definition version. + #[error("only v1.6.0 and newer cluster definition versions supported, got: {version}")] + UnsupportedDefinitionVersion { + /// The unsupported version. + version: String, + }, + + /// Failed to convert fork version to network. + #[error("failed to convert fork version to network: {0}")] + ForkVersionToNetwork(#[from] eth2util::network::NetworkError), + + /// Failed to load private key. + #[error("failed to load private key: {0}")] + KeyLoadError(#[from] pluto_p2p::k1::K1Error), + + /// Peer error. + #[error("peer error: {0}")] + PeerError(#[from] pluto_p2p::peer::PeerError), + + /// The local P2P key did not match the definition peer set. + #[error("private key not matching definition file: peer not in definition: {peer_id}")] + LocalPeerNotInDefinition { + /// Local peer ID derived from the P2P private key. + peer_id: PeerId, + }, + + /// Definition error. + #[error("definition error: {0}")] + Definition(#[from] DefinitionError), + + /// Bootnode or relay resolution error. + #[error("bootnode error: {0}")] + Bootnode(#[from] BootnodeError), + + /// Sync protocol error. 
+ #[error("sync error: {0}")] + Sync(#[from] crate::sync::Error), + + /// P2P node setup error. + #[error("p2p error: {0}")] + P2P(#[from] P2PError), + + /// FROST DKG setup or execution failed. + #[error("frost error: {0}")] + Frost(#[from] frost::FrostError), + + /// DKG signing or aggregation failed. + #[error("dkg signing error: {0}")] + Signing(#[from] SigningError), + + /// K1 node-signature exchange failed. + #[error("k1 lock hash signature exchange: {0}")] + NodeSignatures(#[from] nodesigs::Error), + + /// Cluster lock verification failed. + #[error("invalid lock file signatures: {0}")] + LockVerification(#[source] LockError), + + /// Deposit-data file write failed. + #[error("deposit data error: {0}")] + Deposit(#[from] pluto_eth2util::deposit::DepositError), + + /// Output archive creation failed. + #[error("bundle output: {0}")] + BundleOutput(#[from] UtilsError), + + /// Background task failed. + #[error("background task failed: {0}")] + Join(#[from] tokio::task::JoinError), + + /// The configured deposit data does not match deposit amounts. + #[error( + "deposit data length does not match deposit amounts length: deposit_data={deposit_data}, deposit_amounts={deposit_amounts}" + )] + DepositDataLengthMismatch { + /// Deposit-data set count. + deposit_data: usize, + /// Deposit amount count. + deposit_amounts: usize, + }, + + /// The configured DKG algorithm is not supported. + #[error("unsupported dkg algorithm: {algorithm}")] + UnsupportedDkgAlgorithm { + /// Algorithm name from the cluster definition. + algorithm: String, + }, } /// Keymanager configuration accepted by the entrypoint. @@ -186,6 +291,9 @@ pub struct Config { #[builder(default)] pub execution_engine_addr: String, + /// Append configuration. + pub append_config: Option, + /// Whether to bundle the output directory as a tarball. #[builder(default)] pub zipped: bool, @@ -198,9 +306,7 @@ pub struct Config { impl Config { /// Returns `true` if any test-only configuration is active. 
pub fn has_test_config(&self) -> bool { - // TODO: Extend this when more test-only hooks are added to TestConfig, - // so preflight skips stay aligned with the full test configuration. - self.test_config.def.is_some() + self.test_config.def.is_some() || self.test_config.p2p_key.is_some() } } @@ -209,6 +315,9 @@ impl Config { pub struct TestConfig { /// Provides the cluster definition explicitly, skips loading from disk. pub def: Option, + + /// Provides the P2P private key explicitly, skips loading from disk. + pub p2p_key: Option, } /// Configuration used to merge the outcome of two DKG ceremonies. @@ -259,25 +368,615 @@ fn default_tracing_config() -> TracingConfig { .build() } -/// Runs the DKG entrypoint until the unported backend boundary. -pub async fn run(conf: Config, shutdown: CancellationToken) -> Result<(), DkgError> { - if shutdown.is_cancelled() { +/// Runs the DKG entrypoint. +pub async fn run(conf: Config, ct: CancellationToken) -> Result<(), DkgError> { + if ct.is_cancelled() { return Err(DkgError::ShutdownRequestedBeforeStartup); } + let (lock_ct, lock_task) = start_private_key_lock(&conf).await?; + let result = run_inner(conf, ct).await; + + lock_ct.cancel(); + lock_task + .await + .unwrap_or_else(|err| error!(?err, "Error joining private key lock task")); + + result +} + +async fn start_private_key_lock( + conf: &Config, +) -> Result<(CancellationToken, tokio::task::JoinHandle<()>), DkgError> { + let lock_svc = std::sync::Arc::new( + privkeylock::Service::new(private_key_lock_path(&conf.data_dir), "charon dkg").await?, + ); + let lock_ct = CancellationToken::new(); + let task_ct = lock_ct.clone(); + let task = tokio::spawn(async move { + let run_svc = lock_svc.clone(); + let mut run_task = tokio::spawn(async move { run_svc.run().await }); + + select! 
{ + _ = task_ct.cancelled() => { + lock_svc.close().await; + log_private_key_lock_result(run_task.await); + } + result = &mut run_task => log_private_key_lock_result(result), + } + }); + + Ok((lock_ct, task)) +} + +fn log_private_key_lock_result( + result: std::result::Result< + std::result::Result<(), privkeylock::PrivKeyLockError>, + tokio::task::JoinError, + >, +) { + match result { + Ok(Ok(())) => {} + Ok(Err(err)) => error!(?err, "Error locking private key file"), + Err(err) => error!(?err, "Error locking private key file"), + } +} + +fn private_key_lock_path(data_dir: &path::Path) -> path::PathBuf { + let mut lock_path = key_path(data_dir); + let file_name = lock_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or("charon-enr-private-key"); + lock_path.set_file_name(format!("{file_name}.lock")); + lock_path +} + +async fn run_inner(conf: Config, ct: CancellationToken) -> Result<(), DkgError> { + if let Some(append) = &conf.append_config { + append.validate()?; + } + + version::log_info("Charon DKG starting"); + let eth1 = EthClient::new(&conf.execution_engine_addr).await?; - let _definition = disk::load_definition(&conf, ð1).await?; + let ( + def, + total_validators, + new_validators, + new_withdrawal_addresses, + new_fee_recipient_addresses, + ) = if let Some(append) = &conf.append_config { + let def = append.cluster_lock.definition.clone(); + let new_validators = u64::try_from(append.add_validators)?; + let total_validators = def + .num_validators + .checked_add(new_validators) + .ok_or(DkgError::IntegerOverflow)?; + let new_withdrawal_addresses = append + .validator_addresses + .iter() + .map(|addr| addr.withdrawal_address.clone()) + .collect::>(); + let new_fee_recipient_addresses = append + .validator_addresses + .iter() + .map(|addr| addr.fee_recipient_address.clone()) + .collect::>(); + + ( + def, + total_validators, + new_validators, + new_withdrawal_addresses, + new_fee_recipient_addresses, + ) + } else { + let def = 
disk::load_definition(&conf, ð1).await?; + + let total_validators = def.num_validators; + let new_validators = def.num_validators; + let new_withdrawal_addresses = def.withdrawal_addresses(); + let new_fee_recipient_addresses = def.fee_recipient_addresses(); + + ( + def, + total_validators, + new_validators, + new_withdrawal_addresses, + new_fee_recipient_addresses, + ) + }; + + // This DKG only supports a few specific config versions. + if !matches!(def.version.as_str(), V1_6 | V1_7 | V1_8 | V1_9 | V1_10) { + return Err(DkgError::UnsupportedDefinitionVersion { + version: def.version.clone(), + }); + } validate_keymanager_flags(&conf)?; + + // Check if keymanager address is reachable. verify_keymanager_connection(&conf).await?; if !conf.has_test_config() { disk::check_clear_data_dir(&conf.data_dir).await?; } + disk::check_writes(&conf.data_dir).await?; - unimplemented!("DKG ceremony backend is not implemented yet"); + let network = eth2util::network::fork_version_to_network(&def.fork_version)?; + if network == eth2util::network::MAINNET.name && conf.has_test_config() { + return Err(DkgError::TestConfigOnMainnet); + } + + let peers = def.peers()?; + + let def_hash = pluto_cluster::helpers::to_0x_hex(&def.definition_hash); + + let key = if let Some(key) = conf.test_config.p2p_key.clone() { + key + } else { + pluto_p2p::k1::load_priv_key(&conf.data_dir)? 
+ }; + + let peer_id = pluto_p2p::peer::peer_id_from_key(key.public_key())?; + + info!("Starting local P2P networking peer"); + + log_peer_summary(peer_id, &peers, &def.operators); + + let sig_types = vec![SIG_LOCK, SIG_DEPOSIT_DATA, SIG_VALIDATOR_REG]; + let sig_type_set = std::sync::Arc::new(sig_types.iter().copied().collect()); + let num_validators = u32::try_from(new_validators)?; + let (node, mut handlers) = crate::node::setup_p2p( + key.clone(), + &conf, + &peers, + def.definition_hash.clone(), + sig_type_set, + num_validators, + ct.child_token(), + ) + .await?; + + let node_idx = def + .node_idx(node.local_peer_id()) + .map_err(|source| match source { + DefinitionError::PeerNotFound { peer_id } => { + DkgError::LocalPeerNotInDefinition { peer_id } + } + other => DkgError::Definition(other), + })?; + + let peer_ids = def.peer_ids()?; + let exchanger = Exchanger::new( + ct.child_token(), + handlers.parsigex.clone(), + peer_ids, + sig_types, + ) + .await; + + let peer_share_indices = peers + .iter() + .map(|peer| Ok((peer.id, u32::try_from(peer.share_idx())?))) + .collect::, DkgError>>()?; + let local_share_idx = u32::try_from(node_idx.share_idx)?; + let threshold = usize::try_from(def.threshold)?; + let mut frost_transport = frostp2p::new_frost_p2p( + handlers.bcast.clone(), + &mut handlers.frost_p2p, + &peer_share_indices, + local_share_idx, + threshold, + num_validators as usize, + ) + .await?; + let node_sig_caster = nodesigs::NodeSigBcast::new( + peers.clone(), + node_idx.peer_idx, + handlers.bcast.clone(), + ct.child_token(), + ) + .await?; + + let sync_clients = handlers.sync.clone(); + let sync_server = handlers.sync_server.clone(); + let frost_handle = handlers.frost_p2p; + let network_ct = ct.child_token(); + let network_task = tokio::spawn(drive_dkg_network(node, frost_handle, network_ct.clone())); + + let result = run_ceremony( + &conf, + ð1, + ct.child_token(), + def, + total_validators, + new_validators, + new_withdrawal_addresses, + 
new_fee_recipient_addresses, + network, + def_hash, + key, + node_idx, + peers, + exchanger, + &mut frost_transport, + node_sig_caster, + sync_server, + sync_clients, + ) + .await; + + network_ct.cancel(); + let _ = network_task.await; + + result +} + +#[allow(clippy::too_many_arguments, reason = "mirrors the Go DKG run flow")] +async fn run_ceremony( + conf: &Config, + eth1: &EthClient, + ct: CancellationToken, + def: Definition, + total_validators: u64, + new_validators: u64, + new_withdrawal_addresses: Vec, + new_fee_recipient_addresses: Vec, + network: String, + def_hash: String, + key: k256::SecretKey, + node_idx: pluto_cluster::definition::NodeIdx, + peers: Vec, + exchanger: Exchanger, + frost_transport: &mut T, + node_sig_caster: nodesigs::NodeSigBcast, + sync_server: crate::sync::Server, + sync_clients: Vec, +) -> Result<(), DkgError> { + info!("Waiting to connect to all peers..."); + + let mut sync_runtime = start_sync_protocol(sync_server, sync_clients, ct.child_token()).await?; + + info!("All peers connected, starting DKG ceremony"); + + let num_validators = u32::try_from(new_validators)?; + let threshold = u32::try_from(def.threshold)?; + let share_idx = u32::try_from(node_idx.share_idx)?; + + let shares = match def.dkg_algorithm.as_str() { + "default" | "frost" => { + let num_nodes = u32::try_from(peers.len())?; + frost::run_frost_parallel( + ct.child_token(), + frost_transport, + num_validators, + num_nodes, + threshold, + share_idx, + &def_hash, + ) + .await? + } + algorithm => { + return Err(DkgError::UnsupportedDkgAlgorithm { + algorithm: algorithm.to_string(), + }); + } + }; + + // DKG was step 1, advance to step 2. + sync_runtime.next_step().await?; + + let append_config = conf.append_config.as_ref(); + let existing_shares = if append_config.is_some_and(|append| !append.unverified) { + get_existing_shares(append_config)? 
+ } else { + Vec::new() + }; + + if append_config.is_some() { + debug!( + total = total_validators, + added = new_validators, + "Validator keys summary" + ); + } + + let deposit_amounts = deposit_amounts_for_definition(&def); + if let Some(append) = append_config + && !append.deposit_data.is_empty() + && append.deposit_data.len() != deposit_amounts.len() + { + return Err(DkgError::DepositDataLengthMismatch { + deposit_data: append.deposit_data.len(), + deposit_amounts: deposit_amounts.len(), + }); + } + + let mut deposit_datas = crate::signing::sign_and_agg_deposit_data( + &exchanger, + &shares, + &new_withdrawal_addresses, + &network, + &node_idx, + &deposit_amounts, + def.compounding, + ) + .await?; + + // Deposit data was step 2, advance to step 3. + sync_runtime.next_step().await?; + + let val_regs = crate::signing::sign_and_agg_validator_registrations( + &exchanger, + &shares, + &new_fee_recipient_addresses, + def.target_gas_limit, + &node_idx, + &def.fork_version, + ) + .await?; + + // Pre-regs was step 3, advance to step 4. + sync_runtime.next_step().await?; + + let mut lock = crate::signing::sign_and_aggregate_lock_hash( + &existing_shares, + &shares, + def, + &node_idx, + &exchanger, + deposit_datas.clone(), + val_regs, + append_config, + ) + .await?; + + // Lock hash aggregate was step 4, advance to step 5. + sync_runtime.next_step().await?; + + lock.node_signatures = node_sig_caster + .exchange(Some(&key), &lock.lock_hash, ct.child_token()) + .await?; + + if !pluto_cluster::version::support_node_signatures(&lock.version) { + lock.node_signatures.clear(); + } + + // Node signatures was step 5, advance to step 6. 
+ sync_runtime.next_step().await?; + + if !conf.no_verify && append_config.is_none_or(|append| !append.unverified) { + lock.verify_signatures(eth1) + .await + .map_err(DkgError::LockVerification)?; + } + + if conf.keymanager.address.is_empty() { + let all_shares = existing_shares + .iter() + .chain(shares.iter()) + .cloned() + .collect::>(); + disk::write_keys_to_disk(conf, &all_shares, false).await?; + debug!(total = all_shares.len(), "Saved keyshares to disk"); + } else { + disk::write_to_keymanager( + &conf.keymanager.address, + &conf.keymanager.auth_token, + &shares, + ) + .await?; + debug!( + keymanager_address = conf.keymanager.address, + total = shares.len(), + "Imported keyshares to keymanager" + ); + } + + let mut dashboard_url = None; + if conf.publish.enabled { + match write_lock_to_api(&conf.publish.address, &lock, conf.publish.timeout).await { + Ok(url) => dashboard_url = Some(url), + Err(error) => warn!(%error, "Couldn't publish lock file to Obol API"), + } + } + + disk::write_lock(&conf.data_dir, &lock).await?; + debug!("Saved lock file to disk"); + + if let Some(append) = append_config + && !append.deposit_data.is_empty() + { + deposit_datas = pluto_eth2util::deposit::merge_deposit_data_sets( + deposit_datas, + append.deposit_data.clone(), + ); + debug!( + amounts = deposit_datas.len(), + validators = deposit_datas.first().map_or(0, Vec::len), + "Merged deposit data files" + ); + } + + for deposit_data in &deposit_datas { + pluto_eth2util::deposit::write_deposit_data_file(deposit_data, &network, &conf.data_dir) + .await?; + debug!("Saved deposit data file(s) to disk"); + } + + // Signature verification and disk key write was step 6, advance to step 7. 
+ sync_runtime.next_step().await?; + + sync_runtime.shutdown().await?; + + if conf.zipped { + let data_dir = conf.data_dir.clone(); + tokio::task::spawn_blocking(move || { + pluto_app::utils::bundle_output(data_dir, "dkg.tar.gz") + }) + .await??; + } + + debug!( + seconds = conf.shutdown_delay.as_secs(), + "Graceful shutdown delay" + ); + tokio::time::sleep(conf.shutdown_delay).await; + + info!("Successfully completed DKG ceremony πŸŽ‰"); + if let Some(url) = dashboard_url { + info!("You can find your newly-created cluster dashboard here: {url}"); + } + + Ok(()) +} + +fn deposit_amounts_for_definition(def: &Definition) -> Vec { + if def.deposit_amounts.is_empty() { + if pluto_cluster::definition::Definition::support_partial_deposits(&def.version) { + pluto_eth2util::deposit::default_deposit_amounts(def.compounding) + } else { + vec![pluto_eth2util::deposit::DEFAULT_DEPOSIT_AMOUNT] + } + } else { + pluto_eth2util::deposit::dedup_amounts(&def.deposit_amounts) + } +} + +struct SyncRuntime { + server: crate::sync::Server, + clients: Vec, + step: i64, + cancellation: CancellationToken, + tasks: Vec>, +} + +impl SyncRuntime { + async fn next_step(&mut self) -> Result<(), DkgError> { + self.step = self.step.checked_add(1).ok_or(DkgError::IntegerOverflow)?; + for client in &self.clients { + client.set_step(self.step); + } + + debug!(step = self.step, "Waiting for peers to start next step"); + self.server + .await_all_at_step(self.step, self.cancellation.child_token()) + .await?; + + Ok(()) + } + + async fn shutdown(mut self) -> Result<(), DkgError> { + for client in &self.clients { + client.shutdown(self.cancellation.child_token()).await?; + } + + self.server + .await_all_shutdown(self.cancellation.child_token()) + .await?; + self.cancellation.cancel(); + + for task in self.tasks.drain(..) 
{ + let _ = task.await; + } + + Ok(()) + } +} + +impl Drop for SyncRuntime { + fn drop(&mut self) { + self.cancellation.cancel(); + } +} + +async fn start_sync_protocol( + server: crate::sync::Server, + clients: Vec, + cancellation: CancellationToken, +) -> Result { + server.start(); + + let mut tasks = Vec::with_capacity(clients.len()); + for client in &clients { + let client = client.clone(); + let client_ct = cancellation.child_token(); + let cancel_on_error = cancellation.clone(); + tasks.push(tokio::spawn(async move { + if let Err(error) = client.run(client_ct).await + && !matches!(error, crate::sync::Error::Canceled) + { + error!(%error, "Sync failed to peer"); + cancel_on_error.cancel(); + } + })); + } + + let mut ticker = tokio::time::interval(Duration::from_millis(250)); + loop { + if let Some(error) = server.err().await { + return Err(DkgError::Sync(error)); + } + + let connected_count = clients + .iter() + .filter(|client| client.is_connected()) + .count(); + if connected_count == clients.len() { + break; + } + + tokio::select! { + _ = cancellation.cancelled() => return Err(crate::sync::Error::Canceled.into()), + _ = ticker.tick() => {} + } + } + + for client in &clients { + client.disable_reconnect(); + } + + server + .await_all_connected(cancellation.child_token()) + .await?; + + let mut runtime = SyncRuntime { + server, + clients, + step: 0, + cancellation, + tasks, + }; + runtime.next_step().await?; + + Ok(runtime) +} + +async fn drive_dkg_network( + mut node: pluto_p2p::p2p::Node, + frost_handle: frostp2p::FrostP2PHandle, + cancellation: CancellationToken, +) { + loop { + tokio::select! 
{ + _ = cancellation.cancelled() => break, + event = node.select_next_some() => { + if let SwarmEvent::Behaviour(PlutoBehaviourEvent::Inner( + crate::node::DkgBehaviourEvent::Bcast(event), + )) = event + && let Err(error) = frost_handle.handle_bcast_event(event) + { + debug!(%error, "Failed to forward bcast event to FROST transport"); + } + } + } + } } fn validate_keymanager_flags(conf: &Config) -> Result<(), DkgError> { @@ -475,10 +1174,12 @@ mod tests { #[tokio::test] async fn run_rejects_mismatched_keymanager_flags() { + let tempdir = tempfile::tempdir().expect("tempdir"); let (lock, ..) = pluto_cluster::test_cluster::new_for_test(1, 3, 4, 0); let err = run( Config::builder() + .data_dir(tempdir.path().to_path_buf()) .test_config(TestConfig::builder().def(lock.definition.clone()).build()) .keymanager( KeymanagerConfig::builder() @@ -541,36 +1242,32 @@ mod tests { } #[tokio::test] - async fn run_executes_preflight_before_reaching_backend_boundary() { + async fn run_reaches_p2p_key_verification_after_preflight() { let tempdir = tempfile::tempdir().expect("tempdir"); - let definition_path = tempdir.path().join("cluster-definition.json"); - let private_key_path = tempdir.path().join("charon-enr-private-key"); + let (lock, ..) = pluto_cluster::test_cluster::new_for_test(1, 3, 4, 1); + let mismatched_key = pluto_testutil::random::generate_insecure_k1_key(99); - tokio::fs::write(&private_key_path, b"dummy") - .await - .expect("private key"); - - let (lock, ..) 
= pluto_cluster::test_cluster::new_for_test(1, 3, 4, 0); - let definition = serde_json::to_string(&lock.definition).expect("definition json"); - tokio::fs::write(&definition_path, definition) - .await - .expect("definition file"); - - let join_err = tokio::spawn(async move { - run( - Config::builder() - .data_dir(tempdir.path().to_path_buf()) - .def_file(definition_path.to_string_lossy().into_owned()) - .no_verify(true) - .build(), - CancellationToken::new(), - ) - .await - }) + let err = run( + Config::builder() + .data_dir(tempdir.path().to_path_buf()) + .p2p(P2PConfig::default()) + .shutdown_delay(Duration::ZERO) + .test_config( + TestConfig::builder() + .def(lock.definition.clone()) + .p2p_key(mismatched_key) + .build(), + ) + .build(), + CancellationToken::new(), + ) .await - .expect_err("backend handoff should panic until implemented"); + .expect_err("mismatched P2P key should fail before networking"); - assert!(join_err.is_panic()); + assert!(matches!( + err, + DkgError::PeerError(pluto_p2p::peer::PeerError::UnknownPublicKey) + )); } #[tokio::test] diff --git a/crates/dkg/src/frost.rs b/crates/dkg/src/frost.rs index a27ca2e8..cafd6645 100644 --- a/crates/dkg/src/frost.rs +++ b/crates/dkg/src/frost.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] - use std::collections::{BTreeMap, HashMap}; use async_trait::async_trait; @@ -59,7 +57,7 @@ pub(crate) trait FTransport: Send + Sync { /// FROST DKG orchestration errors. #[derive(Debug, thiserror::Error)] -pub(crate) enum FrostError { +pub enum FrostError { /// Failed to construct a participant. 
#[error("new participant: {0}")] NewParticipant(#[source] pluto_frost::kryptology::KryptologyError), @@ -290,38 +288,26 @@ pub(crate) async fn run_frost_parallel( share_idx: u32, dkg_ctx: &str, ) -> Result, FrostError> { - debug!( - num_validators, - num_nodes, threshold, share_idx, "Starting FROST DKG" - ); let mut validators = new_frost_participants(num_validators, num_nodes, threshold, share_idx, dkg_ctx)?; let (cast_r1, p2p_r1) = round1(&mut validators)?; - debug!( - bcasts = cast_r1.len(), - p2p = p2p_r1.len(), - "Completed local FROST DKG round 1" - ); + + debug!("Sending round 1 messages"); + let (cast_r1_result, p2p_r1_result) = tp.round1(&cancellation, cast_r1, p2p_r1).await?; - debug!( - bcasts = cast_r1_result.len(), - p2p = p2p_r1_result.len(), - "Completed FROST DKG round 1 transport" - ); + + debug!("Received round 1 results"); let cast_r2 = round2(&mut validators, &cast_r1_result, &p2p_r1_result)?; - debug!(bcasts = cast_r2.len(), "Completed local FROST DKG round 2"); + + debug!("Sending round 2 messages"); + let cast_r2_result = tp.round2(&cancellation, cast_r2).await?; - debug!( - bcasts = cast_r2_result.len(), - "Completed FROST DKG round 2 transport" - ); - let shares = make_shares(&validators, &cast_r2_result)?; - debug!(shares = shares.len(), "Completed FROST DKG"); + debug!("Received round 2 results"); - Ok(shares) + make_shares(&validators, &cast_r2_result) } /// Returns multiple frost DKG participants (one for each parallel validator). diff --git a/crates/dkg/src/frostp2p.rs b/crates/dkg/src/frostp2p.rs index 6b094c22..50177582 100644 --- a/crates/dkg/src/frostp2p.rs +++ b/crates/dkg/src/frostp2p.rs @@ -103,8 +103,6 @@ //! intentionally not reset; create a fresh [`FrostP2PBehaviour`], //! [`FrostP2PHandle`], and [`FrostP2P`] for each DKG. 
-#![allow(dead_code)] - use std::{ collections::{HashMap, HashSet, VecDeque}, sync::{Arc, Mutex}, @@ -167,7 +165,7 @@ pub(crate) const SEND_TIMEOUT: Duration = Duration::from_secs(7); /// FROST direct-P2P delivery errors. #[derive(Debug, thiserror::Error)] -pub(crate) enum FrostP2PError { +pub enum FrostP2PError { /// The behaviour task is no longer running. #[error("frost p2p behaviour is no longer running")] BehaviourClosed, @@ -200,6 +198,7 @@ pub(crate) enum OutEvent { /// Event emitted while the FROST P2P transport progresses through its rounds. #[derive(Debug)] +#[allow(dead_code)] pub(crate) enum FrostP2PEvent { /// A FROST transport round started. RoundStarted { diff --git a/crates/dkg/src/lib.rs b/crates/dkg/src/lib.rs index dff76aec..72e07923 100644 --- a/crates/dkg/src/lib.rs +++ b/crates/dkg/src/lib.rs @@ -49,3 +49,6 @@ mod signing; /// Registration conversion and distributed-validator assembly helpers. mod validators; + +/// P2P node setup. +mod node; diff --git a/crates/dkg/src/node.rs b/crates/dkg/src/node.rs new file mode 100644 index 00000000..85b7102f --- /dev/null +++ b/crates/dkg/src/node.rs @@ -0,0 +1,211 @@ +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; + +use crate::{ + bcast, + dkg::{Config, DkgError}, + exchanger::{SIG_DEPOSIT_DATA, SigType}, + frostp2p, sync, +}; +use libp2p::{Multiaddr, multiaddr::Protocol, relay, swarm::NetworkBehaviour}; +use pluto_core::{ + types::{Duty, DutyType}, + version, +}; +use pluto_p2p::{ + bootnode, gater, + p2p::{Node, NodeType}, + p2p_context::P2PContext, + peer::{Peer, peer_id_from_key, verify_p2p_key}, + relay::{MutableRelayReservation, RelayRouter}, +}; +use pluto_parsigex as parsigex; +use pluto_peerinfo::{self as peerinfo, LocalPeerInfo}; +use tokio_util::sync::CancellationToken; + +#[derive(NetworkBehaviour)] +pub(crate) struct DkgBehaviour { + pub(crate) relay: relay::client::Behaviour, + pub(crate) relay_reservation: MutableRelayReservation, + pub(crate) relay_router: RelayRouter, + 
pub(crate) bcast: bcast::Behaviour, + pub(crate) sync: sync::Behaviour, + pub(crate) parsigex: parsigex::Behaviour, + pub(crate) peerinfo: peerinfo::Behaviour, + pub(crate) frost_p2p: frostp2p::FrostP2PBehaviour, +} + +type Result = std::result::Result; + +pub(crate) struct Handlers { + pub(crate) bcast: bcast::Component, + pub(crate) sync: Vec, + pub(crate) sync_server: sync::Server, + pub(crate) parsigex: parsigex::Handle, + pub(crate) frost_p2p: frostp2p::FrostP2PHandle, +} + +pub(crate) async fn setup_p2p( + key: k256::SecretKey, + conf: &Config, + peers: &[Peer], + def_hash: Vec, + sig_types: Arc>, + num_validators: u32, + ct: CancellationToken, +) -> Result<(Node, Handlers)> { + let peer_ids = peers.iter().map(|peer| peer.id).collect::>(); + let local_peer_id = peer_id_from_key(key.public_key())?; + + verify_p2p_key(peers, &key)?; + + let relay_addrs = relay_addrs_for_resolution(&conf.p2p.relays); + let relays = bootnode::new_relays(ct, &relay_addrs, &hex::encode(&def_hash)).await?; + + let conn_gater = gater::ConnGater::new_conn_gater(peer_ids.clone(), relays.clone()); + + let p2p_context = P2PContext::new(peer_ids.clone()); + p2p_context.set_local_peer_id(local_peer_id); + + let relay_reservation = MutableRelayReservation::new(relays.clone()); + let relay_router = RelayRouter::new(relays, p2p_context.clone(), local_peer_id); + + let (bcast_comp, bcast_comp_handle) = + bcast::Behaviour::new(peer_ids.clone(), p2p_context.clone(), key.clone()); + let (sync_comp, sync_server, sync_clients) = sync::new( + peer_ids.clone(), + p2p_context.clone(), + &key, + def_hash.clone(), + version::VERSION.to_minor(), + )?; + + let parsigex_config = parsigex::Config::new( + local_peer_id, + p2p_context.clone(), + Arc::new(|_duty, _pk, _sig| Box::pin(async { Ok(()) })), + Arc::new(move |duty: &Duty| { + if duty.duty_type != DutyType::Signature { + return false; + } + + if sig_types.contains(&SIG_DEPOSIT_DATA) && duty.slot.inner() >= SIG_DEPOSIT_DATA { + return true; + } + + 
sig_types.contains(&duty.slot.inner()) + }), + ); + let (parsigex_comp, parsigex_handle) = parsigex::Behaviour::new(parsigex_config); + + let (git_hash, _) = version::git_commit(); + let peerinfo_config = peerinfo::Config::new(LocalPeerInfo::new( + version::VERSION.to_string(), + def_hash.clone(), + git_hash, + false, + "", + )) + .with_peers(peer_ids.clone()); + let peerinfo_comp = peerinfo::Behaviour::new(local_peer_id, peerinfo_config); + + let mut share_idx_by_peer = HashMap::new(); + let mut local_share_idx = None; + for peer in peers { + let share_idx = u32::try_from(peer.share_idx())?; + share_idx_by_peer.insert(peer.id, share_idx); + if peer.id == local_peer_id { + local_share_idx = Some(share_idx); + } + } + let local_share_idx = local_share_idx.ok_or(DkgError::LocalPeerNotInDefinition { + peer_id: local_peer_id, + })?; + + let (frost_p2p_comp, frost_p2p_handle) = frostp2p::FrostP2PBehaviour::new( + p2p_context.clone(), + peer_ids.clone(), + share_idx_by_peer, + local_share_idx, + num_validators as usize, + ); + + let node = Node::new( + conf.p2p.clone(), + key, + NodeType::TCP, + false, + p2p_context, + |builder, _, relay_client| { + builder.with_gater(conn_gater).with_inner(DkgBehaviour { + relay: relay_client, + relay_reservation, + relay_router, + bcast: bcast_comp, + sync: sync_comp, + parsigex: parsigex_comp, + peerinfo: peerinfo_comp, + frost_p2p: frost_p2p_comp, + }) + }, + )?; + + let handlers = Handlers { + bcast: bcast_comp_handle, + sync: sync_clients, + sync_server, + parsigex: parsigex_handle, + frost_p2p: frost_p2p_handle, + }; + + Ok((node, handlers)) +} + +fn relay_addrs_for_resolution(relays: &[Multiaddr]) -> Vec { + relays.iter().map(relay_addr_for_resolution).collect() +} + +fn relay_addr_for_resolution(relay: &Multiaddr) -> String { + let mut scheme = None; + let mut host = None; + let mut port = None; + + for protocol in relay.iter() { + match protocol { + Protocol::Http => scheme = Some("http"), + Protocol::Https => scheme = 
Some("https"), + Protocol::Dns(name) + | Protocol::Dns4(name) + | Protocol::Dns6(name) + | Protocol::Dnsaddr(name) + if host.is_none() => + { + host = Some(name.to_string()); + } + Protocol::Ip4(ip) if host.is_none() => { + host = Some(ip.to_string()); + } + Protocol::Ip6(ip) if host.is_none() => { + host = Some(format!("[{ip}]")); + } + Protocol::Tcp(tcp_port) => port = Some(tcp_port), + _ => {} + } + } + + if let (Some(scheme), Some(host)) = (scheme, host) { + let default_port = match scheme { + "https" => 443, + _ => 80, + }; + + return match port { + Some(port) if port != default_port => format!("{scheme}://{host}:{port}"), + _ => format!("{scheme}://{host}"), + }; + } + + relay.to_string() +} diff --git a/crates/dkg/src/publish.rs b/crates/dkg/src/publish.rs index c4093e2b..32417af2 100644 --- a/crates/dkg/src/publish.rs +++ b/crates/dkg/src/publish.rs @@ -27,7 +27,7 @@ pub async fn write_lock_to_api( )?; client.publish_lock(lock.clone()).await?; - debug!(addr = publish_addr, "Published lock file"); + debug!(addr = publish_addr, "Published lock file to api"); Ok(client.launchpad_url_for_lock(lock)?) 
} diff --git a/crates/dkg/src/signing.rs b/crates/dkg/src/signing.rs index 4d25ec02..1beaa5e7 100644 --- a/crates/dkg/src/signing.rs +++ b/crates/dkg/src/signing.rs @@ -262,8 +262,8 @@ pub(crate) async fn sign_and_agg_validator_registrations( ) -> Result> { let effective_gas_limit = if gas_limit == 0 { warn!( - default = registration::DEFAULT_GAS_LIMIT, - "gas_limit not set, using default" + default_gas_limit = registration::DEFAULT_GAS_LIMIT, + "custom target gas limit not supported, setting to default" ); registration::DEFAULT_GAS_LIMIT } else { diff --git a/crates/dkg/src/sync/handler.rs b/crates/dkg/src/sync/handler.rs index 2989c61d..0807b5db 100644 --- a/crates/dkg/src/sync/handler.rs +++ b/crates/dkg/src/sync/handler.rs @@ -223,7 +223,7 @@ impl ConnectionHandler for Handler { self.inbound = None; } Poll::Ready(Err(error)) => { - warn!(peer = %self.peer_id, err = %error, "Error serving inbound sync stream"); + warn!(peer = %self.peer_id, err = %error, "Error serving sync protocol"); self.inbound = None; } } @@ -467,12 +467,8 @@ async fn handle_inbound_stream( } else { let (inserted, count) = server.set_connected(peer_id).await; if inserted { - info!( - peer = %peer_id, - connected = count, - expected = server.expected_peer_count(), - "Connected to peer" - ); + let expected = server.expected_peer_count(); + info!(peer = %peer_id, "Connected to peer {count} of {expected}"); } } diff --git a/crates/p2p/src/proto.rs b/crates/p2p/src/proto.rs index 409e136a..04f8581a 100644 --- a/crates/p2p/src/proto.rs +++ b/crates/p2p/src/proto.rs @@ -57,8 +57,14 @@ pub async fn write_fixed_size_delimited( let len = i64::try_from(payload.len()) .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "payload length overflow"))?; - stream.write_all(&len.to_le_bytes()).await?; - stream.write_all(payload).await?; + // Charon's `readSizedProto` uses a single `reader.Read(buf)` for the + // payload (not `io.ReadFull`), so it requires the length prefix and payload + // to arrive in 
one libp2p chunk. Coalesce them into one `write_all` to + // avoid splitting across yamux frames. + let mut buf = Vec::with_capacity(8usize.saturating_add(payload.len())); + buf.extend_from_slice(&len.to_le_bytes()); + buf.extend_from_slice(payload); + stream.write_all(&buf).await?; stream.flush().await } diff --git a/crates/p2p/src/relay.rs b/crates/p2p/src/relay.rs index 9219da59..af153640 100644 --- a/crates/p2p/src/relay.rs +++ b/crates/p2p/src/relay.rs @@ -252,7 +252,6 @@ impl MutableRelayReservation { /// Processes pending subscription events. fn process_subscription_events(&mut self) { - tracing::debug!("Processing subscription events"); let peers = { let Ok(mut queue) = self.subscription_events.lock() else { tracing::warn!("Failed to lock subscription events queue"); diff --git a/scripts/dkg-runner/README.md b/scripts/dkg-runner/README.md index 98ee1daa..9b38f0c8 100644 --- a/scripts/dkg-runner/README.md +++ b/scripts/dkg-runner/README.md @@ -60,6 +60,8 @@ All variables are optional. Set them in the environment before calling any scrip | `FEE_RECIPIENT` | `0xDeaDBeef…` | Fee recipient address for the cluster | | `WITHDRAWAL_ADDR` | `0xDeaDBeef…` | Withdrawal address for the cluster | | `TIMEOUT` | `120` | Seconds to wait before declaring the ceremony failed | +| `SHUTDOWN_DELAY` | `30s` | Graceful shutdown delay passed to each node via `--shutdown-delay` | +| `NODE_EXIT_TIMEOUT` | `90` | Seconds to wait for node processes to exit cleanly after artifacts appear | | `PLUTO_BIN` | `./target/debug/pluto` | Path to the Pluto binary (only required when `PLUTO_NODES > 0`) | | `CHARON_BIN` | `charon` | Path to the Charon binary | | `WORK_DIR` | `/tmp/dkg-run` | Scratch directory β€” wiped at the start of every run | @@ -75,7 +77,7 @@ All variables are optional. 
Set them in the environment before calling any scrip | 1 | `setup.sh` | Wipes `WORK_DIR`, creates `node-0/`…`node-N/` data dirs, generates a p2p key + ENR for each node (`pluto create enr` / `charon create enr`), then runs `charon create dkg --operator-enrs=…` | | 2 | `start-nodes.sh` | Starts Pluto nodes (slots 0…PLUTO_NODES-1) and Charon nodes (remaining slots) as background processes, each in its own process group; logs to `node-N/node.log` | | 3 | `monitor.sh` | Waits for `cluster-lock.json` to appear in every node's data dir; exits 0 on completion, 1 on timeout (with the tail of each `node.log` dumped to stderr) | -| 4 | *(inline)* | Sends SIGTERM to each node's process group unless `KEEP_NODES` is enabled | +| 4 | `wait-node-exits.sh` | Waits for each node process to exit with status `0` unless `KEEP_NODES` is enabled | | 5 | `collect.sh` | Copies keystores and `cluster-lock.json` to `WORK_DIR/output/`; prints a summary | On success, outputs are under `$WORK_DIR/output/`. On failure or timeout, partial outputs are still collected and `WORK_DIR` is preserved for inspection. `run.sh` never deletes `WORK_DIR`; use `./scripts/dkg-runner/reset.sh` when you're done. 
@@ -93,6 +95,7 @@ Ctrl-C at any point kills all node process groups cleanly via the SIGINT trap; ` | `start-nodes.sh` | Launches node processes in the background (each in its own process group) | | `run-node.sh` | Runs a single node in the foreground: `run-node.sh ` | | `monitor.sh` | Waits for ceremony completion or timeout | +| `wait-node-exits.sh` | Waits for all node processes to report clean exit codes | | `collect.sh` | Gathers keystores and lock file into `output/` | | `reset.sh` | Kills all nodes and removes `WORK_DIR` (the explicit cleanup tool) | | `config.sh` | Shared env-var defaults sourced by every script | diff --git a/scripts/dkg-runner/config.sh b/scripts/dkg-runner/config.sh index 63519a09..33eac79d 100755 --- a/scripts/dkg-runner/config.sh +++ b/scripts/dkg-runner/config.sh @@ -11,6 +11,8 @@ : "${CHARON_NODES:=2}" : "${RELAY_URL:=https://0.relay.obol.tech}" : "${TIMEOUT:=120}" +: "${SHUTDOWN_DELAY:=30s}" +: "${NODE_EXIT_TIMEOUT:=90}" : "${PLUTO_BIN:=./target/debug/pluto}" : "${CHARON_BIN:=charon}" : "${WORK_DIR:=/tmp/dkg-run}" diff --git a/scripts/dkg-runner/monitor.sh b/scripts/dkg-runner/monitor.sh index 6e4a57b6..38d15a2b 100755 --- a/scripts/dkg-runner/monitor.sh +++ b/scripts/dkg-runner/monitor.sh @@ -25,7 +25,17 @@ POLL_INTERVAL=2 TAIL_LINES=30 log_info "Waiting for ${NODES} nodes (timeout: ${TIMEOUT}s)" -log_info "Completion = cluster-lock.json present in ${WORK_DIR}/node-*/" +log_info "Completion = cluster-lock.json AND keystore-*.json present in ${WORK_DIR}/node-*/" + +# A node is done when both cluster-lock.json and at least one keystore are +# present. Pluto writes keystores under validator_keys/, Charon writes them +# flat in the data dir β€” accept either layout. 
+node_done() { + local node_dir="${1}" + [[ -f "${node_dir}/cluster-lock.json" ]] || return 1 + compgen -G "${node_dir}/validator_keys/keystore-*.json" > /dev/null 2>&1 \ + || compgen -G "${node_dir}/keystore-*.json" > /dev/null 2>&1 +} start_time="${SECONDS}" last_count=-1 @@ -34,7 +44,7 @@ while true; do elapsed=$(( SECONDS - start_time )) done_count=0 for (( i = 0; i < NODES; i++ )); do - if [[ -f "${WORK_DIR}/node-${i}/cluster-lock.json" ]]; then + if node_done "${WORK_DIR}/node-${i}"; then done_count=$(( done_count + 1 )) fi done diff --git a/scripts/dkg-runner/run-node.sh b/scripts/dkg-runner/run-node.sh index 811e6d2d..6dc05c82 100755 --- a/scripts/dkg-runner/run-node.sh +++ b/scripts/dkg-runner/run-node.sh @@ -94,4 +94,5 @@ log_info "==============================================" --definition-file="${DEF_FILE}" \ --data-dir="${DATA_DIR}" \ --p2p-relays="${RELAY_URL}" \ + --shutdown-delay="${SHUTDOWN_DELAY}" \ 2>&1 | tee "${LOG_FILE}" diff --git a/scripts/dkg-runner/run.sh b/scripts/dkg-runner/run.sh index 59bab8de..9ba3f124 100755 --- a/scripts/dkg-runner/run.sh +++ b/scripts/dkg-runner/run.sh @@ -12,6 +12,7 @@ # RELAY_URL=https://0.relay.obol.tech # Relay ENR endpoint used by the DKG nodes. # TIMEOUT=120 Seconds to wait for all nodes before aborting. +# NODE_EXIT_TIMEOUT=90 Seconds to wait for nodes to exit after completion. # PLUTO_BIN=./target/debug/pluto # Path to the Pluto binary. # CHARON_BIN=charon Path to the Charon binary. 
@@ -89,6 +90,7 @@ log_info " CHARON_NODES = ${CHARON_NODES}" log_info " RELAY_URL = ${RELAY_URL}" log_info " NETWORK = ${NETWORK}" log_info " TIMEOUT = ${TIMEOUT}s" +log_info " NODE_EXIT_TIMEOUT = ${NODE_EXIT_TIMEOUT}s" log_info " PLUTO_BIN = ${PLUTO_BIN}" log_info " CHARON_BIN = ${CHARON_BIN}" log_info " WORK_DIR = ${WORK_DIR}" @@ -119,8 +121,17 @@ fi if is_truthy "${KEEP_NODES}"; then log_info "--- Phase 4: Keep nodes running (ceremony complete) ---" else - log_info "--- Phase 4: Stop nodes (ceremony complete) ---" - _kill_nodes || true + log_info "--- Phase 4: Wait for clean node exits ---" + wait_exit=0 + "${SCRIPT_DIR}/wait-node-exits.sh" || wait_exit=$? + if (( wait_exit != 0 )); then + log_err "One or more nodes exited unsuccessfully after producing artifacts." + _kill_nodes || true + "${SCRIPT_DIR}/collect.sh" || true + log_info "Work dir preserved at ${WORK_DIR}. Run ${SCRIPT_DIR}/reset.sh to remove it." + trap - INT TERM + exit 1 + fi fi log_info "--- Phase 5: Collect outputs ---" diff --git a/scripts/dkg-runner/start-nodes.sh b/scripts/dkg-runner/start-nodes.sh index 929be0b9..578ff63b 100755 --- a/scripts/dkg-runner/start-nodes.sh +++ b/scripts/dkg-runner/start-nodes.sh @@ -49,24 +49,40 @@ start_node() { local label="${3}" local data_dir="${WORK_DIR}/node-${index}" local log_file="${data_dir}/node.log" + local exit_file="${data_dir}/exit-code" mkdir -p "${data_dir}" + rm -f "${exit_file}" log_info "Starting ${label} node ${index} (bin: ${bin})" if is_ci; then # Quiet path for CI: write to log file only. - "${bin}" dkg \ - --definition-file="${DEF_FILE}" \ - --data-dir="${data_dir}" \ - --p2p-relays="${RELAY_URL}" \ - > "${log_file}" 2>&1 & + ( + set +e + "${bin}" dkg \ + --definition-file="${DEF_FILE}" \ + --data-dir="${data_dir}" \ + --p2p-relays="${RELAY_URL}" \ + --shutdown-delay="${SHUTDOWN_DELAY}" \ + > "${log_file}" 2>&1 + status=$? 
+ echo "${status}" > "${exit_file}" + exit "${status}" + ) & else # Interactive path: tee to log file and the terminal. - "${bin}" dkg \ - --definition-file="${DEF_FILE}" \ - --data-dir="${data_dir}" \ - --p2p-relays="${RELAY_URL}" \ - > >(tee "${log_file}") 2>&1 & + ( + set +e + "${bin}" dkg \ + --definition-file="${DEF_FILE}" \ + --data-dir="${data_dir}" \ + --p2p-relays="${RELAY_URL}" \ + --shutdown-delay="${SHUTDOWN_DELAY}" \ + > >(tee "${log_file}") 2>&1 + status=$? + echo "${status}" > "${exit_file}" + exit "${status}" + ) & fi echo "$!" >> "${PID_FILE}" diff --git a/scripts/dkg-runner/stress.sh b/scripts/dkg-runner/stress.sh new file mode 100755 index 00000000..65bbb72a --- /dev/null +++ b/scripts/dkg-runner/stress.sh @@ -0,0 +1,405 @@ +#!/usr/bin/env bash +# stress.sh β€” Run N DKG ceremonies back-to-back (or in parallel) for stress +# testing. Each ceremony gets its own isolated WORK_DIR; results are aggregated +# into a TSV summary. +# +# Usage: +# ./stress.sh [--help] +# +# Stress-test variables (all optional; defaults shown): +# RUNS=10 Total ceremonies to run. +# WORKERS=1 Concurrent ceremonies. +# STRESS_WORK_DIR=/tmp/dkg-stress Base directory; each run uses run-NNN/. +# KEEP_PASSED=0 When truthy, keep full per-run dirs even +# on success. Default trims node-*/ on pass +# to save disk; failed runs are always kept. +# INTERACTIVE=auto auto|1|0. When auto (default), uses an +# in-place TUI table when stdout is a TTY, +# CI is unset, and the table fits the +# terminal. Set to 1 to force, 0 to disable. +# +# Per-run variables (forwarded to run.sh β€” see run.sh --help for full list): +# NODES, THRESHOLD, PLUTO_NODES, CHARON_NODES, RELAY_URL, NETWORK, +# FEE_RECIPIENT, WITHDRAWAL_ADDR, TIMEOUT, NODE_EXIT_TIMEOUT, +# SHUTDOWN_DELAY, PLUTO_BIN, CHARON_BIN. +# RELAY_URL is overridden per run with a random index in https://{0..4}.relay.obol.tech. +# +# WORK_DIR from the environment is ignored β€” stress.sh assigns one per run. 
+# CI is forced to "true" when WORKERS > 1 so node logs don't interleave. +# +# Outputs: +# ${STRESS_WORK_DIR}/summary.tsv TSV with one row per run. +# ${STRESS_WORK_DIR}/run-NNN/run.log Captured stdout/stderr of run.sh. +# ${STRESS_WORK_DIR}/run-NNN/... Whatever run.sh wrote (preserved +# for failed runs; trimmed on pass +# unless KEEP_PASSED is truthy). +# +# Exit codes: +# 0 β€” all RUNS ceremonies passed. +# 1 β€” one or more failed (failed runs preserved for inspection). +# 130 β€” interrupted; in-flight workers terminated. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" +LOG_PREFIX="stress" + +if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then + grep '^#' "${BASH_SOURCE[0]}" | grep -v '#!/' | sed 's/^# \?//' + exit 0 +fi + +# ── Stress-test params ─────────────────────────────────────────────────────── + +: "${RUNS:=10}" +: "${WORKERS:=1}" +: "${STRESS_WORK_DIR:=/tmp/dkg-stress}" +: "${KEEP_PASSED:=0}" +: "${INTERACTIVE:=auto}" + +if (( RUNS < 1 )); then + log_err "RUNS must be >= 1 (got ${RUNS})" + exit 1 +fi +if (( WORKERS < 1 )); then + log_err "WORKERS must be >= 1 (got ${WORKERS})" + exit 1 +fi +if (( WORKERS > RUNS )); then + WORKERS=${RUNS} +fi + +mkdir -p "${STRESS_WORK_DIR}" +SUMMARY="${STRESS_WORK_DIR}/summary.tsv" +printf 'run_id\tstatus\tduration_s\tstart_time\twork_dir\n' > "${SUMMARY}" + +# Force CI=true when running in parallel so per-node logs don't interleave on +# the controlling terminal. Each run's stdout/stderr is captured to its own +# run.log regardless, so this only changes the live-tail behaviour. +WORKER_CI="${CI:-}" +if (( WORKERS > 1 )) && [[ -z "${WORKER_CI}" ]]; then + WORKER_CI="true" +fi + +# ── Interactive TUI vs append-only logging ─────────────────────────────────── +# +# In TUI mode each run owns one terminal row that mutates pending β†’ running β†’ +# PASS/FAIL, plus a footer summary. 
The mode is auto-disabled when: +# - stdout isn't a tty (piped, redirected, CI) +# - CI env is truthy +# - the table doesn't fit (RUNS + footer would exceed the terminal height) +# In all other cases, workers emit per-state log lines as before. + +INTERACTIVE_MODE=0 +INTERACTIVE_REASON="" +case "${INTERACTIVE}" in + 1|true|TRUE|True|yes|YES|Yes|on|ON|On) + INTERACTIVE_MODE=1 + ;; + 0|false|FALSE|False|no|NO|No|off|OFF|Off) + INTERACTIVE_MODE=0 + INTERACTIVE_REASON="forced off" + ;; + auto|AUTO|Auto|"") + if ! [[ -t 1 ]]; then + INTERACTIVE_REASON="stdout is not a tty" + elif is_truthy "${CI:-}"; then + INTERACTIVE_REASON="CI is set" + else + term_lines=$(tput lines 2>/dev/null || echo 0) + # Need RUNS rows + 1 footer; leave a couple of lines breathing room + # and for the prompt that comes back when we exit. + if (( term_lines >= RUNS + 3 )); then + INTERACTIVE_MODE=1 + else + INTERACTIVE_REASON="terminal has ${term_lines} rows, need >= $(( RUNS + 3 )); resize taller or set INTERACTIVE=0 to silence" + fi + fi + ;; + *) + log_err "INTERACTIVE must be auto|1|0 (got: ${INTERACTIVE})" + exit 1 + ;; +esac + +STATE_DIR="${STRESS_WORK_DIR}/.state" +rm -rf "${STATE_DIR}" +mkdir -p "${STATE_DIR}" +for (( i = 1; i <= RUNS; i++ )); do + printf 'pending\n' > "${STATE_DIR}/$(printf 'run-%04d' "${i}")" +done + +write_state() { + local id="${1}" + local state="${2}" + printf '%s\n' "${state}" > "${STATE_DIR}/$(printf 'run-%04d' "${id}")" +} + +# ANSI helpers (only emit escapes when we'll be drawing to the terminal). 
+ansi() { + if (( INTERACTIVE_MODE )); then + printf '\033[%sm' "${1}" + fi +} +reset() { + if (( INTERACTIVE_MODE )); then + printf '\033[0m' + fi +} + +format_run_line() { + local label="${1}" + local state="${2}" + local now="${3}" + case "${state}" in + pending) + printf ' %s %spending%s' "${label}" "$(ansi 2)" "$(reset)" + ;; + running:*) + local since="${state#running:}" + local elapsed=$(( now - since )) + printf ' %s %srunning%s %3ds' \ + "${label}" "$(ansi 33)" "$(reset)" "${elapsed}" + ;; + pass:*) + local dur="${state#pass:}" + printf ' %s %sPASS %s %3ds' \ + "${label}" "$(ansi '1;32')" "$(reset)" "${dur}" + ;; + fail:*) + local dur="${state#fail:}" + printf ' %s %sFAIL %s %3ds' \ + "${label}" "$(ansi '1;31')" "$(reset)" "${dur}" + ;; + esac +} + +# Lines drawn by the most recent draw_table call (RUNS rows + 1 footer). +# 0 means we haven't drawn yet, so the next call doesn't try to move the +# cursor up over content that isn't there. +TABLE_LINES=0 + +draw_table() { + (( INTERACTIVE_MODE )) || return 0 + + local now + now=$(date +%s) + + # Move cursor back to the top of the previously-drawn block. + if (( TABLE_LINES > 0 )); then + printf '\033[%dA' "${TABLE_LINES}" + fi + + local pass=0 fail=0 run=0 pend=0 + for (( i = 1; i <= RUNS; i++ )); do + local label state + label=$(printf 'run-%04d' "${i}") + state=$(<"${STATE_DIR}/${label}") + case "${state}" in + pending) (( pend++ )) ;; + running:*) (( run++ )) ;; + pass:*) (( pass++ )) ;; + fail:*) (( fail++ )) ;; + esac + # \033[2K clears the entire line; \r ensures we start at column 0. 
+ printf '\r\033[2K%s\n' "$(format_run_line "${label}" "${state}" "${now}")" + done + + printf '\r\033[2K %sPASS%s %d %sFAIL%s %d %srun%s %d %spend%s %d (%d/%d done)\n' \ + "$(ansi '1;32')" "$(reset)" "${pass}" \ + "$(ansi '1;31')" "$(reset)" "${fail}" \ + "$(ansi 33)" "$(reset)" "${run}" \ + "$(ansi 2)" "$(reset)" "${pend}" \ + $(( pass + fail )) "${RUNS}" + + TABLE_LINES=$(( RUNS + 1 )) +} + +# ── Cleanup / signal handling ──────────────────────────────────────────────── + +WORKER_PIDS=() + +_kill_workers() { + (( ${#WORKER_PIDS[@]} == 0 )) && return 0 + for pid in "${WORKER_PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + # Each worker is its own process group (set -m below), so signal + # the whole group to take down run.sh and any node descendants. + kill -TERM -- "-${pid}" 2>/dev/null \ + || kill -TERM "${pid}" 2>/dev/null \ + || true + fi + done +} + +_on_signal() { + if (( INTERACTIVE_MODE )) && (( TABLE_LINES > 0 )); then + # Draw a final frame so any in-flight rows get a last update before + # we leave them in place; then move below the table to print our + # warning, so the cleanup messages don't overwrite it. + draw_table + fi + log_warn "Caught signal β€” terminating ${#WORKER_PIDS[@]} in-flight worker(s)" + _kill_workers + wait 2>/dev/null || true + log_info "Aborted. Partial summary at ${SUMMARY}" + exit 130 +} + +trap '_on_signal' INT TERM + +# ── Worker ─────────────────────────────────────────────────────────────────── + +run_one() { + local id="${1}" + local label + label=$(printf 'run-%04d' "${id}") + local run_dir="${STRESS_WORK_DIR}/${label}" + local run_log="${run_dir}/run.log" + + rm -rf "${run_dir}" + mkdir -p "${run_dir}" + + local started ended duration status start_time + started=$(date +%s) + start_time=$(date -u +%Y-%m-%dT%H:%M:%SZ) + + local run_relay_url="https://$(( RANDOM % 5 )).relay.obol.tech" + + write_state "${id}" "running:${started}" + if (( ! 
INTERACTIVE_MODE )); then + log_info "[${label}] starting (relay: ${run_relay_url})" + fi + + # Each ceremony runs in an isolated WORK_DIR. All other run.sh env vars + # are inherited from this script's environment. + if WORK_DIR="${run_dir}" CI="${WORKER_CI}" RELAY_URL="${run_relay_url}" \ + "${SCRIPT_DIR}/run.sh" >"${run_log}" 2>&1; then + status="pass" + else + status="fail" + fi + + ended=$(date +%s) + duration=$(( ended - started )) + + write_state "${id}" "${status}:${duration}" + + # Atomic-ish append: a single printf-write of one line to a TSV is + # effectively safe under typical bash buffering with WORKERS in single + # digits, but parallel writers can in principle interleave. A flock + # would be cleaner; we accept the small risk for portability (no + # flock(1) on macOS by default). + printf '%s\t%s\t%d\t%s\t%s\n' \ + "${label}" "${status}" "${duration}" "${start_time}" "${run_dir}" \ + >> "${SUMMARY}" + + if [[ "${status}" == "pass" ]]; then + if (( ! INTERACTIVE_MODE )); then + log_info "[${label}] PASS in ${duration}s" + fi + if ! is_truthy "${KEEP_PASSED}"; then + # Keep run.log + cluster-lock.json for verification; drop the + # node data dirs, which dominate disk usage. + rm -rf "${run_dir}/node-"*/ 2>/dev/null || true + fi + else + if (( ! 
INTERACTIVE_MODE )); then + log_err "[${label}] FAIL after ${duration}s β€” preserved at ${run_dir}" + fi + fi +} + +# ── Dispatch ───────────────────────────────────────────────────────────────── + +log_info "==============================================" +log_info "DKG stress test" +log_info " RUNS = ${RUNS}" +log_info " WORKERS = ${WORKERS}" +log_info " STRESS_WORK_DIR = ${STRESS_WORK_DIR}" +log_info " KEEP_PASSED = ${KEEP_PASSED}" +log_info " CI (per worker) = ${WORKER_CI:-}" +if (( INTERACTIVE_MODE )); then + log_info " INTERACTIVE = ${INTERACTIVE} (active)" +else + log_info " INTERACTIVE = ${INTERACTIVE} (disabled${INTERACTIVE_REASON:+: ${INTERACTIVE_REASON}})" +fi +log_info " (per-run config inherited from environment; see run.sh --help)" +log_info "==============================================" + +# Job control: each backgrounded worker becomes its own process group leader, +# so $! == PGID and we can signal the whole tree (run.sh + nodes) on cleanup. +set -m + +# Initial frame so the user sees the table immediately, with all rows pending. +draw_table + +next=1 +while (( next <= RUNS )) || (( ${#WORKER_PIDS[@]} > 0 )); do + # Fill the worker pool up to WORKERS. + while (( ${#WORKER_PIDS[@]} < WORKERS )) && (( next <= RUNS )); do + run_one "${next}" & + WORKER_PIDS+=("$!") + next=$(( next + 1 )) + done + + # Tick: redraw, sleep, then reap finished workers. Polled rather than + # `wait -n` for portability across bash 3.2 (macOS default). + draw_table + sleep 1 + alive=() + for pid in "${WORKER_PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + alive+=("${pid}") + else + wait "${pid}" 2>/dev/null || true + fi + done + WORKER_PIDS=("${alive[@]+"${alive[@]}"}") +done + +# Final frame so the table reflects the last state transition. 
+draw_table + +trap - INT TERM + +# ── Aggregate ──────────────────────────────────────────────────────────────── + +pass=$(awk -F'\t' 'NR>1 && $2=="pass"' "${SUMMARY}" | wc -l | tr -d ' ') +fail=$(awk -F'\t' 'NR>1 && $2=="fail"' "${SUMMARY}" | wc -l | tr -d ' ') +total=$(( pass + fail )) + +if (( total == 0 )); then + log_err "No runs completed." + exit 1 +fi + +read -r dmin dmax dmean < <( + awk -F'\t' 'NR>1 { + d = $3 + 0 + if (n == 0 || d < min) min = d + if (d > max) max = d + sum += d + n++ + } END { + printf "%d %d %.1f", min, max, (n>0 ? sum/n : 0) + }' "${SUMMARY}" +) + +log_info "==============================================" +log_info "Stress test complete" +log_info " Passed: ${pass}/${total}" +log_info " Failed: ${fail}/${total}" +log_info " Duration min/mean/max = ${dmin}s / ${dmean}s / ${dmax}s" +log_info " Summary: ${SUMMARY}" +log_info "==============================================" + +if (( fail > 0 )); then + log_warn "Failed runs:" + awk -F'\t' 'NR>1 && $2=="fail" {printf " %s (%ds) %s\n", $1, $3, $5}' \ + "${SUMMARY}" >&2 + exit 1 +fi +exit 0 diff --git a/scripts/dkg-runner/wait-node-exits.sh b/scripts/dkg-runner/wait-node-exits.sh new file mode 100755 index 00000000..523ac0f0 --- /dev/null +++ b/scripts/dkg-runner/wait-node-exits.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +# wait-node-exits.sh β€” waits for every DKG node to report a clean exit. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=config.sh +source "${SCRIPT_DIR}/config.sh" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" +LOG_PREFIX="wait-node-exits" + +log_tail() { + local index="${1}" + local log_file="${WORK_DIR}/node-${index}/node.log" + if [[ -f "${log_file}" ]]; then + log_err "Last log lines for node-${index}:" + tail -40 "${log_file}" >&2 || true + else + log_err "No log file for node-${index}: ${log_file}" + fi +} + +node_exit_code() { + local index="${1}" + local exit_file="${WORK_DIR}/node-${index}/exit-code" + [[ -f "${exit_file}" ]] || return 1 + cat "${exit_file}" +} + +# Pluto and Charon both emit this line right after the ceremony finishes, +# before their final shutdown/teardown. Treat it as authoritative β€” by this +# point monitor.sh has already verified the artifacts on disk. +SUCCESS_LINE="Successfully completed DKG ceremony" +PID_FILE="${WORK_DIR}/pids" + +node_logged_success() { + local index="${1}" + local log_file="${WORK_DIR}/node-${index}/node.log" + [[ -f "${log_file}" ]] || return 1 + grep -qF "${SUCCESS_LINE}" "${log_file}" +} + +log_info "Waiting for ${NODES} node exit codes (timeout: ${NODE_EXIT_TIMEOUT}s)" + +start_time=$(date +%s) +while true; do + done_count=0 + success_count=0 + for (( i = 0; i < NODES; i++ )); do + if [[ -f "${WORK_DIR}/node-${i}/exit-code" ]]; then + done_count=$(( done_count + 1 )) + code=$(node_exit_code "${i}") + if [[ "${code}" != "0" ]]; then + log_err "node-${i} exited with status ${code}" + log_tail "${i}" + exit 1 + fi + fi + if node_logged_success "${i}"; then + success_count=$(( success_count + 1 )) + fi + done + + if (( done_count == NODES )); then + break + fi + + # Short-circuit: every node logged success but some are still running + # (e.g. blocked on SHUTDOWN_DELAY). Kill survivors and treat as success. + if (( success_count == NODES )); then + log_info "All ${NODES} nodes logged success; stopping survivors." 
+ kill_pgids "${PID_FILE}" 5 + log_info "All ${NODES} nodes stopped after success log line." + exit 0 + fi + + elapsed=$(( $(date +%s) - start_time )) + if (( elapsed >= NODE_EXIT_TIMEOUT )); then + log_err "TIMEOUT after ${elapsed}s β€” ${done_count}/${NODES} nodes exited" + for (( i = 0; i < NODES; i++ )); do + [[ -f "${WORK_DIR}/node-${i}/exit-code" ]] || log_tail "${i}" + done + exit 1 + fi + + sleep 1 +done + +failed=0 +for (( i = 0; i < NODES; i++ )); do + code=$(node_exit_code "${i}") + if [[ "${code}" != "0" ]]; then + log_err "node-${i} exited with status ${code}" + log_tail "${i}" + failed=1 + fi +done + +if (( failed != 0 )); then + exit 1 +fi + +log_info "All ${NODES} nodes exited cleanly." diff --git a/tools/dkg-stress/.gitignore b/tools/dkg-stress/.gitignore new file mode 100644 index 00000000..2f7896d1 --- /dev/null +++ b/tools/dkg-stress/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/tools/dkg-stress/Cargo.lock b/tools/dkg-stress/Cargo.lock new file mode 100644 index 00000000..f84c7651 --- /dev/null +++ b/tools/dkg-stress/Cargo.lock @@ -0,0 +1,779 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "ansi-to-tui" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67555e1f1ece39d737e28c8a017721287753af3f93225e4a445b29ccb0f5912c" +dependencies = [ + "nom", + "ratatui", + "simdutf8", + "smallvec", + "thiserror", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "cassowary" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" + +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "compact_str" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "static_assertions", +] + +[[package]] +name = "crossterm" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +dependencies = [ + "bitflags", + "crossterm_winapi", + "mio", + "parking_lot", + "rustix", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "dkg-stress" +version = "0.1.0" +dependencies = [ + "ansi-to-tui", + "anyhow", + "clap", + "libc", + "ratatui", + "signal-hook", +] + 
+[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "instability" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb2d60ef19920a3a9193c3e371f726ec1dafc045dac788d0fb3704272458971" +dependencies = [ + "darling", + "indoc", + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "mio" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ratatui" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" +dependencies = [ + "bitflags", + "cassowary", + "compact_str", + "crossterm", + "indoc", + "instability", + "itertools", + "lru", + "paste", + "strum", + "unicode-segmentation", + "unicode-truncate", + "unicode-width 0.2.0", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "signal-hook" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b75a19a7a740b25bc7944bdee6172368f988763b744e3d4dfe753f6b4ece40cc" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + 
"quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-segmentation" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" + +[[package]] +name = "unicode-truncate" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3644627a5af5fa321c95b9b235a72fd24cd29c648c2c379431e6628655627bf" +dependencies = [ + "itertools", + "unicode-segmentation", + "unicode-width 0.1.14", +] + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/tools/dkg-stress/Cargo.toml b/tools/dkg-stress/Cargo.toml new file mode 100644 index 00000000..6bc08d8c --- /dev/null +++ b/tools/dkg-stress/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "dkg-stress" +version = "0.1.0" +edition = "2024" +publish = false +description = "Stress runner for DKG ceremonies (wraps scripts/dkg-runner/run.sh, ratatui UI)" + +[[bin]] +name = "dkg-stress" 
+path = "src/main.rs" + +[dependencies] +anyhow = "1" +ansi-to-tui = "7" +clap = { version = "4.5", features = ["derive", "env"] } +ratatui = "0.29" +libc = "0.2" +signal-hook = "0.3" + +[profile.release] +opt-level = 3 +lto = "thin" diff --git a/tools/dkg-stress/README.md b/tools/dkg-stress/README.md new file mode 100644 index 00000000..cadb0b3c --- /dev/null +++ b/tools/dkg-stress/README.md @@ -0,0 +1,200 @@ +# dkg-stress + +Stress runner for DKG ceremonies. Wraps `scripts/dkg-runner/run.sh` to execute +N ceremonies (sequentially or in parallel), with a live ratatui UI for +inspecting in-flight progress and per-node logs. + +This crate lives outside the main Pluto workspace (`exclude` entry in the root +`Cargo.toml`) so it has its own dependency graph and `Cargo.lock`. Build and +run it locally β€” it isn't part of `cargo build --workspace`. + +## Build + +```bash +cd tools/dkg-stress +cargo build --release +``` + +The binary lands at `tools/dkg-stress/target/release/dkg-stress`. + +`run.sh`'s prerequisites still apply: `charon` on `$PATH` (or via `CHARON_BIN`), +`pluto` built (only if `PLUTO_NODES > 0`), and a reachable relay. See +`scripts/dkg-runner/README.md` for the per-ceremony setup. + +## Quick start + +```bash +# 50 ceremonies, 4 in flight at a time, 5-minute timeout per run. +CHARON_BIN=~/projects/charon/charon RUNS=50 WORKERS=4 TIMEOUT=300 \ + ./tools/dkg-stress/target/release/dkg-stress + +# Same thing with flags rather than env vars. +./tools/dkg-stress/target/release/dkg-stress \ + --runs 50 --workers 4 + +# Sequential smoke test, all-Pluto, keep all artifacts for inspection. +PLUTO_NODES=4 CHARON_NODES=0 \ + ./tools/dkg-stress/target/release/dkg-stress \ + --runs 5 --keep-passed + +# Append-only mode (CI, log capture, redirected output). +./tools/dkg-stress/target/release/dkg-stress --runs 10 --no-tui +``` + +## Configuration + +Every option supports both a CLI flag and an environment variable. 
Flags win +when both are set; otherwise env vars; otherwise defaults. + +### Stress-runner options + +| Flag | Env var | Default | Description | +|---|---|---|---| +| `-n`, `--runs` | `RUNS` | `10` | Total ceremonies to run | +| `-w`, `--workers` | `WORKERS` | `1` | Concurrent ceremonies | +| `--work-dir` | `STRESS_WORK_DIR` | `/tmp/dkg-stress` | Base directory; each run uses `run-NNNN/` | +| `--run-script` | `DKG_RUN_SCRIPT` | `../../scripts/dkg-runner/run.sh` (relative to crate) | Path to `run.sh` | +| `--keep-passed` | `KEEP_PASSED` | off | Keep `node-*/` dirs of passed runs (default trims them) | +| `--no-tui` | `NO_TUI` | off | Disable ratatui UI; emit per-run log lines | +| `--tick-ms` | `TICK_MS` | `250` | UI redraw interval | + +### Per-ceremony options (forwarded to `run.sh` via env) + +These are inherited from the calling environment unchanged β€” see +`scripts/dkg-runner/run.sh --help` for the authoritative list: + +`NODES`, `THRESHOLD`, `PLUTO_NODES`, `CHARON_NODES`, `RELAY_URL`, `NETWORK`, +`FEE_RECIPIENT`, `WITHDRAWAL_ADDR`, `TIMEOUT`, `NODE_EXIT_TIMEOUT`, +`SHUTDOWN_DELAY`, `PLUTO_BIN`, `CHARON_BIN`. + +`WORK_DIR` is overridden per run and is **not** forwarded β€” each ceremony gets +its own isolated work dir under `STRESS_WORK_DIR`. `CI` is forced to `true` +when `WORKERS > 1` so per-node logs don't tee to the controlling terminal +(unless you explicitly export `CI` yourself). 
+ +## TUI + +``` +β”Œβ”€ DKG stress test ────────────────────────────────────────────────┐ +β”‚ runs=50 workers=4 work_dir=/tmp/dkg-stress β”‚ +β”‚ j/k=run Β· J/K=Β±10 Β· Home/End Β· Tab/h/l=log Β· PgUp/PgDn=scroll … β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ runs β”‚ run-0017 β€” running β”‚ +β”‚ run-0001 PASS β”‚ run.log β”‚ node-0 β”‚ node-1 β”‚ node-2 β”‚ node-3 β”‚ +β”‚ run-0002 PASS β”‚ ───────────────────────────────────────────── β”‚ +β”‚ run-0003 FAIL β”‚ 2026-05-08T... INFO pluto::dkg starting β”‚ +β”‚ β–Άrun-0017 run.. β”‚ ... β”‚ +β”‚ run-0018 pend β”‚ β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ PASS 16 FAIL 1 run 4 pend 29 (17/50 done) follow=auto … β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +Each in-flight run's row mutates `pending β†’ running Ns β†’ PASS/FAIL Ns` in +place. The detail pane on the right tails the selected log file (last +~256 KB), parsing ANSI escape codes so the colored Pluto/Charon log output +renders correctly. 
+ +### Keybindings + +**Run selection (left pane)** + +| Key | Action | +|---|---| +| `j` `k` `↓` `↑` | Move selection by 1 | +| `J` `K` | Move selection by 10 | +| `Home` `End` | First / last run | +| `a` | Re-engage auto-follow (selection tracks the latest active run) | + +**Log navigation (right pane)** + +| Key | Action | +|---|---| +| `Tab` `Shift-Tab` `h` `l` `←` `β†’` | Cycle log file (`run.log`, `node-0`, `node-1`, …) | +| `PgUp` `PgDn` | Scroll log by ~20 lines | +| `Ctrl-u` `Ctrl-d` | Scroll log by ~10 lines (vim half-page) | +| `Ctrl-b` `Ctrl-f` | Scroll log by ~20 lines (vim full-page) | +| `g` | Jump to top of buffer | +| `G` | Jump to tail (resume live updates) | + +**Other** + +| Key | Action | +|---|---| +| `q` `Esc` `Ctrl-C` | Graceful shutdown β€” SIGTERMs in-flight ceremonies, finalises the summary | + +Once you scroll up or move the selection, the footer shows `follow=manual` +(selection pinned) and/or `log=+N (G to follow)` (log offset). Press `a` to +return to auto-follow, `G` to snap the log back to its tail. + +## Output + +For each invocation, `dkg-stress` writes: + +``` +${STRESS_WORK_DIR}/ +β”œβ”€β”€ summary.tsv # one row per completed run +β”œβ”€β”€ run-0001/ +β”‚ β”œβ”€β”€ run.log # full stdout/stderr of this run.sh invocation +β”‚ β”œβ”€β”€ node-0/node.log # per-node logs (passed runs trim these by default) +β”‚ β”œβ”€β”€ node-1/node.log +β”‚ └── … +β”œβ”€β”€ run-0002/ +└── … +``` + +`summary.tsv` columns: `run_id`, `status` (`pass`/`fail`), `duration_s`, +`start_time` (ISO-8601 UTC), `work_dir`. New rows are appended atomically as +ceremonies complete. + +When `--keep-passed` is off (the default), `node-*/` subdirs of passed runs +are deleted to keep disk usage bounded. `run.log` and the cluster lock files +are always preserved. Failed runs are kept in full. 
+ +## Exit codes + +| Code | Meaning | +|---|---| +| `0` | All ceremonies passed | +| `1` | One or more failed; details in the final summary and `summary.tsv` | +| `2` | Configuration error (bad flag, missing `run.sh`, etc.) | +| `130` | Interrupted (SIGINT/SIGTERM/SIGHUP); in-flight ceremonies are SIGTERM'd, partial summary preserved | + +## Graceful shutdown + +`q`, `Esc`, and `Ctrl-C` from the TUI, plus external `SIGINT` / `SIGTERM` / +`SIGHUP`, all flow through the same path: + +1. Set the shared stop flag β€” workers stop dispatching new runs. +2. SIGTERM every in-flight `run.sh` process group, so each ceremony's + `_on_signal` trap fires and shuts the four nodes down cleanly. +3. Wait up to 5 s for clean exits, then SIGKILL stragglers. +4. Restore the terminal, finalise `summary.tsv`, print aggregate stats. + +No orphan processes; partial runs are recorded as `fail` with their actual +runtime, un-started runs as "skipped". + +## Troubleshooting + +**"could not find run.sh"** β€” pass `--run-script` or set `DKG_RUN_SCRIPT`. The +default lookup walks two directories up from the binary's manifest dir, so it +only auto-resolves when running from a checkout. + +**TUI is garbled / shows raw escape codes** β€” pluto/charon logs are now +parsed with `ansi-to-tui`. If you still see escapes, the file likely contains +non-SGR control sequences; switch tabs or hit `g` to refresh. + +**Scrolling does nothing** β€” make sure you're hitting the log pane keys +(`PgUp`/`PgDn`, `Ctrl-u`/`Ctrl-d`), not the run-selection keys (`j`/`k`). +The detail title shows `[+N lines]` once you've scrolled. Bear in mind the +buffer is the last 256 KB of the file β€” extremely long ceremonies will only +let you scroll back through that window. + +**"all failed" with no obvious cause** β€” open one of the failed runs in the +TUI, cycle through `run.log` (orchestration output) and each `node-N/node.log` +to find the first error. 
If `KEEP_PASSED` was off and you want artifacts of +all runs, re-run with `--keep-passed`. + +**Workers wedged after Ctrl-C** β€” should not happen; check +`pgrep -fl run.sh`. If anything sticks around, file an issue with the +`/tmp/dkg-stress/run-NNNN/` directory contents. diff --git a/tools/dkg-stress/src/cli.rs b/tools/dkg-stress/src/cli.rs new file mode 100644 index 00000000..5ff5c916 --- /dev/null +++ b/tools/dkg-stress/src/cli.rs @@ -0,0 +1,45 @@ +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser, Debug, Clone)] +#[command( + name = "dkg-stress", + about = "Run N DKG ceremonies (back-to-back or in parallel) with a live ratatui UI.", + long_about = "Wraps scripts/dkg-runner/run.sh, dispatching N runs across W parallel \ + workers with isolated WORK_DIRs. Per-run config (NODES, THRESHOLD, \ + PLUTO_NODES, CHARON_NODES, TIMEOUT, etc.) is forwarded to run.sh \ + via the inherited environment β€” see run.sh --help for the full list." +)] +pub struct Cli { + /// Total number of ceremonies to run. + #[arg(short = 'n', long, env = "RUNS", default_value_t = 10)] + pub runs: u32, + + /// Number of ceremonies in flight at the same time. + #[arg(short = 'w', long, env = "WORKERS", default_value_t = 1)] + pub workers: u32, + + /// Base directory; each run uses run-NNNN/ inside it. + #[arg(long, env = "STRESS_WORK_DIR", default_value = "/tmp/dkg-stress")] + pub work_dir: PathBuf, + + /// Path to scripts/dkg-runner/run.sh. Defaults to the script next to the + /// repo's checked-in copy, resolved relative to the binary's location. + #[arg(long, env = "DKG_RUN_SCRIPT")] + pub run_script: Option<PathBuf>, + + /// Keep full per-run dirs even on success. By default, node-*/ subdirs of + /// passed runs are deleted to save disk; failed run dirs are always kept. + #[arg(long, env = "KEEP_PASSED")] + pub keep_passed: bool, + + /// Disable the ratatui UI; emit per-run log lines instead. Auto-enabled + /// when stdout isn't a TTY or CI is set.
+ #[arg(long, env = "NO_TUI")] + pub no_tui: bool, + + /// UI tick rate in milliseconds (how often the table redraws and elapsed + /// counters advance). Lower = smoother but more CPU. + #[arg(long, env = "TICK_MS", default_value_t = 250)] + pub tick_ms: u64, +} diff --git a/tools/dkg-stress/src/config.rs b/tools/dkg-stress/src/config.rs new file mode 100644 index 00000000..c9989634 --- /dev/null +++ b/tools/dkg-stress/src/config.rs @@ -0,0 +1,125 @@ +use anyhow::{Context, Result, bail}; +use std::fs; +use std::io::{BufWriter, Write}; +use std::path::{Path, PathBuf}; +use std::sync::Mutex; + +use crate::cli::Cli; + +/// Resolved configuration shared across worker threads. All fields are +/// immutable after construction; mutable shared state lives on `App` instead. +pub struct Config { + pub runs: u32, + pub workers: u32, + pub work_dir: PathBuf, + pub run_script: PathBuf, + pub keep_passed: bool, + pub no_tui: bool, + pub tick_ms: u64, + pub worker_ci: String, + pub summary_path: PathBuf, + /// Serialised writer for the TSV summary (multiple workers append to it). + pub summary: Mutex<BufWriter<fs::File>>, +} + +impl Config { + pub fn from_cli(cli: Cli) -> Result<Self> { + if cli.runs == 0 { + bail!("RUNS must be >= 1 (got {})", cli.runs); + } + if cli.workers == 0 { + bail!("WORKERS must be >= 1 (got {})", cli.workers); + } + let workers = cli.workers.min(cli.runs); + + let run_script = match cli.run_script { + Some(p) => p, + None => default_run_script()?, + }; + let run_script = run_script + .canonicalize() + .with_context(|| format!("run script not found: {}", run_script.display()))?; + if !run_script.is_file() { + bail!("run script is not a regular file: {}", run_script.display()); + } + + // Force CI=true for parallel runs so per-node logs don't tee to the + // controlling terminal (run.sh suppresses tee under CI). Honour any + // existing CI value the user explicitly set.
+ let worker_ci = match std::env::var("CI") { + Ok(v) if !v.is_empty() => v, + _ if workers > 1 => "true".to_string(), + _ => String::new(), + }; + + fs::create_dir_all(&cli.work_dir) + .with_context(|| format!("create work dir {}", cli.work_dir.display()))?; + let summary_path = cli.work_dir.join("summary.tsv"); + let summary_file = fs::File::create(&summary_path) + .with_context(|| format!("create summary file {}", summary_path.display()))?; + let mut summary = BufWriter::new(summary_file); + writeln!(summary, "run_id\tstatus\tduration_s\tstart_time\twork_dir")?; + summary.flush()?; + + Ok(Self { + runs: cli.runs, + workers, + work_dir: cli.work_dir, + run_script, + keep_passed: cli.keep_passed, + no_tui: cli.no_tui, + tick_ms: cli.tick_ms.max(50), + worker_ci, + summary_path, + summary: Mutex::new(summary), + }) + } + + pub fn append_summary_line( + &self, + label: &str, + status: &str, + duration_s: u64, + start_time_iso: &str, + run_dir: &Path, + ) -> Result<()> { + let mut w = self + .summary + .lock() + .map_err(|_| anyhow::anyhow!("summary writer lock poisoned"))?; + writeln!( + w, + "{}\t{}\t{}\t{}\t{}", + label, + status, + duration_s, + start_time_iso, + run_dir.display() + )?; + w.flush()?; + Ok(()) + } +} + +/// Locate scripts/dkg-runner/run.sh relative to either the running binary +/// (when invoked from a checkout) or CWD as a final fallback. +fn default_run_script() -> Result { + // The crate lives at /tools/dkg-stress; the script lives at + // /scripts/dkg-runner/run.sh. Cargo sets CARGO_MANIFEST_DIR at + // compile time so we know the crate's location regardless of how the + // binary is launched. 
+    let manifest_dir = env!("CARGO_MANIFEST_DIR");
+    let candidate = Path::new(manifest_dir)
+        .join("..")
+        .join("..")
+        .join("scripts")
+        .join("dkg-runner")
+        .join("run.sh");
+    if candidate.exists() {
+        return Ok(candidate);
+    }
+    bail!(
+        "could not find run.sh at {} — pass --run-script or set DKG_RUN_SCRIPT",
+        candidate.display()
+    )
+}
diff --git a/tools/dkg-stress/src/logs.rs b/tools/dkg-stress/src/logs.rs
new file mode 100644
index 00000000..9a82ffe0
--- /dev/null
+++ b/tools/dkg-stress/src/logs.rs
@@ -0,0 +1,37 @@
+use std::fs::{self, File};
+use std::io::{Read, Seek, SeekFrom};
+use std::path::{Path, PathBuf};
+
+/// Read up to `max_bytes` from the end of `path`, returning the trailing
+/// portion as a UTF-8 string (lossily decoded). Returns None on any I/O
+/// error or if the path doesn't exist.
+pub fn read_tail(path: &Path, max_bytes: u64) -> Option<String> {
+    let mut f = File::open(path).ok()?;
+    let len = f.metadata().ok()?.len();
+    let start = len.saturating_sub(max_bytes);
+    f.seek(SeekFrom::Start(start)).ok()?;
+    let mut buf = Vec::with_capacity(max_bytes.min(64 * 1024) as usize);
+    f.take(max_bytes).read_to_end(&mut buf).ok()?;
+    Some(String::from_utf8_lossy(&buf).into_owned())
+}
+
+/// Enumerate node-* subdirectories of `run_dir`, sorted by name. Empty if
+/// the run directory doesn't exist yet (pending) or has been pruned (passed
+/// run with KEEP_PASSED off).
+pub fn enumerate_nodes(run_dir: &Path) -> Vec<PathBuf> {
+    let Ok(entries) = fs::read_dir(run_dir) else {
+        return Vec::new();
+    };
+    let mut nodes: Vec<PathBuf> = entries
+        .flatten()
+        .map(|e| e.path())
+        .filter(|p| {
+            p.is_dir()
+                && p.file_name()
+                    .and_then(|n| n.to_str())
+                    .is_some_and(|n| n.starts_with("node-"))
+        })
+        .collect();
+    nodes.sort();
+    nodes
+}
diff --git a/tools/dkg-stress/src/main.rs b/tools/dkg-stress/src/main.rs
new file mode 100644
index 00000000..74b71b41
--- /dev/null
+++ b/tools/dkg-stress/src/main.rs
@@ -0,0 +1,248 @@
+//! DKG stress runner with a ratatui-based UI.
+//!
+//! Wraps `scripts/dkg-runner/run.sh` to execute N ceremonies, optionally in +//! parallel, with live status visualisation. Per-run config (NODES, THRESHOLD, +//! PLUTO_NODES, CHARON_NODES, TIMEOUT, …) is forwarded via the inherited +//! environment β€” see `run.sh --help`. + +mod cli; +mod config; +mod logs; +mod state; +mod ui; +mod worker; + +use anyhow::Result; +use clap::Parser; +use std::collections::HashSet; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread::{self, JoinHandle}; +use std::time::{Duration, Instant}; + +use crate::cli::Cli; +use crate::config::Config; +use crate::state::{App, RunState}; +use crate::worker::{spawn_workers, kill_all}; + +fn main() -> Result<()> { + let cli = Cli::parse(); + let config = Arc::new(Config::from_cli(cli)?); + + let app = Arc::new(Mutex::new(App::new(config.runs as usize))); + let stop = Arc::new(AtomicBool::new(false)); + install_signal_handlers(&stop)?; + let killers = Arc::new(Mutex::new(HashSet::new())); + + let workers = spawn_workers(config.clone(), app.clone(), stop.clone(), killers.clone()); + + // Auto-disable the TUI when stdout isn't a TTY (piped/redirected) β€” the + // alt-screen escapes would garble the captured output. The explicit + // --no-tui flag overrides regardless. + let use_tui = !config.no_tui && is_tty_stdout(); + + if use_tui { + let workers_done = make_done_check(&workers); + ui::run_tui( + config.clone(), + app.clone(), + stop.clone(), + killers.clone(), + workers_done, + )?; + } else { + run_logging(&config, &app, &stop, &workers); + } + + // Whatever path got us here (TUI quit, all workers finished, or the + // logging loop returned), make sure no children outlive us and the + // worker threads have a chance to drain their final-state writes. 
+ if !workers + .iter() + .all(|h| h.is_finished()) + { + stop.store(true, Ordering::Relaxed); + kill_all(&killers, Duration::from_secs(5)); + } + for h in workers { + let _ = h.join(); + } + + print_final_summary(&config, &app); + + let any_fail = match app.lock() { + Ok(a) => a.runs.iter().any(|s| matches!(s, RunState::Fail { .. })), + Err(_) => true, + }; + if any_fail { + std::process::exit(1); + } + Ok(()) +} + +fn make_done_check(workers: &[JoinHandle<()>]) -> impl Fn() -> bool + '_ { + move || workers.iter().all(|h| h.is_finished()) +} + +/// Replace the default termination handlers so SIGINT/SIGTERM/SIGHUP flip +/// the shared stop flag instead of killing us outright. This lets the TUI +/// restore the terminal and the dispatch path SIGTERM in-flight ceremonies +/// before we exit, regardless of whether the signal arrived from a tty +/// Ctrl-C (no-tui mode) or an external `kill`. +#[cfg(unix)] +fn install_signal_handlers(stop: &Arc) -> Result<()> { + use signal_hook::consts::{SIGHUP, SIGINT, SIGTERM}; + for sig in [SIGINT, SIGTERM, SIGHUP] { + signal_hook::flag::register(sig, stop.clone())?; + } + Ok(()) +} + +#[cfg(not(unix))] +fn install_signal_handlers(_stop: &Arc) -> Result<()> { + Ok(()) +} + +fn is_tty_stdout() -> bool { + // SAFETY: isatty is a pure libc syscall taking an fd; STDOUT_FILENO is + // always a valid file descriptor for our process. + unsafe { libc::isatty(libc::STDOUT_FILENO) == 1 } +} + +/// Append-only fallback for non-TTY / `--no-tui` runs. Polls App state and +/// emits one line per state transition, plus a heartbeat counter. 
+fn run_logging( + config: &Config, + app: &Mutex, + stop: &AtomicBool, + workers: &[JoinHandle<()>], +) { + eprintln!( + "dkg-stress: runs={} workers={} work_dir={}", + config.runs, + config.workers, + config.work_dir.display() + ); + let total = config.runs as usize; + let mut last: Vec = vec![RunStateTag::Pending; total]; + + loop { + let snapshot: Vec = match app.lock() { + Ok(a) => a.runs.clone(), + Err(_) => return, + }; + for (i, state) in snapshot.iter().enumerate() { + let tag = tag(state); + if tag != last[i] { + emit_transition(i + 1, state); + last[i] = tag; + } + } + if workers.iter().all(|h| h.is_finished()) { + return; + } + if stop.load(Ordering::Relaxed) { + eprintln!("dkg-stress: caught signal β€” terminating in-flight ceremonies"); + return; + } + thread::sleep(Duration::from_millis(config.tick_ms)); + } +} + +#[derive(Clone, Copy, PartialEq, Eq)] +enum RunStateTag { + Pending, + Running, + Pass, + Fail, +} + +fn tag(s: &RunState) -> RunStateTag { + match s { + RunState::Pending => RunStateTag::Pending, + RunState::Running { .. } => RunStateTag::Running, + RunState::Pass { .. } => RunStateTag::Pass, + RunState::Fail { .. } => RunStateTag::Fail, + } +} + +fn emit_transition(id: usize, state: &RunState) { + match state { + RunState::Pending => {} + RunState::Running { .. 
} => { + println!("[run-{:04}] starting", id); + } + RunState::Pass { duration_s } => { + println!("[run-{:04}] PASS in {}s", id, duration_s); + } + RunState::Fail { duration_s } => { + eprintln!("[run-{:04}] FAIL after {}s", id, duration_s); + } + } +} + +fn print_final_summary(config: &Config, app: &Mutex) { + let snapshot: Vec = match app.lock() { + Ok(a) => a.runs.clone(), + Err(_) => return, + }; + let mut passed = 0u64; + let mut failed = 0u64; + let mut pending = 0u64; + let mut min_d = u64::MAX; + let mut max_d = 0u64; + let mut sum_d = 0u64; + let mut n_d = 0u64; + for s in &snapshot { + match s { + RunState::Pass { duration_s } => { + passed += 1; + update_stats(*duration_s, &mut min_d, &mut max_d, &mut sum_d, &mut n_d); + } + RunState::Fail { duration_s } => { + failed += 1; + update_stats(*duration_s, &mut min_d, &mut max_d, &mut sum_d, &mut n_d); + } + _ => pending += 1, + } + } + + println!("=============================================="); + println!("dkg-stress complete"); + println!(" Passed: {}/{}", passed, snapshot.len()); + println!(" Failed: {}/{}", failed, snapshot.len()); + if pending > 0 { + println!(" Skipped: {} (aborted before they ran)", pending); + } + if n_d > 0 { + let mean = (sum_d as f64) / (n_d as f64); + println!(" Duration min/mean/max = {}s / {:.1}s / {}s", min_d, mean, max_d); + } + println!(" Summary: {}", config.summary_path.display()); + + if failed > 0 { + println!("Failed runs:"); + for (i, s) in snapshot.iter().enumerate() { + if let RunState::Fail { duration_s } = s { + let label = format!("run-{:04}", i + 1); + let dir = config.work_dir.join(&label); + println!(" {} ({}s) {}", label, duration_s, dir.display()); + } + } + } + println!("=============================================="); + + // Suppress unused-import warning when we only conditionally read Instant. 
+ let _ = Instant::now; +} + +fn update_stats(d: u64, min_d: &mut u64, max_d: &mut u64, sum_d: &mut u64, n: &mut u64) { + if d < *min_d { + *min_d = d; + } + if d > *max_d { + *max_d = d; + } + *sum_d = sum_d.saturating_add(d); + *n = n.saturating_add(1); +} diff --git a/tools/dkg-stress/src/state.rs b/tools/dkg-stress/src/state.rs new file mode 100644 index 00000000..7305cf64 --- /dev/null +++ b/tools/dkg-stress/src/state.rs @@ -0,0 +1,215 @@ +use std::time::Instant; + +#[derive(Clone, Copy, Debug)] +pub enum RunState { + Pending, + Running { started_at: Instant }, + Pass { duration_s: u64 }, + Fail { duration_s: u64 }, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ActivePane { + Runs, + Logs, +} + +pub struct App { + pub runs: Vec, + /// Which pane receives navigation and scroll input. + pub active_pane: ActivePane, + /// 0-based index of the run currently displayed in the detail pane. + pub selected_run: usize, + /// 0 = run.log, 1..=N = node-(idx-1)/node.log. + pub selected_tab: usize, + /// True once the user has navigated manually; suppresses auto-follow so + /// the table doesn't yank focus away from what they're inspecting. + pub manual_select: bool, + /// Number of lines scrolled back from the tail of the active log. 0 + /// means "stick to the tail" (live updates appear). Grows as the user + /// scrolls up; clamped on render to the available content. Reset on + /// run/tab switch and on `G` / End. 
+ pub log_scroll: usize, +} + +impl App { + pub fn new(total: usize) -> Self { + Self { + runs: vec![RunState::Pending; total], + active_pane: ActivePane::Runs, + selected_run: 0, + selected_tab: 0, + manual_select: false, + log_scroll: 0, + } + } + + pub fn focus_runs(&mut self) { + self.active_pane = ActivePane::Runs; + self.manual_select = true; + } + + pub fn focus_logs(&mut self) { + self.active_pane = ActivePane::Logs; + self.manual_select = true; + } + + pub fn toggle_pane(&mut self) { + self.active_pane = match self.active_pane { + ActivePane::Runs => ActivePane::Logs, + ActivePane::Logs => ActivePane::Runs, + }; + self.manual_select = true; + } + + pub fn next_run(&mut self) { + if self.runs.is_empty() { + return; + } + if self.selected_run + 1 < self.runs.len() { + self.selected_run += 1; + } + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn prev_run(&mut self) { + self.selected_run = self.selected_run.saturating_sub(1); + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn next_run_page(&mut self, page: usize) { + let last = self.runs.len().saturating_sub(1); + self.selected_run = self.selected_run.saturating_add(page).min(last); + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn prev_run_page(&mut self, page: usize) { + self.selected_run = self.selected_run.saturating_sub(page); + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn first_run(&mut self) { + self.selected_run = 0; + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn last_run(&mut self) { + self.selected_run = self.runs.len().saturating_sub(1); + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn next_tab(&mut self, tab_count: usize) { + if tab_count == 0 { + return; + } + self.selected_tab = (self.selected_tab + 1) % tab_count; + self.log_scroll = 0; + } + + pub fn prev_tab(&mut self, tab_count: usize) { + if tab_count == 0 { + return; + } + self.selected_tab = if self.selected_tab == 0 { + tab_count - 
1 + } else { + self.selected_tab - 1 + }; + self.log_scroll = 0; + } + + pub fn scroll_log_up(&mut self, lines: usize) { + self.log_scroll = self.log_scroll.saturating_add(lines); + // Pin the selected run while reading scrollback so auto_advance + // doesn't yank us to a different ceremony mid-scroll. `a` / `G` + // re-engage auto-follow. + self.manual_select = true; + } + + pub fn scroll_log_down(&mut self, lines: usize) { + self.log_scroll = self.log_scroll.saturating_sub(lines); + self.manual_select = true; + } + + pub fn scroll_log_to_tail(&mut self) { + self.log_scroll = 0; + } + + /// "Go to top" β€” set scroll past any sane document length; render code + /// clamps to the actual line count. + pub fn scroll_log_to_top(&mut self) { + self.log_scroll = usize::MAX / 2; + self.manual_select = true; + } + + /// Re-engage auto-follow (selection tracks the active frontier again). + pub fn follow_auto(&mut self) { + self.manual_select = false; + self.log_scroll = 0; + self.active_pane = ActivePane::Runs; + } + + /// If the user hasn't taken manual control, keep the selection on the + /// most-recent active run. Resets log scroll when the focus moves so + /// the live tail kicks back in. + pub fn auto_advance_selection(&mut self) { + if self.manual_select { + return; + } + if let Some(idx) = self.focus_idx() + && self.selected_run != idx + { + self.selected_run = idx; + self.log_scroll = 0; + } + } + + pub fn counts(&self) -> Counts { + let mut c = Counts::default(); + for state in &self.runs { + match state { + RunState::Pending => c.pending += 1, + RunState::Running { .. } => c.running += 1, + RunState::Pass { .. } => c.passed += 1, + RunState::Fail { .. } => c.failed += 1, + } + } + c + } + + /// The largest 1-based run index that is no longer Pending. Used by the + /// UI as the auto-scroll focus so the table follows the active frontier. 
+ pub fn focus_idx(&self) -> Option { + self.runs + .iter() + .enumerate() + .rev() + .find_map(|(i, s)| (!matches!(s, RunState::Pending)).then_some(i)) + } +} + +#[derive(Default, Clone, Copy)] +pub struct Counts { + pub passed: usize, + pub failed: usize, + pub running: usize, + pub pending: usize, +} + +impl Counts { + pub fn done(&self) -> usize { + self.passed.saturating_add(self.failed) + } + pub fn total(&self) -> usize { + self.passed + .saturating_add(self.failed) + .saturating_add(self.running) + .saturating_add(self.pending) + } +} diff --git a/tools/dkg-stress/src/ui.rs b/tools/dkg-stress/src/ui.rs new file mode 100644 index 00000000..432fca93 --- /dev/null +++ b/tools/dkg-stress/src/ui.rs @@ -0,0 +1,700 @@ +use ansi_to_tui::IntoText; +use anyhow::Result; +use ratatui::Frame; +use ratatui::crossterm::{ + event::{ + self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, KeyEventKind, KeyModifiers, + MouseEvent, MouseEventKind, + }, + execute, +}; +use ratatui::layout::{Constraint, Layout, Position, Rect}; +use ratatui::style::{Color, Modifier, Style}; +use ratatui::text::{Line, Span, Text}; +use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, TableState, Tabs, Wrap}; +use std::io; +use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use crate::config::Config; +use crate::logs::{enumerate_nodes, read_tail}; +use crate::state::{ActivePane, App, Counts, RunState}; +use crate::worker::{Killers, kill_all}; + +/// Maximum bytes read from the end of a log file per draw. 256KB gives +/// the user meaningful room to scroll back through a node's log while +/// bounding I/O regardless of total log size. 
+const LOG_TAIL_BYTES: u64 = 256 * 1024; + +pub fn run_tui( + config: Arc, + app: Arc>, + stop: Arc, + killers: Killers, + workers_done: impl Fn() -> bool, +) -> Result<()> { + let mut terminal = ratatui::init(); + if let Err(err) = execute!(io::stdout(), EnableMouseCapture) { + ratatui::restore(); + return Err(err.into()); + } + let result = event_loop(&mut terminal, &config, &app, &stop, &killers, workers_done); + let _ = execute!(io::stdout(), DisableMouseCapture); + ratatui::restore(); + result +} + +fn event_loop( + terminal: &mut ratatui::DefaultTerminal, + config: &Config, + app: &Mutex, + stop: &AtomicBool, + killers: &Killers, + workers_done: impl Fn() -> bool, +) -> Result<()> { + let tick = Duration::from_millis(config.tick_ms); + let mut next_tick = Instant::now() + tick; + let mut table_state = TableState::default(); + let mut completed = false; + + loop { + // External shutdown (SIGINT/SIGTERM/SIGHUP from the signal handler + // installed in main). Mirror the in-TUI quit path: SIGTERM the + // in-flight ceremonies, then return so ratatui::restore() runs. + if stop.load(Ordering::Relaxed) { + kill_all(killers, Duration::from_secs(5)); + return Ok(()); + } + + terminal.draw(|frame| draw(frame, config, app, &mut table_state, completed))?; + + if workers_done() { + completed = true; + terminal.draw(|frame| draw(frame, config, app, &mut table_state, completed))?; + } + + let now = Instant::now(); + let timeout = next_tick.saturating_duration_since(now); + if event::poll(timeout)? { + match event::read()? { + Event::Key(key) + if key.kind == KeyEventKind::Press + && handle_key(key.code, key.modifiers, config, app, stop, killers) => + { + return Ok(()); + } + Event::Mouse(mouse) => handle_mouse(mouse, terminal, config, app)?, + _ => {} + } + } + if Instant::now() >= next_tick { + next_tick = Instant::now() + tick; + } + } +} + +/// Returns true if the caller should exit the event loop. 
+fn handle_key( + code: KeyCode, + mods: KeyModifiers, + config: &Config, + app: &Mutex, + stop: &AtomicBool, + killers: &Killers, +) -> bool { + let quit = matches!(code, KeyCode::Char('q') | KeyCode::Esc) + || (code == KeyCode::Char('c') && mods.contains(KeyModifiers::CONTROL)); + if quit { + stop.store(true, Ordering::Relaxed); + kill_all(killers, Duration::from_secs(5)); + return true; + } + + let Ok(mut a) = app.lock() else { + return false; + }; + let ctrl = mods.contains(KeyModifiers::CONTROL); + match code { + KeyCode::Tab | KeyCode::BackTab => a.toggle_pane(), + KeyCode::Char('1') => a.focus_runs(), + KeyCode::Char('2') => a.focus_logs(), + KeyCode::Char('a') => a.follow_auto(), + _ => match a.active_pane { + ActivePane::Runs => handle_runs_key(code, &mut a), + ActivePane::Logs => handle_logs_key(code, ctrl, config, &mut a), + }, + } + false +} + +fn handle_runs_key(code: KeyCode, app: &mut App) { + match code { + KeyCode::Down | KeyCode::Char('j') => app.next_run(), + KeyCode::Up | KeyCode::Char('k') => app.prev_run(), + KeyCode::PageDown | KeyCode::Char('J') => app.next_run_page(10), + KeyCode::PageUp | KeyCode::Char('K') => app.prev_run_page(10), + KeyCode::Home => app.first_run(), + KeyCode::End => app.last_run(), + KeyCode::Right | KeyCode::Char('l') | KeyCode::Enter => app.focus_logs(), + _ => {} + } +} + +fn handle_logs_key(code: KeyCode, ctrl: bool, config: &Config, app: &mut App) { + match code { + KeyCode::Left | KeyCode::Char('h') => { + let tabs = tab_count_for(config, app, app.selected_run); + app.prev_tab(tabs); + } + KeyCode::Right | KeyCode::Char('l') => { + let tabs = tab_count_for(config, app, app.selected_run); + app.next_tab(tabs); + } + KeyCode::Up | KeyCode::Char('k') => app.scroll_log_up(1), + KeyCode::Down | KeyCode::Char('j') => app.scroll_log_down(1), + KeyCode::PageUp => app.scroll_log_up(20), + KeyCode::PageDown => app.scroll_log_down(20), + KeyCode::Char('u') if ctrl => app.scroll_log_up(10), + KeyCode::Char('d') if ctrl => 
app.scroll_log_down(10), + KeyCode::Char('b') if ctrl => app.scroll_log_up(20), + KeyCode::Char('f') if ctrl => app.scroll_log_down(20), + KeyCode::Home | KeyCode::Char('g') => app.scroll_log_to_top(), + KeyCode::End | KeyCode::Char('G') => app.scroll_log_to_tail(), + _ => {} + } +} + +fn handle_mouse( + mouse: MouseEvent, + terminal: &ratatui::DefaultTerminal, + config: &Config, + app: &Mutex, +) -> Result<()> { + let size = terminal.size()?; + let areas = ui_areas(Rect::new(0, 0, size.width, size.height)); + let pos = Position { + x: mouse.column, + y: mouse.row, + }; + + let Ok(mut a) = app.lock() else { + return Ok(()); + }; + let target = pane_at(areas, pos).unwrap_or(a.active_pane); + match mouse.kind { + MouseEventKind::Down(_) => match target { + ActivePane::Runs => a.focus_runs(), + ActivePane::Logs => a.focus_logs(), + }, + MouseEventKind::ScrollUp => match target { + ActivePane::Runs => { + a.focus_runs(); + a.prev_run(); + } + ActivePane::Logs => { + a.focus_logs(); + a.scroll_log_up(3); + } + }, + MouseEventKind::ScrollDown => match target { + ActivePane::Runs => { + a.focus_runs(); + a.next_run(); + } + ActivePane::Logs => { + a.focus_logs(); + a.scroll_log_down(3); + } + }, + MouseEventKind::ScrollLeft if target == ActivePane::Logs => { + a.focus_logs(); + let tabs = tab_count_for(config, &a, a.selected_run); + a.prev_tab(tabs); + } + MouseEventKind::ScrollRight if target == ActivePane::Logs => { + a.focus_logs(); + let tabs = tab_count_for(config, &a, a.selected_run); + a.next_tab(tabs); + } + _ => {} + } + Ok(()) +} + +fn pane_at(areas: UiAreas, pos: Position) -> Option { + if areas.list.contains(pos) { + Some(ActivePane::Runs) + } else if areas.detail.contains(pos) { + Some(ActivePane::Logs) + } else { + None + } +} + +fn tab_count_for(config: &Config, app: &App, run_idx: usize) -> usize { + let Some(state) = app.runs.get(run_idx) else { + return 1; + }; + if matches!(state, RunState::Pending) { + return 1; // run.log only (and it'll show "not 
started") + } + let run_dir = run_dir_for(config, run_idx); + 1 + enumerate_nodes(&run_dir).len() +} + +fn run_dir_for(config: &Config, run_idx: usize) -> PathBuf { + config + .work_dir + .join(format!("run-{:04}", run_idx.saturating_add(1))) +} + +#[derive(Clone, Copy)] +struct UiAreas { + header: Rect, + list: Rect, + detail: Rect, + footer: Rect, +} + +#[derive(Clone, Copy)] +struct DetailView { + selected_run: usize, + selected_tab: usize, + active: bool, + log_scroll: usize, +} + +fn ui_areas(area: Rect) -> UiAreas { + let [header, body, footer] = Layout::vertical([ + Constraint::Length(5), + Constraint::Min(8), + Constraint::Length(3), + ]) + .areas(area); + let [list, detail] = + Layout::horizontal([Constraint::Percentage(40), Constraint::Percentage(60)]).areas(body); + UiAreas { + header, + list, + detail, + footer, + } +} + +fn draw( + frame: &mut Frame, + config: &Config, + app: &Mutex, + table_state: &mut TableState, + completed: bool, +) { + // Take everything we need from app under a single short lock, then + // release it before doing file I/O. That keeps worker threads from + // stalling on the lock during draws. 
+ let (snapshot, counts, active_pane, selected_run, selected_tab, manual_select, log_scroll) = { + let mut a = match app.lock() { + Ok(a) => a, + Err(_) => return, + }; + a.auto_advance_selection(); + ( + a.runs.clone(), + a.counts(), + a.active_pane, + a.selected_run, + a.selected_tab, + a.manual_select, + a.log_scroll, + ) + }; + let now = Instant::now(); + + let areas = ui_areas(frame.area()); + + frame.render_widget(header(config), areas.header); + + render_run_list( + frame, + areas.list, + &snapshot, + selected_run, + active_pane == ActivePane::Runs, + now, + table_state, + ); + let final_log_scroll = render_detail( + frame, + areas.detail, + config, + &snapshot, + DetailView { + selected_run, + selected_tab, + active: active_pane == ActivePane::Logs, + log_scroll, + }, + ); + + // Clamp the stored scroll back to whatever the renderer ended up using + // (lines available, screen height, etc.) so a future user keystroke + // operates on the actual offset rather than usize::MAX/2. + if final_log_scroll != log_scroll + && let Ok(mut a) = app.lock() + { + a.log_scroll = final_log_scroll; + } + + frame.render_widget( + footer(counts, active_pane, manual_select, log_scroll, completed), + areas.footer, + ); +} + +fn header(config: &Config) -> Paragraph<'_> { + Paragraph::new(vec![ + Line::from(vec![ + Span::styled("DKG stress test", Style::new().add_modifier(Modifier::BOLD)), + Span::raw(format!( + " runs={} workers={} work_dir={}", + config.runs, + config.workers, + config.work_dir.display() + )), + ]), + Line::from(Span::styled( + "Tab/click=focus Β· wheel=scroll active pane Β· a=auto Β· q=quit", + Style::new().fg(Color::DarkGray), + )), + Line::from(Span::styled( + "runs: j/k/Pg/Home/End Β· logs: j/k/Pg/Ctrl-u/d scroll, h/l tabs, g/G top/tail", + Style::new().fg(Color::DarkGray), + )), + ]) + .block(Block::default().borders(Borders::ALL)) +} + +fn render_run_list( + frame: &mut Frame, + area: ratatui::layout::Rect, + snapshot: &[RunState], + selected: usize, + 
active: bool, + now: Instant, + table_state: &mut TableState, +) { + let rows: Vec = snapshot + .iter() + .enumerate() + .map(|(i, state)| run_row(i + 1, *state, now)) + .collect(); + + let widths = [ + Constraint::Length(10), + Constraint::Length(10), + Constraint::Length(7), + ]; + + let block = active_block(" runs ", active); + let table = Table::new(rows, widths) + .header( + Row::new(vec![ + Cell::from("run").style(Style::new().add_modifier(Modifier::BOLD)), + Cell::from("status").style(Style::new().add_modifier(Modifier::BOLD)), + Cell::from("time").style(Style::new().add_modifier(Modifier::BOLD)), + ]) + .bottom_margin(0), + ) + .block(block) + .column_spacing(2) + .row_highlight_style(Style::new().bg(Color::DarkGray)); + + table_state.select(Some(selected.min(snapshot.len().saturating_sub(1)))); + frame.render_stateful_widget(table, area, table_state); +} + +fn active_block(title: &'static str, active: bool) -> Block<'static> { + let block = Block::default().borders(Borders::ALL).title(title); + if active { + block.border_style(active_border_style()) + } else { + block + } +} + +fn active_border_style() -> Style { + Style::new().fg(Color::Cyan).add_modifier(Modifier::BOLD) +} + +/// Returns the clamped log_scroll value actually used for rendering, so +/// the caller can persist it back into App state. 
+fn render_detail( + frame: &mut Frame, + area: ratatui::layout::Rect, + config: &Config, + snapshot: &[RunState], + view: DetailView, +) -> usize { + let label = format!("run-{:04}", view.selected_run.saturating_add(1)); + let run_dir = run_dir_for(config, view.selected_run); + let state = snapshot + .get(view.selected_run) + .copied() + .unwrap_or(RunState::Pending); + + let nodes = enumerate_nodes(&run_dir); + let mut tab_titles: Vec = Vec::with_capacity(1 + nodes.len()); + tab_titles.push("run.log".into()); + for n in &nodes { + if let Some(name) = n.file_name().and_then(|s| s.to_str()) { + tab_titles.push(name.to_string()); + } + } + + let tab_count = tab_titles.len(); + let active_tab = view.selected_tab.min(tab_count.saturating_sub(1)); + + let scroll_suffix = if view.log_scroll == 0 { + String::new() + } else { + format!(" [+{} lines]", view.log_scroll) + }; + let block = Block::default().borders(Borders::ALL).title(format!( + " {} β€” {}{} ", + label, + status_short(state), + scroll_suffix + )); + let block = if view.active { + block.border_style(active_border_style()) + } else { + block + }; + let inner = block.inner(area); + frame.render_widget(block, area); + + let [tabs_area, content_area] = + Layout::vertical([Constraint::Length(2), Constraint::Min(1)]).areas(inner); + + let tabs = Tabs::new( + tab_titles + .iter() + .map(|t| Line::from(t.as_str())) + .collect::>(), + ) + .select(active_tab) + .style(Style::new().fg(Color::Gray)) + .highlight_style(Style::new().fg(Color::Cyan).add_modifier(Modifier::BOLD)) + .divider(" β”‚ "); + frame.render_widget(tabs, tabs_area); + + let log_path = if active_tab == 0 { + run_dir.join("run.log") + } else { + let n = active_tab.saturating_sub(1); + nodes + .get(n) + .cloned() + .unwrap_or_else(|| run_dir.clone()) + .join("node.log") + }; + + let (body, used_scroll) = log_body( + &log_path, + state, + content_area.width, + content_area.height, + view.log_scroll, + ); + frame.render_widget(body, content_area); + 
used_scroll +} + +/// Renders the log pane body. Returns the actual scroll offset used +/// (clamped to available content) so the caller can persist it. +fn log_body( + path: &std::path::Path, + state: RunState, + width: u16, + height: u16, + scroll: usize, +) -> (Paragraph<'static>, usize) { + if matches!(state, RunState::Pending) { + let p = Paragraph::new(Line::from(Span::styled( + "(run not started yet)", + Style::new().fg(Color::DarkGray), + ))); + return (p, 0); + } + let raw = match read_tail(path, LOG_TAIL_BYTES) { + Some(s) if !s.is_empty() => s, + _ => { + let msg = if path.exists() { + "(log file is empty)" + } else if matches!(state, RunState::Pass { .. }) { + "(log pruned β€” passed run with KEEP_PASSED off)" + } else { + "(log file not found)" + }; + let p = Paragraph::new(Line::from(Span::styled( + msg, + Style::new().fg(Color::DarkGray), + ))); + return (p, 0); + } + }; + + let window = height.max(1) as usize; + let text = match raw.into_text() { + Ok(text) => text, + Err(_) => Text::from(raw), + }; + + // `scroll` is stored as "lines back from the tail" so 0 keeps live + // output pinned to the bottom. Ratatui's paragraph scroll is top-based, + // after wrapping, so convert the tail-relative value at render time. 
+ let total = wrapped_height(&text, width); + let max_scroll = total.saturating_sub(window); + let used_scroll = scroll.min(max_scroll); + let top_offset = max_scroll.saturating_sub(used_scroll); + let top_offset = u16::try_from(top_offset).unwrap_or(u16::MAX); + + ( + Paragraph::new(text) + .wrap(Wrap { trim: false }) + .scroll((top_offset, 0)), + used_scroll, + ) +} + +fn wrapped_height(text: &Text<'_>, width: u16) -> usize { + let width = usize::from(width.max(1)); + text.lines + .iter() + .map(|line| { + let rows = line.width().saturating_add(width.saturating_sub(1)) / width; + rows.max(1) + }) + .sum() +} + +fn run_row(id: usize, state: RunState, now: Instant) -> Row<'static> { + let label = format!("run-{:04}", id); + let (status_span, time_text) = match state { + RunState::Pending => ( + Span::styled("pending", Style::new().fg(Color::DarkGray)), + String::new(), + ), + RunState::Running { started_at } => { + let elapsed = now.saturating_duration_since(started_at).as_secs(); + ( + Span::styled("running", Style::new().fg(Color::Yellow)), + format!("{:>4}s", elapsed), + ) + } + RunState::Pass { duration_s } => ( + Span::styled( + "PASS", + Style::new().fg(Color::Green).add_modifier(Modifier::BOLD), + ), + format!("{:>4}s", duration_s), + ), + RunState::Fail { duration_s } => ( + Span::styled( + "FAIL", + Style::new().fg(Color::Red).add_modifier(Modifier::BOLD), + ), + format!("{:>4}s", duration_s), + ), + }; + Row::new(vec![ + Cell::from(label), + Cell::from(Line::from(status_span)), + Cell::from(time_text), + ]) +} + +fn status_short(state: RunState) -> &'static str { + match state { + RunState::Pending => "pending", + RunState::Running { .. } => "running", + RunState::Pass { .. } => "PASS", + RunState::Fail { .. 
} => "FAIL", + } +} + +fn footer( + counts: Counts, + active_pane: ActivePane, + manual: bool, + log_scroll: usize, + completed: bool, +) -> Paragraph<'static> { + let pane = match active_pane { + ActivePane::Runs => Span::styled(" pane:runs", Style::new().fg(Color::Cyan)), + ActivePane::Logs => Span::styled(" pane:logs", Style::new().fg(Color::Cyan)), + }; + let follow = if manual { + Span::styled(" manual", Style::new().fg(Color::Magenta)) + } else { + Span::styled(" auto", Style::new().fg(Color::DarkGray)) + }; + let scroll_hint = if log_scroll == 0 { + Span::styled(" tail", Style::new().fg(Color::DarkGray)) + } else { + Span::styled( + format!(" log:+{log_scroll}"), + Style::new().fg(Color::Magenta), + ) + }; + let done_hint = if completed { + Span::styled(" done q=exit", Style::new().fg(Color::Green)) + } else { + Span::raw("") + }; + let line = Line::from(vec![ + Span::styled( + "PASS ", + Style::new().fg(Color::Green).add_modifier(Modifier::BOLD), + ), + Span::raw(format!("{}", counts.passed)), + Span::raw(" "), + Span::styled( + "FAIL ", + Style::new().fg(Color::Red).add_modifier(Modifier::BOLD), + ), + Span::raw(format!("{}", counts.failed)), + Span::raw(" "), + Span::styled("run ", Style::new().fg(Color::Yellow)), + Span::raw(format!("{}", counts.running)), + Span::raw(" "), + Span::styled("pend ", Style::new().fg(Color::DarkGray)), + Span::raw(format!("{}", counts.pending)), + Span::raw(format!(" {}/{}", counts.done(), counts.total())), + pane, + follow, + scroll_hint, + done_hint, + ]); + Paragraph::new(line).block(Block::default().borders(Borders::ALL).title(" summary ")) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn wrapped_height_counts_empty_lines() { + let text = Text::from("one\n\nthree"); + + assert_eq!(wrapped_height(&text, 80), 3); + } + + #[test] + fn wrapped_height_counts_wrapped_rows() { + let text = Text::from("1234567890\nabc"); + + assert_eq!(wrapped_height(&text, 4), 4); + } +} diff --git 
a/tools/dkg-stress/src/worker.rs b/tools/dkg-stress/src/worker.rs new file mode 100644 index 00000000..7f88bab1 --- /dev/null +++ b/tools/dkg-stress/src/worker.rs @@ -0,0 +1,246 @@ +use anyhow::Result; +use std::collections::HashSet; +use std::fs; +use std::process::{Command, Stdio}; +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread::{self, JoinHandle}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +use crate::config::Config; +use crate::state::{App, RunState}; + +/// Set of process-group IDs (== PIDs since we put each child in its own +/// group) for in-flight run.sh invocations. The UI thread uses this on +/// shutdown to SIGTERM the whole tree per ceremony. +pub type Killers = Arc<Mutex<HashSet<u32>>>; + +pub fn spawn_workers( + config: Arc<Config>, + app: Arc<Mutex<App>>, + stop: Arc<AtomicBool>, + killers: Killers, +) -> Vec<JoinHandle<()>> { + let counter = Arc::new(AtomicU32::new(1)); + (0..config.workers) + .map(|_| { + let config = config.clone(); + let app = app.clone(); + let stop = stop.clone(); + let killers = killers.clone(); + let counter = counter.clone(); + thread::spawn(move || worker_loop(config, app, stop, killers, counter)) + }) + .collect() +} + +fn worker_loop( + config: Arc<Config>, + app: Arc<Mutex<App>>, + stop: Arc<AtomicBool>, + killers: Killers, + counter: Arc<AtomicU32>, +) { + loop { + if stop.load(Ordering::Relaxed) { + return; + } + let id = counter.fetch_add(1, Ordering::Relaxed); + if id > config.runs { + return; + } + if let Err(err) = run_one(id, &config, &app, &killers) { + // Worker errors (spawn failures, fs errors, etc.) are recorded as + // failures via the App update inside run_one's error path; this + // arm only fires when even that bookkeeping failed. Print to + // stderr so it shows up after the TUI is restored. 
+ eprintln!("[run-{:04}] worker error: {:#}", id, err); + } + } +} + +fn run_one(id: u32, config: &Config, app: &Mutex<App>, killers: &Killers) -> Result<()> { + let label = format!("run-{:04}", id); + let run_dir = config.work_dir.join(&label); + let _ = fs::remove_dir_all(&run_dir); + fs::create_dir_all(&run_dir)?; + + let log_path = run_dir.join("run.log"); + let log_file = fs::File::create(&log_path)?; + let log_clone = log_file.try_clone()?; + + let started = Instant::now(); + let started_unix = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + let started_iso = format_iso_utc(started_unix); + + set_state(app, id, RunState::Running { started_at: started }); + + let mut cmd = Command::new(&config.run_script); + cmd.env("WORK_DIR", &run_dir) + .env("CI", &config.worker_ci) + .stdout(Stdio::from(log_file)) + .stderr(Stdio::from(log_clone)) + .stdin(Stdio::null()); + + #[cfg(unix)] + { + use std::os::unix::process::CommandExt; + // Make the child a process-group leader so we can SIGTERM the whole + // tree (run.sh + its node children) with kill(-pgid, SIGTERM). 
+ cmd.process_group(0); + } + + let mut child = match cmd.spawn() { + Ok(c) => c, + Err(e) => { + set_state(app, id, RunState::Fail { duration_s: 0 }); + config.append_summary_line(&label, "fail", 0, &started_iso, &run_dir)?; + return Err(e.into()); + } + }; + + let pid = child.id(); + insert_killer(killers, pid); + + let wait_result = child.wait(); + remove_killer(killers, pid); + + let duration_s = started.elapsed().as_secs(); + let pass = wait_result.map(|s| s.success()).unwrap_or(false); + + let final_state = if pass { + RunState::Pass { duration_s } + } else { + RunState::Fail { duration_s } + }; + set_state(app, id, final_state); + + let status_str = if pass { "pass" } else { "fail" }; + config.append_summary_line(&label, status_str, duration_s, &started_iso, &run_dir)?; + + if pass && !config.keep_passed { + prune_node_dirs(&run_dir); + } + + Ok(()) +} + +fn set_state(app: &Mutex<App>, id: u32, state: RunState) { + if let Ok(mut a) = app.lock() { + let idx = (id as usize).saturating_sub(1); + if let Some(slot) = a.runs.get_mut(idx) { + *slot = state; + } + } +} + +fn insert_killer(killers: &Killers, pid: u32) { + if let Ok(mut k) = killers.lock() { + k.insert(pid); + } +} + +fn remove_killer(killers: &Killers, pid: u32) { + if let Ok(mut k) = killers.lock() { + k.remove(&pid); + } +} + +/// Drop node-*/ subdirectories of a passed run to keep disk usage bounded. +/// run.log and the cluster-lock outputs are kept for verification. +fn prune_node_dirs(run_dir: &std::path::Path) { + let Ok(entries) = fs::read_dir(run_dir) else { + return; + }; + for entry in entries.flatten() { + let name = entry.file_name(); + if name.to_string_lossy().starts_with("node-") { + let _ = fs::remove_dir_all(entry.path()); + } + } +} + +/// Send SIGTERM to every registered process group, then SIGKILL stragglers +/// after a short grace period. 
+pub fn kill_all(killers: &Killers, grace: Duration) { + let pids: Vec<u32> = killers.lock().map(|k| k.iter().copied().collect()).unwrap_or_default(); + if pids.is_empty() { + return; + } + for pid in &pids { + send_signal(*pid, libc::SIGTERM); + } + let deadline = Instant::now() + grace; + while Instant::now() < deadline { + let remaining = killers.lock().map(|k| k.len()).unwrap_or(0); + if remaining == 0 { + return; + } + thread::sleep(Duration::from_millis(100)); + } + let remaining: Vec<u32> = killers.lock().map(|k| k.iter().copied().collect()).unwrap_or_default(); + for pid in remaining { + send_signal(pid, libc::SIGKILL); + } +} + +#[cfg(unix)] +fn send_signal(pid: u32, sig: libc::c_int) { + // Negate the PID to address the whole process group. Each child was + // spawned with process_group(0), making it the group leader (so PID == + // PGID). Negative values to libc::kill mean "every process in this + // group". This is the kernel's standard mechanism for taking down a + // shell-launched subtree (run.sh + the four DKG nodes it forked). + // + // Cast safety: PIDs fit in i32 on every Unix we target. + let signed: i32 = pid.try_into().unwrap_or(0); + if signed > 0 { + // SAFETY: kill is a pure libc syscall with no aliasing or memory + // requirements; we pass a valid signal number. Out-of-range signed + // we already filtered above. Errors (ESRCH if the process is gone) + // are acceptable and ignored. + unsafe { + libc::kill(-signed, sig); + } + } +} + +#[cfg(not(unix))] +fn send_signal(_pid: u32, _sig: libc::c_int) { + // No-op on non-Unix; the tool only targets Unix anyway (run.sh is bash). +} + +fn format_iso_utc(unix_secs: u64) -> String { + // RFC3339 / ISO-8601 in UTC without external chrono dep. + // Range covers years 1970..9999 which is plenty for log timestamps. 
+ let secs = unix_secs as i64; + let days = secs.div_euclid(86_400); + let time = secs.rem_euclid(86_400); + let h = (time / 3600) as u32; + let m = ((time % 3600) / 60) as u32; + let s = (time % 60) as u32; + let (year, month, day) = days_to_ymd(days); + format!( + "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z", + year, month, day, h, m, s + ) +} + +/// Convert days since 1970-01-01 to (year, month, day) using the proleptic +/// Gregorian calendar (Howard Hinnant's algorithm). +fn days_to_ymd(days: i64) -> (i32, u32, u32) { + let z = days.saturating_add(719_468); + let era = if z >= 0 { z } else { z - 146_096 } / 146_097; + let doe = (z - era * 146_097) as u64; // [0, 146096] + let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146_096) / 365; // [0, 399] + let y = yoe as i64 + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // [0, 365] + let mp = (5 * doy + 2) / 153; // [0, 11] + let d = (doy - (153 * mp + 2) / 5 + 1) as u32; + let m = if mp < 10 { (mp + 3) as u32 } else { (mp - 9) as u32 }; + let year = if m <= 2 { y + 1 } else { y }; + (year as i32, m, d) +}