diff --git a/.github/workflows/dkg-runner.yml b/.github/workflows/dkg-runner.yml index e16b01b7..0d0fdd23 100644 --- a/.github/workflows/dkg-runner.yml +++ b/.github/workflows/dkg-runner.yml @@ -40,11 +40,14 @@ jobs: fail-fast: false matrix: include: - - name: 4 Charon nodes - id: 4-charon - pluto_nodes: 0 - charon_nodes: 4 - + - name: 2 Charon + 2 Pluto nodes + id: 2-charon-2-pluto + pluto_nodes: 2 + charon_nodes: 2 + - name: 4 Pluto nodes + id: 4-pluto + pluto_nodes: 4 + charon_nodes: 0 steps: - name: Checkout uses: actions/checkout@v6 diff --git a/Cargo.lock b/Cargo.lock index bd23ac86..4b5fce85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5639,6 +5639,7 @@ dependencies = [ "pluto-k1util", "pluto-p2p", "pluto-parsigex", + "pluto-peerinfo", "pluto-testutil", "pluto-tracing", "prost 0.14.3", diff --git a/Cargo.toml b/Cargo.toml index d7fb3251..64346f3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ members = [ "crates/peerinfo", "crates/frost", ] +exclude = ["tools/dkg-stress"] resolver = "3" [workspace.package] diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 8c0e1e61..f9ac0fdd 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -32,12 +32,31 @@ async fn run() -> std::result::Result<(), CliError> { let matches = cmd.get_matches(); let cli = Cli::from_arg_matches(&matches)?; - // Top level cancellation token for graceful shutdown on Ctrl+C + // Top level cancellation token for graceful shutdown on Ctrl+C / SIGTERM. let ct = CancellationToken::new(); tokio::spawn({ let ct = ct.clone(); async move { - let _ = tokio::signal::ctrl_c().await; + #[cfg(unix)] + { + use tokio::signal::unix::{SignalKind, signal}; + let mut sigterm = match signal(SignalKind::terminate()) { + Ok(s) => s, + Err(_) => { + let _ = tokio::signal::ctrl_c().await; + ct.cancel(); + return; + } + }; + tokio::select! 
{ + _ = tokio::signal::ctrl_c() => {} + _ = sigterm.recv() => {} + } + } + #[cfg(not(unix))] + { + let _ = tokio::signal::ctrl_c().await; + } ct.cancel(); } }); diff --git a/crates/cluster/src/definition.rs b/crates/cluster/src/definition.rs index 9f3c30ae..41e8d3ba 100644 --- a/crates/cluster/src/definition.rs +++ b/crates/cluster/src/definition.rs @@ -767,7 +767,7 @@ impl Definition { /// Returns true if the provided definition version supports partial /// deposits. - fn support_partial_deposits(version: &str) -> bool { + pub fn support_partial_deposits(version: &str) -> bool { !matches!( version, V1_0 | V1_1 | V1_2 | V1_3 | V1_4 | V1_5 | V1_6 | V1_7 diff --git a/crates/dkg/Cargo.toml b/crates/dkg/Cargo.toml index 284ff539..0cca0559 100644 --- a/crates/dkg/Cargo.toml +++ b/crates/dkg/Cargo.toml @@ -30,6 +30,7 @@ pluto-eth2api.workspace = true pluto-eth1wrap.workspace = true pluto-eth2util.workspace = true pluto-parsigex.workspace = true +pluto-peerinfo.workspace = true pluto-frost.workspace = true async-trait.workspace = true pluto-tracing.workspace = true diff --git a/crates/dkg/src/bcast/error.rs b/crates/dkg/src/bcast/error.rs index 92faf4f9..6e4f0740 100644 --- a/crates/dkg/src/bcast/error.rs +++ b/crates/dkg/src/bcast/error.rs @@ -155,6 +155,10 @@ pub enum Error { #[error("missing protobuf field: {0}")] MissingField(&'static str), + /// A typed broadcast message failed protocol-specific validation. + #[error("invalid message: {0}")] + InvalidMessage(&'static str), + /// Protobuf encoding failed. 
#[error("protobuf encode failed: {0}")] Encode(#[from] prost::EncodeError), diff --git a/crates/dkg/src/disk.rs b/crates/dkg/src/disk.rs index 93d943ad..ff6b7ed6 100644 --- a/crates/dkg/src/disk.rs +++ b/crates/dkg/src/disk.rs @@ -131,7 +131,7 @@ pub async fn load_definition( if conf.no_verify { warn!( error = %error, - "Ignoring failed cluster definition signatures verification due to --no-verify flag" + "Ignoring failed cluster definition signature verification due to --no-verify flag" ); } else { return Err(DiskError::ClusterDefinitionError(error)); diff --git a/crates/dkg/src/dkg.rs b/crates/dkg/src/dkg.rs index d9f9a5d2..5edc76a5 100644 --- a/crates/dkg/src/dkg.rs +++ b/crates/dkg/src/dkg.rs @@ -1,13 +1,17 @@ -use std::{num::TryFromIntError, path, time::Duration}; +use std::{collections::HashMap, ffi::OsStr, num::TryFromIntError, path, time::Duration}; use bon::Builder; -use libp2p::PeerId; +use futures::StreamExt; +use libp2p::{PeerId, swarm::SwarmEvent}; +use pluto_app::{privkeylock, utils::UtilsError}; +use pluto_core::version; +use tokio::select; use tokio_util::sync::CancellationToken; -use tracing::{info, warn}; +use tracing::{debug, error, info, warn}; -use crate::disk; pub use crate::{ aggregate::{AggregateError, agg_deposit_data, agg_lock_hash_sig, agg_validator_registrations}, + exchanger::{Exchanger, SIG_DEPOSIT_DATA, SIG_LOCK, SIG_VALIDATOR_REG}, publish::{PublishError, write_lock_to_api}, share::Share, signing::{SigningError, sign_deposit_msgs, sign_lock_hash, sign_validator_registrations}, @@ -16,19 +20,24 @@ pub use crate::{ set_registration_signature, }, }; +use crate::{disk, frost, frostp2p, nodesigs}; use pluto_cluster::{ - definition::{Definition, ValidatorAddresses}, + definition::{Definition, DefinitionError, ValidatorAddresses}, distvalidator::DistValidatorError, - lock::Lock, + lock::{Lock, LockError}, operator::Operator, + version::versions::*, }; use pluto_crypto::types::PrivateKey; use pluto_eth1wrap::{EthClient, EthClientError}; use 
pluto_eth2api::spec::phase0; +use pluto_eth2util as eth2util; use pluto_eth2util::keymanager::{self, KeymanagerError}; -use pluto_p2p::{config::P2PConfig, peer::Peer}; +use pluto_p2p::{ + behaviours::pluto::PlutoBehaviourEvent, bootnode::BootnodeError, config::P2PConfig, + k1::key_path, p2p::P2PError, peer::Peer, +}; use pluto_tracing::TracingConfig; -use std::collections::HashMap; use url::Url; const DEFAULT_DATA_DIR: &str = ".charon"; @@ -112,6 +121,102 @@ pub enum DkgError { /// Integer overflow. #[error("integer overflow")] IntegerOverflow, + + /// Test-only configuration is not allowed on mainnet. + #[error("cannot use test flags on mainnet")] + TestConfigOnMainnet, + + /// Failed to create private key lock service. + #[error("failed to create private key lock service: {0}")] + PrivKeyLock(#[from] privkeylock::PrivKeyLockError), + + /// Unsupported definition version. + #[error("only v1.6.0 and newer cluster definition versions supported, got: {version}")] + UnsupportedDefinitionVersion { + /// The unsupported version. + version: String, + }, + + /// Failed to convert fork version to network. + #[error("failed to convert fork version to network: {0}")] + ForkVersionToNetwork(#[from] eth2util::network::NetworkError), + + /// Failed to load private key. + #[error("failed to load private key: {0}")] + KeyLoadError(#[from] pluto_p2p::k1::K1Error), + + /// Peer error. + #[error("peer error: {0}")] + PeerError(#[from] pluto_p2p::peer::PeerError), + + /// The local P2P key did not match the definition peer set. + #[error("private key not matching definition file: peer not in definition: {peer_id}")] + LocalPeerNotInDefinition { + /// Local peer ID derived from the P2P private key. + peer_id: PeerId, + }, + + /// Definition error. + #[error("definition error: {0}")] + Definition(#[from] DefinitionError), + + /// Bootnode or relay resolution error. + #[error("bootnode error: {0}")] + Bootnode(#[from] BootnodeError), + + /// Sync protocol error. 
+ #[error("sync error: {0}")] + Sync(#[from] crate::sync::Error), + + /// P2P node setup error. + #[error("p2p error: {0}")] + P2P(#[from] P2PError), + + /// FROST DKG setup or execution failed. + #[error("frost error: {0}")] + Frost(#[from] frost::FrostError), + + /// DKG signing or aggregation failed. + #[error("dkg signing error: {0}")] + Signing(#[from] SigningError), + + /// K1 node-signature exchange failed. + #[error("k1 lock hash signature exchange: {0}")] + NodeSignatures(#[from] nodesigs::Error), + + /// Cluster lock verification failed. + #[error("invalid lock file signatures: {0}")] + LockVerification(#[source] LockError), + + /// Deposit-data file write failed. + #[error("deposit data error: {0}")] + Deposit(#[from] pluto_eth2util::deposit::DepositError), + + /// Output archive creation failed. + #[error("bundle output: {0}")] + BundleOutput(#[from] UtilsError), + + /// Background task failed. + #[error("background task failed: {0}")] + Join(#[from] tokio::task::JoinError), + + /// The configured deposit data does not match deposit amounts. + #[error( + "deposit data length does not match deposit amounts length: deposit_data={deposit_data}, deposit_amounts={deposit_amounts}" + )] + DepositDataLengthMismatch { + /// Deposit-data set count. + deposit_data: usize, + /// Deposit amount count. + deposit_amounts: usize, + }, + + /// The configured DKG algorithm is not supported. + #[error("unsupported dkg algorithm: {algorithm}")] + UnsupportedDkgAlgorithm { + /// Algorithm name from the cluster definition. + algorithm: String, + }, } /// Keymanager configuration accepted by the entrypoint. @@ -186,6 +291,9 @@ pub struct Config { #[builder(default)] pub execution_engine_addr: String, + /// Append configuration. + pub append_config: Option, + /// Whether to bundle the output directory as a tarball. #[builder(default)] pub zipped: bool, @@ -198,9 +306,7 @@ pub struct Config { impl Config { /// Returns `true` if any test-only configuration is active. 
pub fn has_test_config(&self) -> bool { - // TODO: Extend this when more test-only hooks are added to TestConfig, - // so preflight skips stay aligned with the full test configuration. - self.test_config.def.is_some() + self.test_config.def.is_some() || self.test_config.p2p_key.is_some() } } @@ -209,6 +315,9 @@ impl Config { pub struct TestConfig { /// Provides the cluster definition explicitly, skips loading from disk. pub def: Option, + + /// Provides the P2P private key explicitly, skips loading from disk. + pub p2p_key: Option, } /// Configuration used to merge the outcome of two DKG ceremonies. @@ -259,25 +368,615 @@ fn default_tracing_config() -> TracingConfig { .build() } -/// Runs the DKG entrypoint until the unported backend boundary. -pub async fn run(conf: Config, shutdown: CancellationToken) -> Result<(), DkgError> { - if shutdown.is_cancelled() { +/// Runs the DKG entrypoint. +pub async fn run(conf: Config, ct: CancellationToken) -> Result<(), DkgError> { + if ct.is_cancelled() { return Err(DkgError::ShutdownRequestedBeforeStartup); } + let (lock_ct, lock_task) = start_private_key_lock(&conf).await?; + let result = run_inner(conf, ct).await; + + lock_ct.cancel(); + lock_task + .await + .unwrap_or_else(|err| error!(?err, "Error joining private key lock task")); + + result +} + +async fn start_private_key_lock( + conf: &Config, +) -> Result<(CancellationToken, tokio::task::JoinHandle<()>), DkgError> { + let lock_svc = std::sync::Arc::new( + privkeylock::Service::new(private_key_lock_path(&conf.data_dir), "charon dkg").await?, + ); + let lock_ct = CancellationToken::new(); + let task_ct = lock_ct.clone(); + let task = tokio::spawn(async move { + let run_svc = lock_svc.clone(); + let mut run_task = tokio::spawn(async move { run_svc.run().await }); + + select! 
{ + _ = task_ct.cancelled() => { + lock_svc.close().await; + log_private_key_lock_result(run_task.await); + } + result = &mut run_task => log_private_key_lock_result(result), + } + }); + + Ok((lock_ct, task)) +} + +fn log_private_key_lock_result( + result: std::result::Result< + std::result::Result<(), privkeylock::PrivKeyLockError>, + tokio::task::JoinError, + >, +) { + match result { + Ok(Ok(())) => {} + Ok(Err(err)) => error!(?err, "Error locking private key file"), + Err(err) => error!(?err, "Error locking private key file"), + } +} + +fn private_key_lock_path(data_dir: &path::Path) -> path::PathBuf { + let mut lock_path = key_path(data_dir); + let file_name = lock_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or("charon-enr-private-key"); + lock_path.set_file_name(format!("{file_name}.lock")); + lock_path +} + +async fn run_inner(conf: Config, ct: CancellationToken) -> Result<(), DkgError> { + if let Some(append) = &conf.append_config { + append.validate()?; + } + + version::log_info("Charon DKG starting"); + let eth1 = EthClient::new(&conf.execution_engine_addr).await?; - let _definition = disk::load_definition(&conf, ð1).await?; + let ( + def, + total_validators, + new_validators, + new_withdrawal_addresses, + new_fee_recipient_addresses, + ) = if let Some(append) = &conf.append_config { + let def = append.cluster_lock.definition.clone(); + let new_validators = u64::try_from(append.add_validators)?; + let total_validators = def + .num_validators + .checked_add(new_validators) + .ok_or(DkgError::IntegerOverflow)?; + let new_withdrawal_addresses = append + .validator_addresses + .iter() + .map(|addr| addr.withdrawal_address.clone()) + .collect::>(); + let new_fee_recipient_addresses = append + .validator_addresses + .iter() + .map(|addr| addr.fee_recipient_address.clone()) + .collect::>(); + + ( + def, + total_validators, + new_validators, + new_withdrawal_addresses, + new_fee_recipient_addresses, + ) + } else { + let def = 
disk::load_definition(&conf, ð1).await?; + + let total_validators = def.num_validators; + let new_validators = def.num_validators; + let new_withdrawal_addresses = def.withdrawal_addresses(); + let new_fee_recipient_addresses = def.fee_recipient_addresses(); + + ( + def, + total_validators, + new_validators, + new_withdrawal_addresses, + new_fee_recipient_addresses, + ) + }; + + // This DKG only supports a few specific config versions. + if !matches!(def.version.as_str(), V1_6 | V1_7 | V1_8 | V1_9 | V1_10) { + return Err(DkgError::UnsupportedDefinitionVersion { + version: def.version.clone(), + }); + } validate_keymanager_flags(&conf)?; + + // Check if keymanager address is reachable. verify_keymanager_connection(&conf).await?; if !conf.has_test_config() { disk::check_clear_data_dir(&conf.data_dir).await?; } + disk::check_writes(&conf.data_dir).await?; - unimplemented!("DKG ceremony backend is not implemented yet"); + let network = eth2util::network::fork_version_to_network(&def.fork_version)?; + if network == eth2util::network::MAINNET.name && conf.has_test_config() { + return Err(DkgError::TestConfigOnMainnet); + } + + let peers = def.peers()?; + + let def_hash = pluto_cluster::helpers::to_0x_hex(&def.definition_hash); + + let key = if let Some(key) = conf.test_config.p2p_key.clone() { + key + } else { + pluto_p2p::k1::load_priv_key(&conf.data_dir)? 
+ }; + + let peer_id = pluto_p2p::peer::peer_id_from_key(key.public_key())?; + + info!("Starting local P2P networking peer"); + + log_peer_summary(peer_id, &peers, &def.operators); + + let sig_types = vec![SIG_LOCK, SIG_DEPOSIT_DATA, SIG_VALIDATOR_REG]; + let sig_type_set = std::sync::Arc::new(sig_types.iter().copied().collect()); + let num_validators = u32::try_from(new_validators)?; + let (node, mut handlers) = crate::node::setup_p2p( + key.clone(), + &conf, + &peers, + def.definition_hash.clone(), + sig_type_set, + num_validators, + ct.child_token(), + ) + .await?; + + let node_idx = def + .node_idx(node.local_peer_id()) + .map_err(|source| match source { + DefinitionError::PeerNotFound { peer_id } => { + DkgError::LocalPeerNotInDefinition { peer_id } + } + other => DkgError::Definition(other), + })?; + + let peer_ids = def.peer_ids()?; + let exchanger = Exchanger::new( + ct.child_token(), + handlers.parsigex.clone(), + peer_ids, + sig_types, + ) + .await; + + let peer_share_indices = peers + .iter() + .map(|peer| Ok((peer.id, u32::try_from(peer.share_idx())?))) + .collect::, DkgError>>()?; + let local_share_idx = u32::try_from(node_idx.share_idx)?; + let threshold = usize::try_from(def.threshold)?; + let mut frost_transport = frostp2p::new_frost_p2p( + handlers.bcast.clone(), + &mut handlers.frost_p2p, + &peer_share_indices, + local_share_idx, + threshold, + num_validators as usize, + ) + .await?; + let node_sig_caster = nodesigs::NodeSigBcast::new( + peers.clone(), + node_idx.peer_idx, + handlers.bcast.clone(), + ct.child_token(), + ) + .await?; + + let sync_clients = handlers.sync.clone(); + let sync_server = handlers.sync_server.clone(); + let frost_handle = handlers.frost_p2p; + let network_ct = ct.child_token(); + let network_task = tokio::spawn(drive_dkg_network(node, frost_handle, network_ct.clone())); + + let result = run_ceremony( + &conf, + ð1, + ct.child_token(), + def, + total_validators, + new_validators, + new_withdrawal_addresses, + 
new_fee_recipient_addresses, + network, + def_hash, + key, + node_idx, + peers, + exchanger, + &mut frost_transport, + node_sig_caster, + sync_server, + sync_clients, + ) + .await; + + network_ct.cancel(); + let _ = network_task.await; + + result +} + +#[allow(clippy::too_many_arguments, reason = "mirrors the Go DKG run flow")] +async fn run_ceremony( + conf: &Config, + eth1: &EthClient, + ct: CancellationToken, + def: Definition, + total_validators: u64, + new_validators: u64, + new_withdrawal_addresses: Vec, + new_fee_recipient_addresses: Vec, + network: String, + def_hash: String, + key: k256::SecretKey, + node_idx: pluto_cluster::definition::NodeIdx, + peers: Vec, + exchanger: Exchanger, + frost_transport: &mut T, + node_sig_caster: nodesigs::NodeSigBcast, + sync_server: crate::sync::Server, + sync_clients: Vec, +) -> Result<(), DkgError> { + info!("Waiting to connect to all peers..."); + + let mut sync_runtime = start_sync_protocol(sync_server, sync_clients, ct.child_token()).await?; + + info!("All peers connected, starting DKG ceremony"); + + let num_validators = u32::try_from(new_validators)?; + let threshold = u32::try_from(def.threshold)?; + let share_idx = u32::try_from(node_idx.share_idx)?; + + let shares = match def.dkg_algorithm.as_str() { + "default" | "frost" => { + let num_nodes = u32::try_from(peers.len())?; + frost::run_frost_parallel( + ct.child_token(), + frost_transport, + num_validators, + num_nodes, + threshold, + share_idx, + &def_hash, + ) + .await? + } + algorithm => { + return Err(DkgError::UnsupportedDkgAlgorithm { + algorithm: algorithm.to_string(), + }); + } + }; + + // DKG was step 1, advance to step 2. + sync_runtime.next_step().await?; + + let append_config = conf.append_config.as_ref(); + let existing_shares = if append_config.is_some_and(|append| !append.unverified) { + get_existing_shares(append_config)? 
+ } else { + Vec::new() + }; + + if append_config.is_some() { + debug!( + total = total_validators, + added = new_validators, + "Validator keys summary" + ); + } + + let deposit_amounts = deposit_amounts_for_definition(&def); + if let Some(append) = append_config + && !append.deposit_data.is_empty() + && append.deposit_data.len() != deposit_amounts.len() + { + return Err(DkgError::DepositDataLengthMismatch { + deposit_data: append.deposit_data.len(), + deposit_amounts: deposit_amounts.len(), + }); + } + + let mut deposit_datas = crate::signing::sign_and_agg_deposit_data( + &exchanger, + &shares, + &new_withdrawal_addresses, + &network, + &node_idx, + &deposit_amounts, + def.compounding, + ) + .await?; + + // Deposit data was step 2, advance to step 3. + sync_runtime.next_step().await?; + + let val_regs = crate::signing::sign_and_agg_validator_registrations( + &exchanger, + &shares, + &new_fee_recipient_addresses, + def.target_gas_limit, + &node_idx, + &def.fork_version, + ) + .await?; + + // Pre-regs was step 3, advance to step 4. + sync_runtime.next_step().await?; + + let mut lock = crate::signing::sign_and_aggregate_lock_hash( + &existing_shares, + &shares, + def, + &node_idx, + &exchanger, + deposit_datas.clone(), + val_regs, + append_config, + ) + .await?; + + // Lock hash aggregate was step 4, advance to step 5. + sync_runtime.next_step().await?; + + lock.node_signatures = node_sig_caster + .exchange(Some(&key), &lock.lock_hash, ct.child_token()) + .await?; + + if !pluto_cluster::version::support_node_signatures(&lock.version) { + lock.node_signatures.clear(); + } + + // Node signatures was step 5, advance to step 6. 
+ sync_runtime.next_step().await?; + + if !conf.no_verify && append_config.is_none_or(|append| !append.unverified) { + lock.verify_signatures(eth1) + .await + .map_err(DkgError::LockVerification)?; + } + + if conf.keymanager.address.is_empty() { + let all_shares = existing_shares + .iter() + .chain(shares.iter()) + .cloned() + .collect::>(); + disk::write_keys_to_disk(conf, &all_shares, false).await?; + debug!(total = all_shares.len(), "Saved keyshares to disk"); + } else { + disk::write_to_keymanager( + &conf.keymanager.address, + &conf.keymanager.auth_token, + &shares, + ) + .await?; + debug!( + keymanager_address = conf.keymanager.address, + total = shares.len(), + "Imported keyshares to keymanager" + ); + } + + let mut dashboard_url = None; + if conf.publish.enabled { + match write_lock_to_api(&conf.publish.address, &lock, conf.publish.timeout).await { + Ok(url) => dashboard_url = Some(url), + Err(error) => warn!(%error, "Couldn't publish lock file to Obol API"), + } + } + + disk::write_lock(&conf.data_dir, &lock).await?; + debug!("Saved lock file to disk"); + + if let Some(append) = append_config + && !append.deposit_data.is_empty() + { + deposit_datas = pluto_eth2util::deposit::merge_deposit_data_sets( + deposit_datas, + append.deposit_data.clone(), + ); + debug!( + amounts = deposit_datas.len(), + validators = deposit_datas.first().map_or(0, Vec::len), + "Merged deposit data files" + ); + } + + for deposit_data in &deposit_datas { + pluto_eth2util::deposit::write_deposit_data_file(deposit_data, &network, &conf.data_dir) + .await?; + debug!("Saved deposit data file(s) to disk"); + } + + // Signature verification and disk key write was step 6, advance to step 7. 
+ sync_runtime.next_step().await?; + + sync_runtime.shutdown().await?; + + if conf.zipped { + let data_dir = conf.data_dir.clone(); + tokio::task::spawn_blocking(move || { + pluto_app::utils::bundle_output(data_dir, "dkg.tar.gz") + }) + .await??; + } + + debug!( + seconds = conf.shutdown_delay.as_secs(), + "Graceful shutdown delay" + ); + tokio::time::sleep(conf.shutdown_delay).await; + + info!("Successfully completed DKG ceremony πŸŽ‰"); + if let Some(url) = dashboard_url { + info!("You can find your newly-created cluster dashboard here: {url}"); + } + + Ok(()) +} + +fn deposit_amounts_for_definition(def: &Definition) -> Vec { + if def.deposit_amounts.is_empty() { + if pluto_cluster::definition::Definition::support_partial_deposits(&def.version) { + pluto_eth2util::deposit::default_deposit_amounts(def.compounding) + } else { + vec![pluto_eth2util::deposit::DEFAULT_DEPOSIT_AMOUNT] + } + } else { + pluto_eth2util::deposit::dedup_amounts(&def.deposit_amounts) + } +} + +struct SyncRuntime { + server: crate::sync::Server, + clients: Vec, + step: i64, + cancellation: CancellationToken, + tasks: Vec>, +} + +impl SyncRuntime { + async fn next_step(&mut self) -> Result<(), DkgError> { + self.step = self.step.checked_add(1).ok_or(DkgError::IntegerOverflow)?; + for client in &self.clients { + client.set_step(self.step); + } + + debug!(step = self.step, "Waiting for peers to start next step"); + self.server + .await_all_at_step(self.step, self.cancellation.child_token()) + .await?; + + Ok(()) + } + + async fn shutdown(mut self) -> Result<(), DkgError> { + for client in &self.clients { + client.shutdown(self.cancellation.child_token()).await?; + } + + self.server + .await_all_shutdown(self.cancellation.child_token()) + .await?; + self.cancellation.cancel(); + + for task in self.tasks.drain(..) 
{ + let _ = task.await; + } + + Ok(()) + } +} + +impl Drop for SyncRuntime { + fn drop(&mut self) { + self.cancellation.cancel(); + } +} + +async fn start_sync_protocol( + server: crate::sync::Server, + clients: Vec, + cancellation: CancellationToken, +) -> Result { + server.start(); + + let mut tasks = Vec::with_capacity(clients.len()); + for client in &clients { + let client = client.clone(); + let client_ct = cancellation.child_token(); + let cancel_on_error = cancellation.clone(); + tasks.push(tokio::spawn(async move { + if let Err(error) = client.run(client_ct).await + && !matches!(error, crate::sync::Error::Canceled) + { + error!(%error, "Sync failed to peer"); + cancel_on_error.cancel(); + } + })); + } + + let mut ticker = tokio::time::interval(Duration::from_millis(250)); + loop { + if let Some(error) = server.err().await { + return Err(DkgError::Sync(error)); + } + + let connected_count = clients + .iter() + .filter(|client| client.is_connected()) + .count(); + if connected_count == clients.len() { + break; + } + + tokio::select! { + _ = cancellation.cancelled() => return Err(crate::sync::Error::Canceled.into()), + _ = ticker.tick() => {} + } + } + + for client in &clients { + client.disable_reconnect(); + } + + server + .await_all_connected(cancellation.child_token()) + .await?; + + let mut runtime = SyncRuntime { + server, + clients, + step: 0, + cancellation, + tasks, + }; + runtime.next_step().await?; + + Ok(runtime) +} + +async fn drive_dkg_network( + mut node: pluto_p2p::p2p::Node, + frost_handle: frostp2p::FrostP2PHandle, + cancellation: CancellationToken, +) { + loop { + tokio::select! 
{ + _ = cancellation.cancelled() => break, + event = node.select_next_some() => { + if let SwarmEvent::Behaviour(PlutoBehaviourEvent::Inner( + crate::node::DkgBehaviourEvent::Bcast(event), + )) = event + && let Err(error) = frost_handle.handle_bcast_event(event) + { + debug!(%error, "Failed to forward bcast event to FROST transport"); + } + } + } + } } fn validate_keymanager_flags(conf: &Config) -> Result<(), DkgError> { @@ -475,10 +1174,12 @@ mod tests { #[tokio::test] async fn run_rejects_mismatched_keymanager_flags() { + let tempdir = tempfile::tempdir().expect("tempdir"); let (lock, ..) = pluto_cluster::test_cluster::new_for_test(1, 3, 4, 0); let err = run( Config::builder() + .data_dir(tempdir.path().to_path_buf()) .test_config(TestConfig::builder().def(lock.definition.clone()).build()) .keymanager( KeymanagerConfig::builder() @@ -541,36 +1242,32 @@ mod tests { } #[tokio::test] - async fn run_executes_preflight_before_reaching_backend_boundary() { + async fn run_reaches_p2p_key_verification_after_preflight() { let tempdir = tempfile::tempdir().expect("tempdir"); - let definition_path = tempdir.path().join("cluster-definition.json"); - let private_key_path = tempdir.path().join("charon-enr-private-key"); + let (lock, ..) = pluto_cluster::test_cluster::new_for_test(1, 3, 4, 1); + let mismatched_key = pluto_testutil::random::generate_insecure_k1_key(99); - tokio::fs::write(&private_key_path, b"dummy") - .await - .expect("private key"); - - let (lock, ..) 
= pluto_cluster::test_cluster::new_for_test(1, 3, 4, 0); - let definition = serde_json::to_string(&lock.definition).expect("definition json"); - tokio::fs::write(&definition_path, definition) - .await - .expect("definition file"); - - let join_err = tokio::spawn(async move { - run( - Config::builder() - .data_dir(tempdir.path().to_path_buf()) - .def_file(definition_path.to_string_lossy().into_owned()) - .no_verify(true) - .build(), - CancellationToken::new(), - ) - .await - }) + let err = run( + Config::builder() + .data_dir(tempdir.path().to_path_buf()) + .p2p(P2PConfig::default()) + .shutdown_delay(Duration::ZERO) + .test_config( + TestConfig::builder() + .def(lock.definition.clone()) + .p2p_key(mismatched_key) + .build(), + ) + .build(), + CancellationToken::new(), + ) .await - .expect_err("backend handoff should panic until implemented"); + .expect_err("mismatched P2P key should fail before networking"); - assert!(join_err.is_panic()); + assert!(matches!( + err, + DkgError::PeerError(pluto_p2p::peer::PeerError::UnknownPublicKey) + )); } #[tokio::test] diff --git a/crates/dkg/src/frost.rs b/crates/dkg/src/frost.rs index a27ca2e8..cafd6645 100644 --- a/crates/dkg/src/frost.rs +++ b/crates/dkg/src/frost.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] - use std::collections::{BTreeMap, HashMap}; use async_trait::async_trait; @@ -59,7 +57,7 @@ pub(crate) trait FTransport: Send + Sync { /// FROST DKG orchestration errors. #[derive(Debug, thiserror::Error)] -pub(crate) enum FrostError { +pub enum FrostError { /// Failed to construct a participant. 
#[error("new participant: {0}")] NewParticipant(#[source] pluto_frost::kryptology::KryptologyError), @@ -290,38 +288,26 @@ pub(crate) async fn run_frost_parallel( share_idx: u32, dkg_ctx: &str, ) -> Result, FrostError> { - debug!( - num_validators, - num_nodes, threshold, share_idx, "Starting FROST DKG" - ); let mut validators = new_frost_participants(num_validators, num_nodes, threshold, share_idx, dkg_ctx)?; let (cast_r1, p2p_r1) = round1(&mut validators)?; - debug!( - bcasts = cast_r1.len(), - p2p = p2p_r1.len(), - "Completed local FROST DKG round 1" - ); + + debug!("Sending round 1 messages"); + let (cast_r1_result, p2p_r1_result) = tp.round1(&cancellation, cast_r1, p2p_r1).await?; - debug!( - bcasts = cast_r1_result.len(), - p2p = p2p_r1_result.len(), - "Completed FROST DKG round 1 transport" - ); + + debug!("Received round 1 results"); let cast_r2 = round2(&mut validators, &cast_r1_result, &p2p_r1_result)?; - debug!(bcasts = cast_r2.len(), "Completed local FROST DKG round 2"); + + debug!("Sending round 2 messages"); + let cast_r2_result = tp.round2(&cancellation, cast_r2).await?; - debug!( - bcasts = cast_r2_result.len(), - "Completed FROST DKG round 2 transport" - ); - let shares = make_shares(&validators, &cast_r2_result)?; - debug!(shares = shares.len(), "Completed FROST DKG"); + debug!("Received round 2 results"); - Ok(shares) + make_shares(&validators, &cast_r2_result) } /// Returns multiple frost DKG participants (one for each parallel validator). diff --git a/crates/dkg/src/frostp2p.rs b/crates/dkg/src/frostp2p.rs index 6b094c22..50177582 100644 --- a/crates/dkg/src/frostp2p.rs +++ b/crates/dkg/src/frostp2p.rs @@ -103,8 +103,6 @@ //! intentionally not reset; create a fresh [`FrostP2PBehaviour`], //! [`FrostP2PHandle`], and [`FrostP2P`] for each DKG. 
-#![allow(dead_code)] - use std::{ collections::{HashMap, HashSet, VecDeque}, sync::{Arc, Mutex}, @@ -167,7 +165,7 @@ pub(crate) const SEND_TIMEOUT: Duration = Duration::from_secs(7); /// FROST direct-P2P delivery errors. #[derive(Debug, thiserror::Error)] -pub(crate) enum FrostP2PError { +pub enum FrostP2PError { /// The behaviour task is no longer running. #[error("frost p2p behaviour is no longer running")] BehaviourClosed, @@ -200,6 +198,7 @@ pub(crate) enum OutEvent { /// Event emitted while the FROST P2P transport progresses through its rounds. #[derive(Debug)] +#[allow(dead_code)] pub(crate) enum FrostP2PEvent { /// A FROST transport round started. RoundStarted { diff --git a/crates/dkg/src/lib.rs b/crates/dkg/src/lib.rs index dff76aec..72e07923 100644 --- a/crates/dkg/src/lib.rs +++ b/crates/dkg/src/lib.rs @@ -49,3 +49,6 @@ mod signing; /// Registration conversion and distributed-validator assembly helpers. mod validators; + +/// P2P node setup. +mod node; diff --git a/crates/dkg/src/node.rs b/crates/dkg/src/node.rs new file mode 100644 index 00000000..85b7102f --- /dev/null +++ b/crates/dkg/src/node.rs @@ -0,0 +1,211 @@ +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; + +use crate::{ + bcast, + dkg::{Config, DkgError}, + exchanger::{SIG_DEPOSIT_DATA, SigType}, + frostp2p, sync, +}; +use libp2p::{Multiaddr, multiaddr::Protocol, relay, swarm::NetworkBehaviour}; +use pluto_core::{ + types::{Duty, DutyType}, + version, +}; +use pluto_p2p::{ + bootnode, gater, + p2p::{Node, NodeType}, + p2p_context::P2PContext, + peer::{Peer, peer_id_from_key, verify_p2p_key}, + relay::{MutableRelayReservation, RelayRouter}, +}; +use pluto_parsigex as parsigex; +use pluto_peerinfo::{self as peerinfo, LocalPeerInfo}; +use tokio_util::sync::CancellationToken; + +#[derive(NetworkBehaviour)] +pub(crate) struct DkgBehaviour { + pub(crate) relay: relay::client::Behaviour, + pub(crate) relay_reservation: MutableRelayReservation, + pub(crate) relay_router: RelayRouter, + 
pub(crate) bcast: bcast::Behaviour, + pub(crate) sync: sync::Behaviour, + pub(crate) parsigex: parsigex::Behaviour, + pub(crate) peerinfo: peerinfo::Behaviour, + pub(crate) frost_p2p: frostp2p::FrostP2PBehaviour, +} + +type Result = std::result::Result; + +pub(crate) struct Handlers { + pub(crate) bcast: bcast::Component, + pub(crate) sync: Vec, + pub(crate) sync_server: sync::Server, + pub(crate) parsigex: parsigex::Handle, + pub(crate) frost_p2p: frostp2p::FrostP2PHandle, +} + +pub(crate) async fn setup_p2p( + key: k256::SecretKey, + conf: &Config, + peers: &[Peer], + def_hash: Vec, + sig_types: Arc>, + num_validators: u32, + ct: CancellationToken, +) -> Result<(Node, Handlers)> { + let peer_ids = peers.iter().map(|peer| peer.id).collect::>(); + let local_peer_id = peer_id_from_key(key.public_key())?; + + verify_p2p_key(peers, &key)?; + + let relay_addrs = relay_addrs_for_resolution(&conf.p2p.relays); + let relays = bootnode::new_relays(ct, &relay_addrs, &hex::encode(&def_hash)).await?; + + let conn_gater = gater::ConnGater::new_conn_gater(peer_ids.clone(), relays.clone()); + + let p2p_context = P2PContext::new(peer_ids.clone()); + p2p_context.set_local_peer_id(local_peer_id); + + let relay_reservation = MutableRelayReservation::new(relays.clone()); + let relay_router = RelayRouter::new(relays, p2p_context.clone(), local_peer_id); + + let (bcast_comp, bcast_comp_handle) = + bcast::Behaviour::new(peer_ids.clone(), p2p_context.clone(), key.clone()); + let (sync_comp, sync_server, sync_clients) = sync::new( + peer_ids.clone(), + p2p_context.clone(), + &key, + def_hash.clone(), + version::VERSION.to_minor(), + )?; + + let parsigex_config = parsigex::Config::new( + local_peer_id, + p2p_context.clone(), + Arc::new(|_duty, _pk, _sig| Box::pin(async { Ok(()) })), + Arc::new(move |duty: &Duty| { + if duty.duty_type != DutyType::Signature { + return false; + } + + if sig_types.contains(&SIG_DEPOSIT_DATA) && duty.slot.inner() >= SIG_DEPOSIT_DATA { + return true; + } + + 
sig_types.contains(&duty.slot.inner()) + }), + ); + let (parsigex_comp, parsigex_handle) = parsigex::Behaviour::new(parsigex_config); + + let (git_hash, _) = version::git_commit(); + let peerinfo_config = peerinfo::Config::new(LocalPeerInfo::new( + version::VERSION.to_string(), + def_hash.clone(), + git_hash, + false, + "", + )) + .with_peers(peer_ids.clone()); + let peerinfo_comp = peerinfo::Behaviour::new(local_peer_id, peerinfo_config); + + let mut share_idx_by_peer = HashMap::new(); + let mut local_share_idx = None; + for peer in peers { + let share_idx = u32::try_from(peer.share_idx())?; + share_idx_by_peer.insert(peer.id, share_idx); + if peer.id == local_peer_id { + local_share_idx = Some(share_idx); + } + } + let local_share_idx = local_share_idx.ok_or(DkgError::LocalPeerNotInDefinition { + peer_id: local_peer_id, + })?; + + let (frost_p2p_comp, frost_p2p_handle) = frostp2p::FrostP2PBehaviour::new( + p2p_context.clone(), + peer_ids.clone(), + share_idx_by_peer, + local_share_idx, + num_validators as usize, + ); + + let node = Node::new( + conf.p2p.clone(), + key, + NodeType::TCP, + false, + p2p_context, + |builder, _, relay_client| { + builder.with_gater(conn_gater).with_inner(DkgBehaviour { + relay: relay_client, + relay_reservation, + relay_router, + bcast: bcast_comp, + sync: sync_comp, + parsigex: parsigex_comp, + peerinfo: peerinfo_comp, + frost_p2p: frost_p2p_comp, + }) + }, + )?; + + let handlers = Handlers { + bcast: bcast_comp_handle, + sync: sync_clients, + sync_server, + parsigex: parsigex_handle, + frost_p2p: frost_p2p_handle, + }; + + Ok((node, handlers)) +} + +fn relay_addrs_for_resolution(relays: &[Multiaddr]) -> Vec { + relays.iter().map(relay_addr_for_resolution).collect() +} + +fn relay_addr_for_resolution(relay: &Multiaddr) -> String { + let mut scheme = None; + let mut host = None; + let mut port = None; + + for protocol in relay.iter() { + match protocol { + Protocol::Http => scheme = Some("http"), + Protocol::Https => scheme = 
Some("https"), + Protocol::Dns(name) + | Protocol::Dns4(name) + | Protocol::Dns6(name) + | Protocol::Dnsaddr(name) + if host.is_none() => + { + host = Some(name.to_string()); + } + Protocol::Ip4(ip) if host.is_none() => { + host = Some(ip.to_string()); + } + Protocol::Ip6(ip) if host.is_none() => { + host = Some(format!("[{ip}]")); + } + Protocol::Tcp(tcp_port) => port = Some(tcp_port), + _ => {} + } + } + + if let (Some(scheme), Some(host)) = (scheme, host) { + let default_port = match scheme { + "https" => 443, + _ => 80, + }; + + return match port { + Some(port) if port != default_port => format!("{scheme}://{host}:{port}"), + _ => format!("{scheme}://{host}"), + }; + } + + relay.to_string() +} diff --git a/crates/dkg/src/publish.rs b/crates/dkg/src/publish.rs index c4093e2b..32417af2 100644 --- a/crates/dkg/src/publish.rs +++ b/crates/dkg/src/publish.rs @@ -27,7 +27,7 @@ pub async fn write_lock_to_api( )?; client.publish_lock(lock.clone()).await?; - debug!(addr = publish_addr, "Published lock file"); + debug!(addr = publish_addr, "Published lock file to api"); Ok(client.launchpad_url_for_lock(lock)?) 
} diff --git a/crates/dkg/src/signing.rs b/crates/dkg/src/signing.rs index 4d25ec02..1beaa5e7 100644 --- a/crates/dkg/src/signing.rs +++ b/crates/dkg/src/signing.rs @@ -262,8 +262,8 @@ pub(crate) async fn sign_and_agg_validator_registrations( ) -> Result> { let effective_gas_limit = if gas_limit == 0 { warn!( - default = registration::DEFAULT_GAS_LIMIT, - "gas_limit not set, using default" + default_gas_limit = registration::DEFAULT_GAS_LIMIT, + "custom target gas limit not supported, setting to default" ); registration::DEFAULT_GAS_LIMIT } else { diff --git a/crates/dkg/src/sync/handler.rs b/crates/dkg/src/sync/handler.rs index 2989c61d..0807b5db 100644 --- a/crates/dkg/src/sync/handler.rs +++ b/crates/dkg/src/sync/handler.rs @@ -223,7 +223,7 @@ impl ConnectionHandler for Handler { self.inbound = None; } Poll::Ready(Err(error)) => { - warn!(peer = %self.peer_id, err = %error, "Error serving inbound sync stream"); + warn!(peer = %self.peer_id, err = %error, "Error serving sync protocol"); self.inbound = None; } } @@ -467,12 +467,8 @@ async fn handle_inbound_stream( } else { let (inserted, count) = server.set_connected(peer_id).await; if inserted { - info!( - peer = %peer_id, - connected = count, - expected = server.expected_peer_count(), - "Connected to peer" - ); + let expected = server.expected_peer_count(); + info!(peer = %peer_id, "Connected to peer {count} of {expected}"); } } diff --git a/crates/p2p/src/proto.rs b/crates/p2p/src/proto.rs index 409e136a..04f8581a 100644 --- a/crates/p2p/src/proto.rs +++ b/crates/p2p/src/proto.rs @@ -57,8 +57,14 @@ pub async fn write_fixed_size_delimited( let len = i64::try_from(payload.len()) .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "payload length overflow"))?; - stream.write_all(&len.to_le_bytes()).await?; - stream.write_all(payload).await?; + // Charon's `readSizedProto` uses a single `reader.Read(buf)` for the + // payload (not `io.ReadFull`), so it requires the length prefix and payload + // to arrive in 
one libp2p chunk. Coalesce them into one `write_all` to + // avoid splitting across yamux frames. + let mut buf = Vec::with_capacity(8usize.saturating_add(payload.len())); + buf.extend_from_slice(&len.to_le_bytes()); + buf.extend_from_slice(payload); + stream.write_all(&buf).await?; stream.flush().await } diff --git a/crates/p2p/src/relay.rs b/crates/p2p/src/relay.rs index 9219da59..af153640 100644 --- a/crates/p2p/src/relay.rs +++ b/crates/p2p/src/relay.rs @@ -252,7 +252,6 @@ impl MutableRelayReservation { /// Processes pending subscription events. fn process_subscription_events(&mut self) { - tracing::debug!("Processing subscription events"); let peers = { let Ok(mut queue) = self.subscription_events.lock() else { tracing::warn!("Failed to lock subscription events queue"); diff --git a/scripts/dkg-runner/README.md b/scripts/dkg-runner/README.md index 98ee1daa..9b38f0c8 100644 --- a/scripts/dkg-runner/README.md +++ b/scripts/dkg-runner/README.md @@ -60,6 +60,8 @@ All variables are optional. Set them in the environment before calling any scrip | `FEE_RECIPIENT` | `0xDeaDBeef…` | Fee recipient address for the cluster | | `WITHDRAWAL_ADDR` | `0xDeaDBeef…` | Withdrawal address for the cluster | | `TIMEOUT` | `120` | Seconds to wait before declaring the ceremony failed | +| `SHUTDOWN_DELAY` | `30s` | Graceful shutdown delay passed to each node via `--shutdown-delay` | +| `NODE_EXIT_TIMEOUT` | `90` | Seconds to wait for node processes to exit cleanly after artifacts appear | | `PLUTO_BIN` | `./target/debug/pluto` | Path to the Pluto binary (only required when `PLUTO_NODES > 0`) | | `CHARON_BIN` | `charon` | Path to the Charon binary | | `WORK_DIR` | `/tmp/dkg-run` | Scratch directory β€” wiped at the start of every run | @@ -75,7 +77,7 @@ All variables are optional. 
Set them in the environment before calling any scrip | 1 | `setup.sh` | Wipes `WORK_DIR`, creates `node-0/`…`node-N/` data dirs, generates a p2p key + ENR for each node (`pluto create enr` / `charon create enr`), then runs `charon create dkg --operator-enrs=…` | | 2 | `start-nodes.sh` | Starts Pluto nodes (slots 0…PLUTO_NODES-1) and Charon nodes (remaining slots) as background processes, each in its own process group; logs to `node-N/node.log` | | 3 | `monitor.sh` | Waits for `cluster-lock.json` to appear in every node's data dir; exits 0 on completion, 1 on timeout (with the tail of each `node.log` dumped to stderr) | -| 4 | *(inline)* | Sends SIGTERM to each node's process group unless `KEEP_NODES` is enabled | +| 4 | `wait-node-exits.sh` | Waits for each node process to exit with status `0` unless `KEEP_NODES` is enabled | | 5 | `collect.sh` | Copies keystores and `cluster-lock.json` to `WORK_DIR/output/`; prints a summary | On success, outputs are under `$WORK_DIR/output/`. On failure or timeout, partial outputs are still collected and `WORK_DIR` is preserved for inspection. `run.sh` never deletes `WORK_DIR`; use `./scripts/dkg-runner/reset.sh` when you're done. 
@@ -93,6 +95,7 @@ Ctrl-C at any point kills all node process groups cleanly via the SIGINT trap; ` | `start-nodes.sh` | Launches node processes in the background (each in its own process group) | | `run-node.sh` | Runs a single node in the foreground: `run-node.sh ` | | `monitor.sh` | Waits for ceremony completion or timeout | +| `wait-node-exits.sh` | Waits for all node processes to report clean exit codes | | `collect.sh` | Gathers keystores and lock file into `output/` | | `reset.sh` | Kills all nodes and removes `WORK_DIR` (the explicit cleanup tool) | | `config.sh` | Shared env-var defaults sourced by every script | diff --git a/scripts/dkg-runner/config.sh b/scripts/dkg-runner/config.sh index 63519a09..33eac79d 100755 --- a/scripts/dkg-runner/config.sh +++ b/scripts/dkg-runner/config.sh @@ -11,6 +11,8 @@ : "${CHARON_NODES:=2}" : "${RELAY_URL:=https://0.relay.obol.tech}" : "${TIMEOUT:=120}" +: "${SHUTDOWN_DELAY:=30s}" +: "${NODE_EXIT_TIMEOUT:=90}" : "${PLUTO_BIN:=./target/debug/pluto}" : "${CHARON_BIN:=charon}" : "${WORK_DIR:=/tmp/dkg-run}" diff --git a/scripts/dkg-runner/monitor.sh b/scripts/dkg-runner/monitor.sh index 6e4a57b6..38d15a2b 100755 --- a/scripts/dkg-runner/monitor.sh +++ b/scripts/dkg-runner/monitor.sh @@ -25,7 +25,17 @@ POLL_INTERVAL=2 TAIL_LINES=30 log_info "Waiting for ${NODES} nodes (timeout: ${TIMEOUT}s)" -log_info "Completion = cluster-lock.json present in ${WORK_DIR}/node-*/" +log_info "Completion = cluster-lock.json AND keystore-*.json present in ${WORK_DIR}/node-*/" + +# A node is done when both cluster-lock.json and at least one keystore are +# present. Pluto writes keystores under validator_keys/, Charon writes them +# flat in the data dir β€” accept either layout. 
+node_done() { + local node_dir="${1}" + [[ -f "${node_dir}/cluster-lock.json" ]] || return 1 + compgen -G "${node_dir}/validator_keys/keystore-*.json" > /dev/null 2>&1 \ + || compgen -G "${node_dir}/keystore-*.json" > /dev/null 2>&1 +} start_time="${SECONDS}" last_count=-1 @@ -34,7 +44,7 @@ while true; do elapsed=$(( SECONDS - start_time )) done_count=0 for (( i = 0; i < NODES; i++ )); do - if [[ -f "${WORK_DIR}/node-${i}/cluster-lock.json" ]]; then + if node_done "${WORK_DIR}/node-${i}"; then done_count=$(( done_count + 1 )) fi done diff --git a/scripts/dkg-runner/run-node.sh b/scripts/dkg-runner/run-node.sh index 811e6d2d..6dc05c82 100755 --- a/scripts/dkg-runner/run-node.sh +++ b/scripts/dkg-runner/run-node.sh @@ -94,4 +94,5 @@ log_info "==============================================" --definition-file="${DEF_FILE}" \ --data-dir="${DATA_DIR}" \ --p2p-relays="${RELAY_URL}" \ + --shutdown-delay="${SHUTDOWN_DELAY}" \ 2>&1 | tee "${LOG_FILE}" diff --git a/scripts/dkg-runner/run.sh b/scripts/dkg-runner/run.sh index 59bab8de..9ba3f124 100755 --- a/scripts/dkg-runner/run.sh +++ b/scripts/dkg-runner/run.sh @@ -12,6 +12,7 @@ # RELAY_URL=https://0.relay.obol.tech # Relay ENR endpoint used by the DKG nodes. # TIMEOUT=120 Seconds to wait for all nodes before aborting. +# NODE_EXIT_TIMEOUT=90 Seconds to wait for nodes to exit after completion. # PLUTO_BIN=./target/debug/pluto # Path to the Pluto binary. # CHARON_BIN=charon Path to the Charon binary. 
@@ -89,6 +90,7 @@ log_info " CHARON_NODES = ${CHARON_NODES}" log_info " RELAY_URL = ${RELAY_URL}" log_info " NETWORK = ${NETWORK}" log_info " TIMEOUT = ${TIMEOUT}s" +log_info " NODE_EXIT_TIMEOUT = ${NODE_EXIT_TIMEOUT}s" log_info " PLUTO_BIN = ${PLUTO_BIN}" log_info " CHARON_BIN = ${CHARON_BIN}" log_info " WORK_DIR = ${WORK_DIR}" @@ -119,8 +121,17 @@ fi if is_truthy "${KEEP_NODES}"; then log_info "--- Phase 4: Keep nodes running (ceremony complete) ---" else - log_info "--- Phase 4: Stop nodes (ceremony complete) ---" - _kill_nodes || true + log_info "--- Phase 4: Wait for clean node exits ---" + wait_exit=0 + "${SCRIPT_DIR}/wait-node-exits.sh" || wait_exit=$? + if (( wait_exit != 0 )); then + log_err "One or more nodes exited unsuccessfully after producing artifacts." + _kill_nodes || true + "${SCRIPT_DIR}/collect.sh" || true + log_info "Work dir preserved at ${WORK_DIR}. Run ${SCRIPT_DIR}/reset.sh to remove it." + trap - INT TERM + exit 1 + fi fi log_info "--- Phase 5: Collect outputs ---" diff --git a/scripts/dkg-runner/start-nodes.sh b/scripts/dkg-runner/start-nodes.sh index 929be0b9..578ff63b 100755 --- a/scripts/dkg-runner/start-nodes.sh +++ b/scripts/dkg-runner/start-nodes.sh @@ -49,24 +49,40 @@ start_node() { local label="${3}" local data_dir="${WORK_DIR}/node-${index}" local log_file="${data_dir}/node.log" + local exit_file="${data_dir}/exit-code" mkdir -p "${data_dir}" + rm -f "${exit_file}" log_info "Starting ${label} node ${index} (bin: ${bin})" if is_ci; then # Quiet path for CI: write to log file only. - "${bin}" dkg \ - --definition-file="${DEF_FILE}" \ - --data-dir="${data_dir}" \ - --p2p-relays="${RELAY_URL}" \ - > "${log_file}" 2>&1 & + ( + set +e + "${bin}" dkg \ + --definition-file="${DEF_FILE}" \ + --data-dir="${data_dir}" \ + --p2p-relays="${RELAY_URL}" \ + --shutdown-delay="${SHUTDOWN_DELAY}" \ + > "${log_file}" 2>&1 + status=$? 
+ echo "${status}" > "${exit_file}" + exit "${status}" + ) & else # Interactive path: tee to log file and the terminal. - "${bin}" dkg \ - --definition-file="${DEF_FILE}" \ - --data-dir="${data_dir}" \ - --p2p-relays="${RELAY_URL}" \ - > >(tee "${log_file}") 2>&1 & + ( + set +e + "${bin}" dkg \ + --definition-file="${DEF_FILE}" \ + --data-dir="${data_dir}" \ + --p2p-relays="${RELAY_URL}" \ + --shutdown-delay="${SHUTDOWN_DELAY}" \ + > >(tee "${log_file}") 2>&1 + status=$? + echo "${status}" > "${exit_file}" + exit "${status}" + ) & fi echo "$!" >> "${PID_FILE}" diff --git a/scripts/dkg-runner/stress.sh b/scripts/dkg-runner/stress.sh new file mode 100755 index 00000000..65bbb72a --- /dev/null +++ b/scripts/dkg-runner/stress.sh @@ -0,0 +1,405 @@ +#!/usr/bin/env bash +# stress.sh β€” Run N DKG ceremonies back-to-back (or in parallel) for stress +# testing. Each ceremony gets its own isolated WORK_DIR; results are aggregated +# into a TSV summary. +# +# Usage: +# ./stress.sh [--help] +# +# Stress-test variables (all optional; defaults shown): +# RUNS=10 Total ceremonies to run. +# WORKERS=1 Concurrent ceremonies. +# STRESS_WORK_DIR=/tmp/dkg-stress Base directory; each run uses run-NNN/. +# KEEP_PASSED=0 When truthy, keep full per-run dirs even +# on success. Default trims node-*/ on pass +# to save disk; failed runs are always kept. +# INTERACTIVE=auto auto|1|0. When auto (default), uses an +# in-place TUI table when stdout is a TTY, +# CI is unset, and the table fits the +# terminal. Set to 1 to force, 0 to disable. +# +# Per-run variables (forwarded to run.sh β€” see run.sh --help for full list): +# NODES, THRESHOLD, PLUTO_NODES, CHARON_NODES, RELAY_URL, NETWORK, +# FEE_RECIPIENT, WITHDRAWAL_ADDR, TIMEOUT, NODE_EXIT_TIMEOUT, +# SHUTDOWN_DELAY, PLUTO_BIN, CHARON_BIN. +# RELAY_URL is overridden per run with a random index in https://{0..4}.relay.obol.tech. +# +# WORK_DIR from the environment is ignored β€” stress.sh assigns one per run. 
+# CI is forced to "true" when WORKERS > 1 so node logs don't interleave. +# +# Outputs: +# ${STRESS_WORK_DIR}/summary.tsv TSV with one row per run. +# ${STRESS_WORK_DIR}/run-NNN/run.log Captured stdout/stderr of run.sh. +# ${STRESS_WORK_DIR}/run-NNN/... Whatever run.sh wrote (preserved +# for failed runs; trimmed on pass +# unless KEEP_PASSED is truthy). +# +# Exit codes: +# 0 β€” all RUNS ceremonies passed. +# 1 β€” one or more failed (failed runs preserved for inspection). +# 130 β€” interrupted; in-flight workers terminated. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" +LOG_PREFIX="stress" + +if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then + grep '^#' "${BASH_SOURCE[0]}" | grep -v '#!/' | sed 's/^# \?//' + exit 0 +fi + +# ── Stress-test params ─────────────────────────────────────────────────────── + +: "${RUNS:=10}" +: "${WORKERS:=1}" +: "${STRESS_WORK_DIR:=/tmp/dkg-stress}" +: "${KEEP_PASSED:=0}" +: "${INTERACTIVE:=auto}" + +if (( RUNS < 1 )); then + log_err "RUNS must be >= 1 (got ${RUNS})" + exit 1 +fi +if (( WORKERS < 1 )); then + log_err "WORKERS must be >= 1 (got ${WORKERS})" + exit 1 +fi +if (( WORKERS > RUNS )); then + WORKERS=${RUNS} +fi + +mkdir -p "${STRESS_WORK_DIR}" +SUMMARY="${STRESS_WORK_DIR}/summary.tsv" +printf 'run_id\tstatus\tduration_s\tstart_time\twork_dir\n' > "${SUMMARY}" + +# Force CI=true when running in parallel so per-node logs don't interleave on +# the controlling terminal. Each run's stdout/stderr is captured to its own +# run.log regardless, so this only changes the live-tail behaviour. +WORKER_CI="${CI:-}" +if (( WORKERS > 1 )) && [[ -z "${WORKER_CI}" ]]; then + WORKER_CI="true" +fi + +# ── Interactive TUI vs append-only logging ─────────────────────────────────── +# +# In TUI mode each run owns one terminal row that mutates pending β†’ running β†’ +# PASS/FAIL, plus a footer summary. 
The mode is auto-disabled when: +# - stdout isn't a tty (piped, redirected, CI) +# - CI env is truthy +# - the table doesn't fit (RUNS + footer would exceed the terminal height) +# In all other cases, workers emit per-state log lines as before. + +INTERACTIVE_MODE=0 +INTERACTIVE_REASON="" +case "${INTERACTIVE}" in + 1|true|TRUE|True|yes|YES|Yes|on|ON|On) + INTERACTIVE_MODE=1 + ;; + 0|false|FALSE|False|no|NO|No|off|OFF|Off) + INTERACTIVE_MODE=0 + INTERACTIVE_REASON="forced off" + ;; + auto|AUTO|Auto|"") + if ! [[ -t 1 ]]; then + INTERACTIVE_REASON="stdout is not a tty" + elif is_truthy "${CI:-}"; then + INTERACTIVE_REASON="CI is set" + else + term_lines=$(tput lines 2>/dev/null || echo 0) + # Need RUNS rows + 1 footer; leave a couple of lines breathing room + # and for the prompt that comes back when we exit. + if (( term_lines >= RUNS + 3 )); then + INTERACTIVE_MODE=1 + else + INTERACTIVE_REASON="terminal has ${term_lines} rows, need >= $(( RUNS + 3 )); resize taller or set INTERACTIVE=0 to silence" + fi + fi + ;; + *) + log_err "INTERACTIVE must be auto|1|0 (got: ${INTERACTIVE})" + exit 1 + ;; +esac + +STATE_DIR="${STRESS_WORK_DIR}/.state" +rm -rf "${STATE_DIR}" +mkdir -p "${STATE_DIR}" +for (( i = 1; i <= RUNS; i++ )); do + printf 'pending\n' > "${STATE_DIR}/$(printf 'run-%04d' "${i}")" +done + +write_state() { + local id="${1}" + local state="${2}" + printf '%s\n' "${state}" > "${STATE_DIR}/$(printf 'run-%04d' "${id}")" +} + +# ANSI helpers (only emit escapes when we'll be drawing to the terminal). 
+ansi() { + if (( INTERACTIVE_MODE )); then + printf '\033[%sm' "${1}" + fi +} +reset() { + if (( INTERACTIVE_MODE )); then + printf '\033[0m' + fi +} + +format_run_line() { + local label="${1}" + local state="${2}" + local now="${3}" + case "${state}" in + pending) + printf ' %s %spending%s' "${label}" "$(ansi 2)" "$(reset)" + ;; + running:*) + local since="${state#running:}" + local elapsed=$(( now - since )) + printf ' %s %srunning%s %3ds' \ + "${label}" "$(ansi 33)" "$(reset)" "${elapsed}" + ;; + pass:*) + local dur="${state#pass:}" + printf ' %s %sPASS %s %3ds' \ + "${label}" "$(ansi '1;32')" "$(reset)" "${dur}" + ;; + fail:*) + local dur="${state#fail:}" + printf ' %s %sFAIL %s %3ds' \ + "${label}" "$(ansi '1;31')" "$(reset)" "${dur}" + ;; + esac +} + +# Lines drawn by the most recent draw_table call (RUNS rows + 1 footer). +# 0 means we haven't drawn yet, so the next call doesn't try to move the +# cursor up over content that isn't there. +TABLE_LINES=0 + +draw_table() { + (( INTERACTIVE_MODE )) || return 0 + + local now + now=$(date +%s) + + # Move cursor back to the top of the previously-drawn block. + if (( TABLE_LINES > 0 )); then + printf '\033[%dA' "${TABLE_LINES}" + fi + + local pass=0 fail=0 run=0 pend=0 + for (( i = 1; i <= RUNS; i++ )); do + local label state + label=$(printf 'run-%04d' "${i}") + state=$(<"${STATE_DIR}/${label}") + case "${state}" in + pending) (( pend++ )) ;; + running:*) (( run++ )) ;; + pass:*) (( pass++ )) ;; + fail:*) (( fail++ )) ;; + esac + # \033[2K clears the entire line; \r ensures we start at column 0. 
+ printf '\r\033[2K%s\n' "$(format_run_line "${label}" "${state}" "${now}")" + done + + printf '\r\033[2K %sPASS%s %d %sFAIL%s %d %srun%s %d %spend%s %d (%d/%d done)\n' \ + "$(ansi '1;32')" "$(reset)" "${pass}" \ + "$(ansi '1;31')" "$(reset)" "${fail}" \ + "$(ansi 33)" "$(reset)" "${run}" \ + "$(ansi 2)" "$(reset)" "${pend}" \ + $(( pass + fail )) "${RUNS}" + + TABLE_LINES=$(( RUNS + 1 )) +} + +# ── Cleanup / signal handling ──────────────────────────────────────────────── + +WORKER_PIDS=() + +_kill_workers() { + (( ${#WORKER_PIDS[@]} == 0 )) && return 0 + for pid in "${WORKER_PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + # Each worker is its own process group (set -m below), so signal + # the whole group to take down run.sh and any node descendants. + kill -TERM -- "-${pid}" 2>/dev/null \ + || kill -TERM "${pid}" 2>/dev/null \ + || true + fi + done +} + +_on_signal() { + if (( INTERACTIVE_MODE )) && (( TABLE_LINES > 0 )); then + # Draw a final frame so any in-flight rows get a last update before + # we leave them in place; then move below the table to print our + # warning, so the cleanup messages don't overwrite it. + draw_table + fi + log_warn "Caught signal β€” terminating ${#WORKER_PIDS[@]} in-flight worker(s)" + _kill_workers + wait 2>/dev/null || true + log_info "Aborted. Partial summary at ${SUMMARY}" + exit 130 +} + +trap '_on_signal' INT TERM + +# ── Worker ─────────────────────────────────────────────────────────────────── + +run_one() { + local id="${1}" + local label + label=$(printf 'run-%04d' "${id}") + local run_dir="${STRESS_WORK_DIR}/${label}" + local run_log="${run_dir}/run.log" + + rm -rf "${run_dir}" + mkdir -p "${run_dir}" + + local started ended duration status start_time + started=$(date +%s) + start_time=$(date -u +%Y-%m-%dT%H:%M:%SZ) + + local run_relay_url="https://$(( RANDOM % 5 )).relay.obol.tech" + + write_state "${id}" "running:${started}" + if (( ! 
INTERACTIVE_MODE )); then + log_info "[${label}] starting (relay: ${run_relay_url})" + fi + + # Each ceremony runs in an isolated WORK_DIR. All other run.sh env vars + # are inherited from this script's environment. + if WORK_DIR="${run_dir}" CI="${WORKER_CI}" RELAY_URL="${run_relay_url}" \ + "${SCRIPT_DIR}/run.sh" >"${run_log}" 2>&1; then + status="pass" + else + status="fail" + fi + + ended=$(date +%s) + duration=$(( ended - started )) + + write_state "${id}" "${status}:${duration}" + + # Atomic-ish append: a single printf-write of one line to a TSV is + # effectively safe under typical bash buffering with WORKERS in single + # digits, but parallel writers can in principle interleave. A flock + # would be cleaner; we accept the small risk for portability (no + # flock(1) on macOS by default). + printf '%s\t%s\t%d\t%s\t%s\n' \ + "${label}" "${status}" "${duration}" "${start_time}" "${run_dir}" \ + >> "${SUMMARY}" + + if [[ "${status}" == "pass" ]]; then + if (( ! INTERACTIVE_MODE )); then + log_info "[${label}] PASS in ${duration}s" + fi + if ! is_truthy "${KEEP_PASSED}"; then + # Keep run.log + cluster-lock.json for verification; drop the + # node data dirs, which dominate disk usage. + rm -rf "${run_dir}/node-"*/ 2>/dev/null || true + fi + else + if (( ! 
INTERACTIVE_MODE )); then + log_err "[${label}] FAIL after ${duration}s β€” preserved at ${run_dir}" + fi + fi +} + +# ── Dispatch ───────────────────────────────────────────────────────────────── + +log_info "==============================================" +log_info "DKG stress test" +log_info " RUNS = ${RUNS}" +log_info " WORKERS = ${WORKERS}" +log_info " STRESS_WORK_DIR = ${STRESS_WORK_DIR}" +log_info " KEEP_PASSED = ${KEEP_PASSED}" +log_info " CI (per worker) = ${WORKER_CI:-}" +if (( INTERACTIVE_MODE )); then + log_info " INTERACTIVE = ${INTERACTIVE} (active)" +else + log_info " INTERACTIVE = ${INTERACTIVE} (disabled${INTERACTIVE_REASON:+: ${INTERACTIVE_REASON}})" +fi +log_info " (per-run config inherited from environment; see run.sh --help)" +log_info "==============================================" + +# Job control: each backgrounded worker becomes its own process group leader, +# so $! == PGID and we can signal the whole tree (run.sh + nodes) on cleanup. +set -m + +# Initial frame so the user sees the table immediately, with all rows pending. +draw_table + +next=1 +while (( next <= RUNS )) || (( ${#WORKER_PIDS[@]} > 0 )); do + # Fill the worker pool up to WORKERS. + while (( ${#WORKER_PIDS[@]} < WORKERS )) && (( next <= RUNS )); do + run_one "${next}" & + WORKER_PIDS+=("$!") + next=$(( next + 1 )) + done + + # Tick: redraw, sleep, then reap finished workers. Polled rather than + # `wait -n` for portability across bash 3.2 (macOS default). + draw_table + sleep 1 + alive=() + for pid in "${WORKER_PIDS[@]}"; do + if kill -0 "${pid}" 2>/dev/null; then + alive+=("${pid}") + else + wait "${pid}" 2>/dev/null || true + fi + done + WORKER_PIDS=("${alive[@]+"${alive[@]}"}") +done + +# Final frame so the table reflects the last state transition. 
+draw_table + +trap - INT TERM + +# ── Aggregate ──────────────────────────────────────────────────────────────── + +pass=$(awk -F'\t' 'NR>1 && $2=="pass"' "${SUMMARY}" | wc -l | tr -d ' ') +fail=$(awk -F'\t' 'NR>1 && $2=="fail"' "${SUMMARY}" | wc -l | tr -d ' ') +total=$(( pass + fail )) + +if (( total == 0 )); then + log_err "No runs completed." + exit 1 +fi + +read -r dmin dmax dmean < <( + awk -F'\t' 'NR>1 { + d = $3 + 0 + if (n == 0 || d < min) min = d + if (d > max) max = d + sum += d + n++ + } END { + printf "%d %d %.1f", min, max, (n>0 ? sum/n : 0) + }' "${SUMMARY}" +) + +log_info "==============================================" +log_info "Stress test complete" +log_info " Passed: ${pass}/${total}" +log_info " Failed: ${fail}/${total}" +log_info " Duration min/mean/max = ${dmin}s / ${dmean}s / ${dmax}s" +log_info " Summary: ${SUMMARY}" +log_info "==============================================" + +if (( fail > 0 )); then + log_warn "Failed runs:" + awk -F'\t' 'NR>1 && $2=="fail" {printf " %s (%ds) %s\n", $1, $3, $5}' \ + "${SUMMARY}" >&2 + exit 1 +fi +exit 0 diff --git a/scripts/dkg-runner/wait-node-exits.sh b/scripts/dkg-runner/wait-node-exits.sh new file mode 100755 index 00000000..523ac0f0 --- /dev/null +++ b/scripts/dkg-runner/wait-node-exits.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +# wait-node-exits.sh β€” waits for every DKG node to report a clean exit. 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=config.sh +source "${SCRIPT_DIR}/config.sh" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" +LOG_PREFIX="wait-node-exits" + +log_tail() { + local index="${1}" + local log_file="${WORK_DIR}/node-${index}/node.log" + if [[ -f "${log_file}" ]]; then + log_err "Last log lines for node-${index}:" + tail -40 "${log_file}" >&2 || true + else + log_err "No log file for node-${index}: ${log_file}" + fi +} + +node_exit_code() { + local index="${1}" + local exit_file="${WORK_DIR}/node-${index}/exit-code" + [[ -f "${exit_file}" ]] || return 1 + cat "${exit_file}" +} + +# Pluto and Charon both emit this line right after the ceremony finishes, +# before their final shutdown/teardown. Treat it as authoritative β€” by this +# point monitor.sh has already verified the artifacts on disk. +SUCCESS_LINE="Successfully completed DKG ceremony" +PID_FILE="${WORK_DIR}/pids" + +node_logged_success() { + local index="${1}" + local log_file="${WORK_DIR}/node-${index}/node.log" + [[ -f "${log_file}" ]] || return 1 + grep -qF "${SUCCESS_LINE}" "${log_file}" +} + +log_info "Waiting for ${NODES} node exit codes (timeout: ${NODE_EXIT_TIMEOUT}s)" + +start_time=$(date +%s) +while true; do + done_count=0 + success_count=0 + for (( i = 0; i < NODES; i++ )); do + if [[ -f "${WORK_DIR}/node-${i}/exit-code" ]]; then + done_count=$(( done_count + 1 )) + code=$(node_exit_code "${i}") + if [[ "${code}" != "0" ]]; then + log_err "node-${i} exited with status ${code}" + log_tail "${i}" + exit 1 + fi + fi + if node_logged_success "${i}"; then + success_count=$(( success_count + 1 )) + fi + done + + if (( done_count == NODES )); then + break + fi + + # Short-circuit: every node logged success but some are still running + # (e.g. blocked on SHUTDOWN_DELAY). Kill survivors and treat as success. + if (( success_count == NODES )); then + log_info "All ${NODES} nodes logged success; stopping survivors." 
+ kill_pgids "${PID_FILE}" 5 + log_info "All ${NODES} nodes stopped after success log line." + exit 0 + fi + + elapsed=$(( $(date +%s) - start_time )) + if (( elapsed >= NODE_EXIT_TIMEOUT )); then + log_err "TIMEOUT after ${elapsed}s β€” ${done_count}/${NODES} nodes exited" + for (( i = 0; i < NODES; i++ )); do + [[ -f "${WORK_DIR}/node-${i}/exit-code" ]] || log_tail "${i}" + done + exit 1 + fi + + sleep 1 +done + +failed=0 +for (( i = 0; i < NODES; i++ )); do + code=$(node_exit_code "${i}") + if [[ "${code}" != "0" ]]; then + log_err "node-${i} exited with status ${code}" + log_tail "${i}" + failed=1 + fi +done + +if (( failed != 0 )); then + exit 1 +fi + +log_info "All ${NODES} nodes exited cleanly." diff --git a/tools/dkg-stress/.gitignore b/tools/dkg-stress/.gitignore new file mode 100644 index 00000000..2f7896d1 --- /dev/null +++ b/tools/dkg-stress/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/tools/dkg-stress/Cargo.lock b/tools/dkg-stress/Cargo.lock new file mode 100644 index 00000000..f84c7651 --- /dev/null +++ b/tools/dkg-stress/Cargo.lock @@ -0,0 +1,779 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "ansi-to-tui" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67555e1f1ece39d737e28c8a017721287753af3f93225e4a445b29ccb0f5912c" +dependencies = [ + "nom", + "ratatui", + "simdutf8", + "smallvec", + "thiserror", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "cassowary" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" + +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "compact_str" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "static_assertions", +] + +[[package]] +name = "crossterm" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +dependencies = [ + "bitflags", + "crossterm_winapi", + "mio", + "parking_lot", + "rustix", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "dkg-stress" +version = "0.1.0" +dependencies = [ + "ansi-to-tui", + "anyhow", + "clap", + "libc", + "ratatui", + "signal-hook", +] + 
+[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "instability" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb2d60ef19920a3a9193c3e371f726ec1dafc045dac788d0fb3704272458971" +dependencies = [ + "darling", + "indoc", + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "mio" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ratatui" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" +dependencies = [ + "bitflags", + "cassowary", + "compact_str", + "crossterm", + "indoc", + "instability", + "itertools", + "lru", + "paste", + "strum", + "unicode-segmentation", + "unicode-truncate", + "unicode-width 0.2.0", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "signal-hook" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b75a19a7a740b25bc7944bdee6172368f988763b744e3d4dfe753f6b4ece40cc" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + 
"quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-segmentation" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" + +[[package]] +name = "unicode-truncate" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3644627a5af5fa321c95b9b235a72fd24cd29c648c2c379431e6628655627bf" +dependencies = [ + "itertools", + "unicode-segmentation", + "unicode-width 0.1.14", +] + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/tools/dkg-stress/Cargo.toml b/tools/dkg-stress/Cargo.toml new file mode 100644 index 00000000..6bc08d8c --- /dev/null +++ b/tools/dkg-stress/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "dkg-stress" +version = "0.1.0" +edition = "2024" +publish = false +description = "Stress runner for DKG ceremonies (wraps scripts/dkg-runner/run.sh, ratatui UI)" + +[[bin]] +name = "dkg-stress" 
+path = "src/main.rs" + +[dependencies] +anyhow = "1" +ansi-to-tui = "7" +clap = { version = "4.5", features = ["derive", "env"] } +ratatui = "0.29" +libc = "0.2" +signal-hook = "0.3" + +[profile.release] +opt-level = 3 +lto = "thin" diff --git a/tools/dkg-stress/README.md b/tools/dkg-stress/README.md new file mode 100644 index 00000000..cadb0b3c --- /dev/null +++ b/tools/dkg-stress/README.md @@ -0,0 +1,200 @@ +# dkg-stress + +Stress runner for DKG ceremonies. Wraps `scripts/dkg-runner/run.sh` to execute +N ceremonies (sequentially or in parallel), with a live ratatui UI for +inspecting in-flight progress and per-node logs. + +This crate lives outside the main Pluto workspace (`exclude` entry in the root +`Cargo.toml`) so it has its own dependency graph and `Cargo.lock`. Build and +run it locally β€” it isn't part of `cargo build --workspace`. + +## Build + +```bash +cd tools/dkg-stress +cargo build --release +``` + +The binary lands at `tools/dkg-stress/target/release/dkg-stress`. + +`run.sh`'s prerequisites still apply: `charon` on `$PATH` (or via `CHARON_BIN`), +`pluto` built (only if `PLUTO_NODES > 0`), and a reachable relay. See +`scripts/dkg-runner/README.md` for the per-ceremony setup. + +## Quick start + +```bash +# 50 ceremonies, 4 in flight at a time, 5-minute timeout per run. +CHARON_BIN=~/projects/charon/charon RUNS=50 WORKERS=4 TIMEOUT=300 \ + ./tools/dkg-stress/target/release/dkg-stress + +# Same thing with flags rather than env vars. +./tools/dkg-stress/target/release/dkg-stress \ + --runs 50 --workers 4 + +# Sequential smoke test, all-Pluto, keep all artifacts for inspection. +PLUTO_NODES=4 CHARON_NODES=0 \ + ./tools/dkg-stress/target/release/dkg-stress \ + --runs 5 --keep-passed + +# Append-only mode (CI, log capture, redirected output). +./tools/dkg-stress/target/release/dkg-stress --runs 10 --no-tui +``` + +## Configuration + +Every option supports both a CLI flag and an environment variable. 
Flags win +when both are set; otherwise env vars; otherwise defaults. + +### Stress-runner options + +| Flag | Env var | Default | Description | +|---|---|---|---| +| `-n`, `--runs` | `RUNS` | `10` | Total ceremonies to run | +| `-w`, `--workers` | `WORKERS` | `1` | Concurrent ceremonies | +| `--work-dir` | `STRESS_WORK_DIR` | `/tmp/dkg-stress` | Base directory; each run uses `run-NNNN/` | +| `--run-script` | `DKG_RUN_SCRIPT` | `../../scripts/dkg-runner/run.sh` (relative to crate) | Path to `run.sh` | +| `--keep-passed` | `KEEP_PASSED` | off | Keep `node-*/` dirs of passed runs (default trims them) | +| `--no-tui` | `NO_TUI` | off | Disable ratatui UI; emit per-run log lines | +| `--tick-ms` | `TICK_MS` | `250` | UI redraw interval | + +### Per-ceremony options (forwarded to `run.sh` via env) + +These are inherited from the calling environment unchanged β€” see +`scripts/dkg-runner/run.sh --help` for the authoritative list: + +`NODES`, `THRESHOLD`, `PLUTO_NODES`, `CHARON_NODES`, `RELAY_URL`, `NETWORK`, +`FEE_RECIPIENT`, `WITHDRAWAL_ADDR`, `TIMEOUT`, `NODE_EXIT_TIMEOUT`, +`SHUTDOWN_DELAY`, `PLUTO_BIN`, `CHARON_BIN`. + +`WORK_DIR` is overridden per run and is **not** forwarded β€” each ceremony gets +its own isolated work dir under `STRESS_WORK_DIR`. `CI` is forced to `true` +when `WORKERS > 1` so per-node logs don't tee to the controlling terminal +(unless you explicitly export `CI` yourself). 
+ +## TUI + +``` +β”Œβ”€ DKG stress test ────────────────────────────────────────────────┐ +β”‚ runs=50 workers=4 work_dir=/tmp/dkg-stress β”‚ +β”‚ j/k=run Β· J/K=Β±10 Β· Home/End Β· Tab/h/l=log Β· PgUp/PgDn=scroll … β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ runs β”‚ run-0017 β€” running β”‚ +β”‚ run-0001 PASS β”‚ run.log β”‚ node-0 β”‚ node-1 β”‚ node-2 β”‚ node-3 β”‚ +β”‚ run-0002 PASS β”‚ ───────────────────────────────────────────── β”‚ +β”‚ run-0003 FAIL β”‚ 2026-05-08T... INFO pluto::dkg starting β”‚ +β”‚ β–Άrun-0017 run.. β”‚ ... β”‚ +β”‚ run-0018 pend β”‚ β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ PASS 16 FAIL 1 run 4 pend 29 (17/50 done) follow=auto … β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +Each in-flight run's row mutates `pending β†’ running Ns β†’ PASS/FAIL Ns` in +place. The detail pane on the right tails the selected log file (last +~256 KB), parsing ANSI escape codes so the colored Pluto/Charon log output +renders correctly. 
+ +### Keybindings + +**Run selection (left pane)** + +| Key | Action | +|---|---| +| `j` `k` `↓` `↑` | Move selection by 1 | +| `J` `K` | Move selection by 10 | +| `Home` `End` | First / last run | +| `a` | Re-engage auto-follow (selection tracks the latest active run) | + +**Log navigation (right pane)** + +| Key | Action | +|---|---| +| `Tab` `Shift-Tab` `h` `l` `←` `β†’` | Cycle log file (`run.log`, `node-0`, `node-1`, …) | +| `PgUp` `PgDn` | Scroll log by ~20 lines | +| `Ctrl-u` `Ctrl-d` | Scroll log by ~10 lines (vim half-page) | +| `Ctrl-b` `Ctrl-f` | Scroll log by ~20 lines (vim full-page) | +| `g` | Jump to top of buffer | +| `G` | Jump to tail (resume live updates) | + +**Other** + +| Key | Action | +|---|---| +| `q` `Esc` `Ctrl-C` | Graceful shutdown β€” SIGTERMs in-flight ceremonies, finalises the summary | + +Once you scroll up or move the selection, the footer shows `follow=manual` +(selection pinned) and/or `log=+N (G to follow)` (log offset). Press `a` to +return to auto-follow, `G` to snap the log back to its tail. + +## Output + +For each invocation, `dkg-stress` writes: + +``` +${STRESS_WORK_DIR}/ +β”œβ”€β”€ summary.tsv # one row per completed run +β”œβ”€β”€ run-0001/ +β”‚ β”œβ”€β”€ run.log # full stdout/stderr of this run.sh invocation +β”‚ β”œβ”€β”€ node-0/node.log # per-node logs (passed runs trim these by default) +β”‚ β”œβ”€β”€ node-1/node.log +β”‚ └── … +β”œβ”€β”€ run-0002/ +└── … +``` + +`summary.tsv` columns: `run_id`, `status` (`pass`/`fail`), `duration_s`, +`start_time` (ISO-8601 UTC), `work_dir`. New rows are appended atomically as +ceremonies complete. + +When `--keep-passed` is off (the default), `node-*/` subdirs of passed runs +are deleted to keep disk usage bounded. `run.log` and the cluster lock files +are always preserved. Failed runs are kept in full. 
+ +## Exit codes + +| Code | Meaning | +|---|---| +| `0` | All ceremonies passed | +| `1` | One or more failed; details in the final summary and `summary.tsv` | +| `2` | Configuration error (bad flag, missing `run.sh`, etc.) | +| `130` | Interrupted (SIGINT/SIGTERM/SIGHUP); in-flight ceremonies are SIGTERM'd, partial summary preserved | + +## Graceful shutdown + +`q`, `Esc`, and `Ctrl-C` from the TUI, plus external `SIGINT` / `SIGTERM` / +`SIGHUP`, all flow through the same path: + +1. Set the shared stop flag β€” workers stop dispatching new runs. +2. SIGTERM every in-flight `run.sh` process group, so each ceremony's + `_on_signal` trap fires and shuts the four nodes down cleanly. +3. Wait up to 5 s for clean exits, then SIGKILL stragglers. +4. Restore the terminal, finalise `summary.tsv`, print aggregate stats. + +No orphan processes; partial runs are recorded as `fail` with their actual +runtime, un-started runs as "skipped". + +## Troubleshooting + +**"could not find run.sh"** β€” pass `--run-script` or set `DKG_RUN_SCRIPT`. The +default lookup walks two directories up from the binary's manifest dir, so it +only auto-resolves when running from a checkout. + +**TUI is garbled / shows raw escape codes** β€” pluto/charon logs are now +parsed with `ansi-to-tui`. If you still see escapes, the file likely contains +non-SGR control sequences; switch tabs or hit `g` to refresh. + +**Scrolling does nothing** β€” make sure you're hitting the log pane keys +(`PgUp`/`PgDn`, `Ctrl-u`/`Ctrl-d`), not the run-selection keys (`j`/`k`). +The detail title shows `[+N lines]` once you've scrolled. Bear in mind the +buffer is the last 256 KB of the file β€” extremely long ceremonies will only +let you scroll back through that window. + +**"all failed" with no obvious cause** β€” open one of the failed runs in the +TUI, cycle through `run.log` (orchestration output) and each `node-N/node.log` +to find the first error. 
If `KEEP_PASSED` was off and you want artifacts of +all runs, re-run with `--keep-passed`. + +**Workers wedged after Ctrl-C** β€” should not happen; check +`pgrep -fl run.sh`. If anything sticks around, file an issue with the +`/tmp/dkg-stress/run-NNNN/` directory contents. diff --git a/tools/dkg-stress/src/cli.rs b/tools/dkg-stress/src/cli.rs new file mode 100644 index 00000000..5ff5c916 --- /dev/null +++ b/tools/dkg-stress/src/cli.rs @@ -0,0 +1,45 @@ +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser, Debug, Clone)] +#[command( + name = "dkg-stress", + about = "Run N DKG ceremonies (back-to-back or in parallel) with a live ratatui UI.", + long_about = "Wraps scripts/dkg-runner/run.sh, dispatching N runs across W parallel \ + workers with isolated WORK_DIRs. Per-run config (NODES, THRESHOLD, \ + PLUTO_NODES, CHARON_NODES, TIMEOUT, etc.) is forwarded to run.sh \ + via the inherited environment β€” see run.sh --help for the full list." +)] +pub struct Cli { + /// Total number of ceremonies to run. + #[arg(short = 'n', long, env = "RUNS", default_value_t = 10)] + pub runs: u32, + + /// Number of ceremonies in flight at the same time. + #[arg(short = 'w', long, env = "WORKERS", default_value_t = 1)] + pub workers: u32, + + /// Base directory; each run uses run-NNNN/ inside it. + #[arg(long, env = "STRESS_WORK_DIR", default_value = "/tmp/dkg-stress")] + pub work_dir: PathBuf, + + /// Path to scripts/dkg-runner/run.sh. Defaults to the script next to the + /// repo's checked-in copy, resolved relative to the binary's location. + #[arg(long, env = "DKG_RUN_SCRIPT")] + pub run_script: Option<PathBuf>, + + /// Keep full per-run dirs even on success. By default, node-*/ subdirs of + /// passed runs are deleted to save disk; failed run dirs are always kept. + #[arg(long, env = "KEEP_PASSED")] + pub keep_passed: bool, + + /// Disable the ratatui UI; emit per-run log lines instead. Auto-enabled + /// when stdout isn't a TTY or CI is set.
+ #[arg(long, env = "NO_TUI")] + pub no_tui: bool, + + /// UI tick rate in milliseconds (how often the table redraws and elapsed + /// counters advance). Lower = smoother but more CPU. + #[arg(long, env = "TICK_MS", default_value_t = 250)] + pub tick_ms: u64, +} diff --git a/tools/dkg-stress/src/config.rs b/tools/dkg-stress/src/config.rs new file mode 100644 index 00000000..c9989634 --- /dev/null +++ b/tools/dkg-stress/src/config.rs @@ -0,0 +1,125 @@ +use anyhow::{Context, Result, bail}; +use std::fs; +use std::io::{BufWriter, Write}; +use std::path::{Path, PathBuf}; +use std::sync::Mutex; + +use crate::cli::Cli; + +/// Resolved configuration shared across worker threads. All fields are +/// immutable after construction; mutable shared state lives on `App` instead. +pub struct Config { + pub runs: u32, + pub workers: u32, + pub work_dir: PathBuf, + pub run_script: PathBuf, + pub keep_passed: bool, + pub no_tui: bool, + pub tick_ms: u64, + pub worker_ci: String, + pub summary_path: PathBuf, + /// Serialised writer for the TSV summary (multiple workers append to it). + pub summary: Mutex<BufWriter<fs::File>>, +} + +impl Config { + pub fn from_cli(cli: Cli) -> Result<Self> { + if cli.runs == 0 { + bail!("RUNS must be >= 1 (got {})", cli.runs); + } + if cli.workers == 0 { + bail!("WORKERS must be >= 1 (got {})", cli.workers); + } + let workers = cli.workers.min(cli.runs); + + let run_script = match cli.run_script { + Some(p) => p, + None => default_run_script()?, + }; + let run_script = run_script + .canonicalize() + .with_context(|| format!("run script not found: {}", run_script.display()))?; + if !run_script.is_file() { + bail!("run script is not a regular file: {}", run_script.display()); + } + + // Force CI=true for parallel runs so per-node logs don't tee to the + // controlling terminal (run.sh suppresses tee under CI). Honour any + // existing CI value the user explicitly set.
+ let worker_ci = match std::env::var("CI") { + Ok(v) if !v.is_empty() => v, + _ if workers > 1 => "true".to_string(), + _ => String::new(), + }; + + fs::create_dir_all(&cli.work_dir) + .with_context(|| format!("create work dir {}", cli.work_dir.display()))?; + let summary_path = cli.work_dir.join("summary.tsv"); + let summary_file = fs::File::create(&summary_path) + .with_context(|| format!("create summary file {}", summary_path.display()))?; + let mut summary = BufWriter::new(summary_file); + writeln!(summary, "run_id\tstatus\tduration_s\tstart_time\twork_dir")?; + summary.flush()?; + + Ok(Self { + runs: cli.runs, + workers, + work_dir: cli.work_dir, + run_script, + keep_passed: cli.keep_passed, + no_tui: cli.no_tui, + tick_ms: cli.tick_ms.max(50), + worker_ci, + summary_path, + summary: Mutex::new(summary), + }) + } + + pub fn append_summary_line( + &self, + label: &str, + status: &str, + duration_s: u64, + start_time_iso: &str, + run_dir: &Path, + ) -> Result<()> { + let mut w = self + .summary + .lock() + .map_err(|_| anyhow::anyhow!("summary writer lock poisoned"))?; + writeln!( + w, + "{}\t{}\t{}\t{}\t{}", + label, + status, + duration_s, + start_time_iso, + run_dir.display() + )?; + w.flush()?; + Ok(()) + } +} + +/// Locate scripts/dkg-runner/run.sh relative to either the running binary +/// (when invoked from a checkout) or CWD as a final fallback. +fn default_run_script() -> Result { + // The crate lives at /tools/dkg-stress; the script lives at + // /scripts/dkg-runner/run.sh. Cargo sets CARGO_MANIFEST_DIR at + // compile time so we know the crate's location regardless of how the + // binary is launched. 
+    let manifest_dir = env!("CARGO_MANIFEST_DIR");
+    let candidate = Path::new(manifest_dir)
+        .join("..")
+        .join("..")
+        .join("scripts")
+        .join("dkg-runner")
+        .join("run.sh");
+    if candidate.exists() {
+        return Ok(candidate);
+    }
+    bail!(
+        "could not find run.sh at {} — pass --run-script or set DKG_RUN_SCRIPT",
+        candidate.display()
+    )
+}
diff --git a/tools/dkg-stress/src/logs.rs b/tools/dkg-stress/src/logs.rs
new file mode 100644
index 00000000..9a82ffe0
--- /dev/null
+++ b/tools/dkg-stress/src/logs.rs
@@ -0,0 +1,37 @@
+use std::fs::{self, File};
+use std::io::{Read, Seek, SeekFrom};
+use std::path::{Path, PathBuf};
+
+/// Read up to `max_bytes` from the end of `path`, returning the trailing
+/// portion as a UTF-8 string (lossily decoded). Returns None on any I/O
+/// error or if the path doesn't exist.
+pub fn read_tail(path: &Path, max_bytes: u64) -> Option<String> {
+    let mut f = File::open(path).ok()?;
+    let len = f.metadata().ok()?.len();
+    let start = len.saturating_sub(max_bytes);
+    f.seek(SeekFrom::Start(start)).ok()?;
+    let mut buf = Vec::with_capacity(max_bytes.min(64 * 1024) as usize);
+    f.take(max_bytes).read_to_end(&mut buf).ok()?;
+    Some(String::from_utf8_lossy(&buf).into_owned())
+}
+
+/// Enumerate node-* subdirectories of `run_dir`, sorted by name. Empty if
+/// the run directory doesn't exist yet (pending) or has been pruned (passed
+/// run with KEEP_PASSED off).
+pub fn enumerate_nodes(run_dir: &Path) -> Vec<PathBuf> {
+    let Ok(entries) = fs::read_dir(run_dir) else {
+        return Vec::new();
+    };
+    let mut nodes: Vec<PathBuf> = entries
+        .flatten()
+        .map(|e| e.path())
+        .filter(|p| {
+            p.is_dir()
+                && p.file_name()
+                    .and_then(|n| n.to_str())
+                    .is_some_and(|n| n.starts_with("node-"))
+        })
+        .collect();
+    nodes.sort();
+    nodes
+}
diff --git a/tools/dkg-stress/src/main.rs b/tools/dkg-stress/src/main.rs
new file mode 100644
index 00000000..74b71b41
--- /dev/null
+++ b/tools/dkg-stress/src/main.rs
@@ -0,0 +1,248 @@
+//! DKG stress runner with a ratatui-based UI.
+//!
+//! Wraps `scripts/dkg-runner/run.sh` to execute N ceremonies, optionally in +//! parallel, with live status visualisation. Per-run config (NODES, THRESHOLD, +//! PLUTO_NODES, CHARON_NODES, TIMEOUT, …) is forwarded via the inherited +//! environment β€” see `run.sh --help`. + +mod cli; +mod config; +mod logs; +mod state; +mod ui; +mod worker; + +use anyhow::Result; +use clap::Parser; +use std::collections::HashSet; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread::{self, JoinHandle}; +use std::time::{Duration, Instant}; + +use crate::cli::Cli; +use crate::config::Config; +use crate::state::{App, RunState}; +use crate::worker::{spawn_workers, kill_all}; + +fn main() -> Result<()> { + let cli = Cli::parse(); + let config = Arc::new(Config::from_cli(cli)?); + + let app = Arc::new(Mutex::new(App::new(config.runs as usize))); + let stop = Arc::new(AtomicBool::new(false)); + install_signal_handlers(&stop)?; + let killers = Arc::new(Mutex::new(HashSet::new())); + + let workers = spawn_workers(config.clone(), app.clone(), stop.clone(), killers.clone()); + + // Auto-disable the TUI when stdout isn't a TTY (piped/redirected) β€” the + // alt-screen escapes would garble the captured output. The explicit + // --no-tui flag overrides regardless. + let use_tui = !config.no_tui && is_tty_stdout(); + + if use_tui { + let workers_done = make_done_check(&workers); + ui::run_tui( + config.clone(), + app.clone(), + stop.clone(), + killers.clone(), + workers_done, + )?; + } else { + run_logging(&config, &app, &stop, &workers); + } + + // Whatever path got us here (TUI quit, all workers finished, or the + // logging loop returned), make sure no children outlive us and the + // worker threads have a chance to drain their final-state writes. 
+ if !workers + .iter() + .all(|h| h.is_finished()) + { + stop.store(true, Ordering::Relaxed); + kill_all(&killers, Duration::from_secs(5)); + } + for h in workers { + let _ = h.join(); + } + + print_final_summary(&config, &app); + + let any_fail = match app.lock() { + Ok(a) => a.runs.iter().any(|s| matches!(s, RunState::Fail { .. })), + Err(_) => true, + }; + if any_fail { + std::process::exit(1); + } + Ok(()) +} + +fn make_done_check(workers: &[JoinHandle<()>]) -> impl Fn() -> bool + '_ { + move || workers.iter().all(|h| h.is_finished()) +} + +/// Replace the default termination handlers so SIGINT/SIGTERM/SIGHUP flip +/// the shared stop flag instead of killing us outright. This lets the TUI +/// restore the terminal and the dispatch path SIGTERM in-flight ceremonies +/// before we exit, regardless of whether the signal arrived from a tty +/// Ctrl-C (no-tui mode) or an external `kill`. +#[cfg(unix)] +fn install_signal_handlers(stop: &Arc) -> Result<()> { + use signal_hook::consts::{SIGHUP, SIGINT, SIGTERM}; + for sig in [SIGINT, SIGTERM, SIGHUP] { + signal_hook::flag::register(sig, stop.clone())?; + } + Ok(()) +} + +#[cfg(not(unix))] +fn install_signal_handlers(_stop: &Arc) -> Result<()> { + Ok(()) +} + +fn is_tty_stdout() -> bool { + // SAFETY: isatty is a pure libc syscall taking an fd; STDOUT_FILENO is + // always a valid file descriptor for our process. + unsafe { libc::isatty(libc::STDOUT_FILENO) == 1 } +} + +/// Append-only fallback for non-TTY / `--no-tui` runs. Polls App state and +/// emits one line per state transition, plus a heartbeat counter. 
+fn run_logging( + config: &Config, + app: &Mutex, + stop: &AtomicBool, + workers: &[JoinHandle<()>], +) { + eprintln!( + "dkg-stress: runs={} workers={} work_dir={}", + config.runs, + config.workers, + config.work_dir.display() + ); + let total = config.runs as usize; + let mut last: Vec = vec![RunStateTag::Pending; total]; + + loop { + let snapshot: Vec = match app.lock() { + Ok(a) => a.runs.clone(), + Err(_) => return, + }; + for (i, state) in snapshot.iter().enumerate() { + let tag = tag(state); + if tag != last[i] { + emit_transition(i + 1, state); + last[i] = tag; + } + } + if workers.iter().all(|h| h.is_finished()) { + return; + } + if stop.load(Ordering::Relaxed) { + eprintln!("dkg-stress: caught signal β€” terminating in-flight ceremonies"); + return; + } + thread::sleep(Duration::from_millis(config.tick_ms)); + } +} + +#[derive(Clone, Copy, PartialEq, Eq)] +enum RunStateTag { + Pending, + Running, + Pass, + Fail, +} + +fn tag(s: &RunState) -> RunStateTag { + match s { + RunState::Pending => RunStateTag::Pending, + RunState::Running { .. } => RunStateTag::Running, + RunState::Pass { .. } => RunStateTag::Pass, + RunState::Fail { .. } => RunStateTag::Fail, + } +} + +fn emit_transition(id: usize, state: &RunState) { + match state { + RunState::Pending => {} + RunState::Running { .. 
} => { + println!("[run-{:04}] starting", id); + } + RunState::Pass { duration_s } => { + println!("[run-{:04}] PASS in {}s", id, duration_s); + } + RunState::Fail { duration_s } => { + eprintln!("[run-{:04}] FAIL after {}s", id, duration_s); + } + } +} + +fn print_final_summary(config: &Config, app: &Mutex) { + let snapshot: Vec = match app.lock() { + Ok(a) => a.runs.clone(), + Err(_) => return, + }; + let mut passed = 0u64; + let mut failed = 0u64; + let mut pending = 0u64; + let mut min_d = u64::MAX; + let mut max_d = 0u64; + let mut sum_d = 0u64; + let mut n_d = 0u64; + for s in &snapshot { + match s { + RunState::Pass { duration_s } => { + passed += 1; + update_stats(*duration_s, &mut min_d, &mut max_d, &mut sum_d, &mut n_d); + } + RunState::Fail { duration_s } => { + failed += 1; + update_stats(*duration_s, &mut min_d, &mut max_d, &mut sum_d, &mut n_d); + } + _ => pending += 1, + } + } + + println!("=============================================="); + println!("dkg-stress complete"); + println!(" Passed: {}/{}", passed, snapshot.len()); + println!(" Failed: {}/{}", failed, snapshot.len()); + if pending > 0 { + println!(" Skipped: {} (aborted before they ran)", pending); + } + if n_d > 0 { + let mean = (sum_d as f64) / (n_d as f64); + println!(" Duration min/mean/max = {}s / {:.1}s / {}s", min_d, mean, max_d); + } + println!(" Summary: {}", config.summary_path.display()); + + if failed > 0 { + println!("Failed runs:"); + for (i, s) in snapshot.iter().enumerate() { + if let RunState::Fail { duration_s } = s { + let label = format!("run-{:04}", i + 1); + let dir = config.work_dir.join(&label); + println!(" {} ({}s) {}", label, duration_s, dir.display()); + } + } + } + println!("=============================================="); + + // Suppress unused-import warning when we only conditionally read Instant. 
+ let _ = Instant::now; +} + +fn update_stats(d: u64, min_d: &mut u64, max_d: &mut u64, sum_d: &mut u64, n: &mut u64) { + if d < *min_d { + *min_d = d; + } + if d > *max_d { + *max_d = d; + } + *sum_d = sum_d.saturating_add(d); + *n = n.saturating_add(1); +} diff --git a/tools/dkg-stress/src/state.rs b/tools/dkg-stress/src/state.rs new file mode 100644 index 00000000..7305cf64 --- /dev/null +++ b/tools/dkg-stress/src/state.rs @@ -0,0 +1,215 @@ +use std::time::Instant; + +#[derive(Clone, Copy, Debug)] +pub enum RunState { + Pending, + Running { started_at: Instant }, + Pass { duration_s: u64 }, + Fail { duration_s: u64 }, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ActivePane { + Runs, + Logs, +} + +pub struct App { + pub runs: Vec, + /// Which pane receives navigation and scroll input. + pub active_pane: ActivePane, + /// 0-based index of the run currently displayed in the detail pane. + pub selected_run: usize, + /// 0 = run.log, 1..=N = node-(idx-1)/node.log. + pub selected_tab: usize, + /// True once the user has navigated manually; suppresses auto-follow so + /// the table doesn't yank focus away from what they're inspecting. + pub manual_select: bool, + /// Number of lines scrolled back from the tail of the active log. 0 + /// means "stick to the tail" (live updates appear). Grows as the user + /// scrolls up; clamped on render to the available content. Reset on + /// run/tab switch and on `G` / End. 
+ pub log_scroll: usize, +} + +impl App { + pub fn new(total: usize) -> Self { + Self { + runs: vec![RunState::Pending; total], + active_pane: ActivePane::Runs, + selected_run: 0, + selected_tab: 0, + manual_select: false, + log_scroll: 0, + } + } + + pub fn focus_runs(&mut self) { + self.active_pane = ActivePane::Runs; + self.manual_select = true; + } + + pub fn focus_logs(&mut self) { + self.active_pane = ActivePane::Logs; + self.manual_select = true; + } + + pub fn toggle_pane(&mut self) { + self.active_pane = match self.active_pane { + ActivePane::Runs => ActivePane::Logs, + ActivePane::Logs => ActivePane::Runs, + }; + self.manual_select = true; + } + + pub fn next_run(&mut self) { + if self.runs.is_empty() { + return; + } + if self.selected_run + 1 < self.runs.len() { + self.selected_run += 1; + } + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn prev_run(&mut self) { + self.selected_run = self.selected_run.saturating_sub(1); + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn next_run_page(&mut self, page: usize) { + let last = self.runs.len().saturating_sub(1); + self.selected_run = self.selected_run.saturating_add(page).min(last); + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn prev_run_page(&mut self, page: usize) { + self.selected_run = self.selected_run.saturating_sub(page); + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn first_run(&mut self) { + self.selected_run = 0; + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn last_run(&mut self) { + self.selected_run = self.runs.len().saturating_sub(1); + self.manual_select = true; + self.log_scroll = 0; + } + + pub fn next_tab(&mut self, tab_count: usize) { + if tab_count == 0 { + return; + } + self.selected_tab = (self.selected_tab + 1) % tab_count; + self.log_scroll = 0; + } + + pub fn prev_tab(&mut self, tab_count: usize) { + if tab_count == 0 { + return; + } + self.selected_tab = if self.selected_tab == 0 { + tab_count - 
1 + } else { + self.selected_tab - 1 + }; + self.log_scroll = 0; + } + + pub fn scroll_log_up(&mut self, lines: usize) { + self.log_scroll = self.log_scroll.saturating_add(lines); + // Pin the selected run while reading scrollback so auto_advance + // doesn't yank us to a different ceremony mid-scroll. `a` / `G` + // re-engage auto-follow. + self.manual_select = true; + } + + pub fn scroll_log_down(&mut self, lines: usize) { + self.log_scroll = self.log_scroll.saturating_sub(lines); + self.manual_select = true; + } + + pub fn scroll_log_to_tail(&mut self) { + self.log_scroll = 0; + } + + /// "Go to top" β€” set scroll past any sane document length; render code + /// clamps to the actual line count. + pub fn scroll_log_to_top(&mut self) { + self.log_scroll = usize::MAX / 2; + self.manual_select = true; + } + + /// Re-engage auto-follow (selection tracks the active frontier again). + pub fn follow_auto(&mut self) { + self.manual_select = false; + self.log_scroll = 0; + self.active_pane = ActivePane::Runs; + } + + /// If the user hasn't taken manual control, keep the selection on the + /// most-recent active run. Resets log scroll when the focus moves so + /// the live tail kicks back in. + pub fn auto_advance_selection(&mut self) { + if self.manual_select { + return; + } + if let Some(idx) = self.focus_idx() + && self.selected_run != idx + { + self.selected_run = idx; + self.log_scroll = 0; + } + } + + pub fn counts(&self) -> Counts { + let mut c = Counts::default(); + for state in &self.runs { + match state { + RunState::Pending => c.pending += 1, + RunState::Running { .. } => c.running += 1, + RunState::Pass { .. } => c.passed += 1, + RunState::Fail { .. } => c.failed += 1, + } + } + c + } + + /// The largest 1-based run index that is no longer Pending. Used by the + /// UI as the auto-scroll focus so the table follows the active frontier. 
+ pub fn focus_idx(&self) -> Option { + self.runs + .iter() + .enumerate() + .rev() + .find_map(|(i, s)| (!matches!(s, RunState::Pending)).then_some(i)) + } +} + +#[derive(Default, Clone, Copy)] +pub struct Counts { + pub passed: usize, + pub failed: usize, + pub running: usize, + pub pending: usize, +} + +impl Counts { + pub fn done(&self) -> usize { + self.passed.saturating_add(self.failed) + } + pub fn total(&self) -> usize { + self.passed + .saturating_add(self.failed) + .saturating_add(self.running) + .saturating_add(self.pending) + } +} diff --git a/tools/dkg-stress/src/ui.rs b/tools/dkg-stress/src/ui.rs new file mode 100644 index 00000000..432fca93 --- /dev/null +++ b/tools/dkg-stress/src/ui.rs @@ -0,0 +1,700 @@ +use ansi_to_tui::IntoText; +use anyhow::Result; +use ratatui::Frame; +use ratatui::crossterm::{ + event::{ + self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, KeyEventKind, KeyModifiers, + MouseEvent, MouseEventKind, + }, + execute, +}; +use ratatui::layout::{Constraint, Layout, Position, Rect}; +use ratatui::style::{Color, Modifier, Style}; +use ratatui::text::{Line, Span, Text}; +use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, TableState, Tabs, Wrap}; +use std::io; +use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use crate::config::Config; +use crate::logs::{enumerate_nodes, read_tail}; +use crate::state::{ActivePane, App, Counts, RunState}; +use crate::worker::{Killers, kill_all}; + +/// Maximum bytes read from the end of a log file per draw. 256KB gives +/// the user meaningful room to scroll back through a node's log while +/// bounding I/O regardless of total log size. 
+const LOG_TAIL_BYTES: u64 = 256 * 1024; + +pub fn run_tui( + config: Arc, + app: Arc>, + stop: Arc, + killers: Killers, + workers_done: impl Fn() -> bool, +) -> Result<()> { + let mut terminal = ratatui::init(); + if let Err(err) = execute!(io::stdout(), EnableMouseCapture) { + ratatui::restore(); + return Err(err.into()); + } + let result = event_loop(&mut terminal, &config, &app, &stop, &killers, workers_done); + let _ = execute!(io::stdout(), DisableMouseCapture); + ratatui::restore(); + result +} + +fn event_loop( + terminal: &mut ratatui::DefaultTerminal, + config: &Config, + app: &Mutex, + stop: &AtomicBool, + killers: &Killers, + workers_done: impl Fn() -> bool, +) -> Result<()> { + let tick = Duration::from_millis(config.tick_ms); + let mut next_tick = Instant::now() + tick; + let mut table_state = TableState::default(); + let mut completed = false; + + loop { + // External shutdown (SIGINT/SIGTERM/SIGHUP from the signal handler + // installed in main). Mirror the in-TUI quit path: SIGTERM the + // in-flight ceremonies, then return so ratatui::restore() runs. + if stop.load(Ordering::Relaxed) { + kill_all(killers, Duration::from_secs(5)); + return Ok(()); + } + + terminal.draw(|frame| draw(frame, config, app, &mut table_state, completed))?; + + if workers_done() { + completed = true; + terminal.draw(|frame| draw(frame, config, app, &mut table_state, completed))?; + } + + let now = Instant::now(); + let timeout = next_tick.saturating_duration_since(now); + if event::poll(timeout)? { + match event::read()? { + Event::Key(key) + if key.kind == KeyEventKind::Press + && handle_key(key.code, key.modifiers, config, app, stop, killers) => + { + return Ok(()); + } + Event::Mouse(mouse) => handle_mouse(mouse, terminal, config, app)?, + _ => {} + } + } + if Instant::now() >= next_tick { + next_tick = Instant::now() + tick; + } + } +} + +/// Returns true if the caller should exit the event loop. 
+fn handle_key( + code: KeyCode, + mods: KeyModifiers, + config: &Config, + app: &Mutex, + stop: &AtomicBool, + killers: &Killers, +) -> bool { + let quit = matches!(code, KeyCode::Char('q') | KeyCode::Esc) + || (code == KeyCode::Char('c') && mods.contains(KeyModifiers::CONTROL)); + if quit { + stop.store(true, Ordering::Relaxed); + kill_all(killers, Duration::from_secs(5)); + return true; + } + + let Ok(mut a) = app.lock() else { + return false; + }; + let ctrl = mods.contains(KeyModifiers::CONTROL); + match code { + KeyCode::Tab | KeyCode::BackTab => a.toggle_pane(), + KeyCode::Char('1') => a.focus_runs(), + KeyCode::Char('2') => a.focus_logs(), + KeyCode::Char('a') => a.follow_auto(), + _ => match a.active_pane { + ActivePane::Runs => handle_runs_key(code, &mut a), + ActivePane::Logs => handle_logs_key(code, ctrl, config, &mut a), + }, + } + false +} + +fn handle_runs_key(code: KeyCode, app: &mut App) { + match code { + KeyCode::Down | KeyCode::Char('j') => app.next_run(), + KeyCode::Up | KeyCode::Char('k') => app.prev_run(), + KeyCode::PageDown | KeyCode::Char('J') => app.next_run_page(10), + KeyCode::PageUp | KeyCode::Char('K') => app.prev_run_page(10), + KeyCode::Home => app.first_run(), + KeyCode::End => app.last_run(), + KeyCode::Right | KeyCode::Char('l') | KeyCode::Enter => app.focus_logs(), + _ => {} + } +} + +fn handle_logs_key(code: KeyCode, ctrl: bool, config: &Config, app: &mut App) { + match code { + KeyCode::Left | KeyCode::Char('h') => { + let tabs = tab_count_for(config, app, app.selected_run); + app.prev_tab(tabs); + } + KeyCode::Right | KeyCode::Char('l') => { + let tabs = tab_count_for(config, app, app.selected_run); + app.next_tab(tabs); + } + KeyCode::Up | KeyCode::Char('k') => app.scroll_log_up(1), + KeyCode::Down | KeyCode::Char('j') => app.scroll_log_down(1), + KeyCode::PageUp => app.scroll_log_up(20), + KeyCode::PageDown => app.scroll_log_down(20), + KeyCode::Char('u') if ctrl => app.scroll_log_up(10), + KeyCode::Char('d') if ctrl => 
app.scroll_log_down(10), + KeyCode::Char('b') if ctrl => app.scroll_log_up(20), + KeyCode::Char('f') if ctrl => app.scroll_log_down(20), + KeyCode::Home | KeyCode::Char('g') => app.scroll_log_to_top(), + KeyCode::End | KeyCode::Char('G') => app.scroll_log_to_tail(), + _ => {} + } +} + +fn handle_mouse( + mouse: MouseEvent, + terminal: &ratatui::DefaultTerminal, + config: &Config, + app: &Mutex, +) -> Result<()> { + let size = terminal.size()?; + let areas = ui_areas(Rect::new(0, 0, size.width, size.height)); + let pos = Position { + x: mouse.column, + y: mouse.row, + }; + + let Ok(mut a) = app.lock() else { + return Ok(()); + }; + let target = pane_at(areas, pos).unwrap_or(a.active_pane); + match mouse.kind { + MouseEventKind::Down(_) => match target { + ActivePane::Runs => a.focus_runs(), + ActivePane::Logs => a.focus_logs(), + }, + MouseEventKind::ScrollUp => match target { + ActivePane::Runs => { + a.focus_runs(); + a.prev_run(); + } + ActivePane::Logs => { + a.focus_logs(); + a.scroll_log_up(3); + } + }, + MouseEventKind::ScrollDown => match target { + ActivePane::Runs => { + a.focus_runs(); + a.next_run(); + } + ActivePane::Logs => { + a.focus_logs(); + a.scroll_log_down(3); + } + }, + MouseEventKind::ScrollLeft if target == ActivePane::Logs => { + a.focus_logs(); + let tabs = tab_count_for(config, &a, a.selected_run); + a.prev_tab(tabs); + } + MouseEventKind::ScrollRight if target == ActivePane::Logs => { + a.focus_logs(); + let tabs = tab_count_for(config, &a, a.selected_run); + a.next_tab(tabs); + } + _ => {} + } + Ok(()) +} + +fn pane_at(areas: UiAreas, pos: Position) -> Option { + if areas.list.contains(pos) { + Some(ActivePane::Runs) + } else if areas.detail.contains(pos) { + Some(ActivePane::Logs) + } else { + None + } +} + +fn tab_count_for(config: &Config, app: &App, run_idx: usize) -> usize { + let Some(state) = app.runs.get(run_idx) else { + return 1; + }; + if matches!(state, RunState::Pending) { + return 1; // run.log only (and it'll show "not 
started") + } + let run_dir = run_dir_for(config, run_idx); + 1 + enumerate_nodes(&run_dir).len() +} + +fn run_dir_for(config: &Config, run_idx: usize) -> PathBuf { + config + .work_dir + .join(format!("run-{:04}", run_idx.saturating_add(1))) +} + +#[derive(Clone, Copy)] +struct UiAreas { + header: Rect, + list: Rect, + detail: Rect, + footer: Rect, +} + +#[derive(Clone, Copy)] +struct DetailView { + selected_run: usize, + selected_tab: usize, + active: bool, + log_scroll: usize, +} + +fn ui_areas(area: Rect) -> UiAreas { + let [header, body, footer] = Layout::vertical([ + Constraint::Length(5), + Constraint::Min(8), + Constraint::Length(3), + ]) + .areas(area); + let [list, detail] = + Layout::horizontal([Constraint::Percentage(40), Constraint::Percentage(60)]).areas(body); + UiAreas { + header, + list, + detail, + footer, + } +} + +fn draw( + frame: &mut Frame, + config: &Config, + app: &Mutex, + table_state: &mut TableState, + completed: bool, +) { + // Take everything we need from app under a single short lock, then + // release it before doing file I/O. That keeps worker threads from + // stalling on the lock during draws. 
+ let (snapshot, counts, active_pane, selected_run, selected_tab, manual_select, log_scroll) = { + let mut a = match app.lock() { + Ok(a) => a, + Err(_) => return, + }; + a.auto_advance_selection(); + ( + a.runs.clone(), + a.counts(), + a.active_pane, + a.selected_run, + a.selected_tab, + a.manual_select, + a.log_scroll, + ) + }; + let now = Instant::now(); + + let areas = ui_areas(frame.area()); + + frame.render_widget(header(config), areas.header); + + render_run_list( + frame, + areas.list, + &snapshot, + selected_run, + active_pane == ActivePane::Runs, + now, + table_state, + ); + let final_log_scroll = render_detail( + frame, + areas.detail, + config, + &snapshot, + DetailView { + selected_run, + selected_tab, + active: active_pane == ActivePane::Logs, + log_scroll, + }, + ); + + // Clamp the stored scroll back to whatever the renderer ended up using + // (lines available, screen height, etc.) so a future user keystroke + // operates on the actual offset rather than usize::MAX/2. + if final_log_scroll != log_scroll + && let Ok(mut a) = app.lock() + { + a.log_scroll = final_log_scroll; + } + + frame.render_widget( + footer(counts, active_pane, manual_select, log_scroll, completed), + areas.footer, + ); +} + +fn header(config: &Config) -> Paragraph<'_> { + Paragraph::new(vec![ + Line::from(vec![ + Span::styled("DKG stress test", Style::new().add_modifier(Modifier::BOLD)), + Span::raw(format!( + " runs={} workers={} work_dir={}", + config.runs, + config.workers, + config.work_dir.display() + )), + ]), + Line::from(Span::styled( + "Tab/click=focus Β· wheel=scroll active pane Β· a=auto Β· q=quit", + Style::new().fg(Color::DarkGray), + )), + Line::from(Span::styled( + "runs: j/k/Pg/Home/End Β· logs: j/k/Pg/Ctrl-u/d scroll, h/l tabs, g/G top/tail", + Style::new().fg(Color::DarkGray), + )), + ]) + .block(Block::default().borders(Borders::ALL)) +} + +fn render_run_list( + frame: &mut Frame, + area: ratatui::layout::Rect, + snapshot: &[RunState], + selected: usize, + 
active: bool, + now: Instant, + table_state: &mut TableState, +) { + let rows: Vec = snapshot + .iter() + .enumerate() + .map(|(i, state)| run_row(i + 1, *state, now)) + .collect(); + + let widths = [ + Constraint::Length(10), + Constraint::Length(10), + Constraint::Length(7), + ]; + + let block = active_block(" runs ", active); + let table = Table::new(rows, widths) + .header( + Row::new(vec![ + Cell::from("run").style(Style::new().add_modifier(Modifier::BOLD)), + Cell::from("status").style(Style::new().add_modifier(Modifier::BOLD)), + Cell::from("time").style(Style::new().add_modifier(Modifier::BOLD)), + ]) + .bottom_margin(0), + ) + .block(block) + .column_spacing(2) + .row_highlight_style(Style::new().bg(Color::DarkGray)); + + table_state.select(Some(selected.min(snapshot.len().saturating_sub(1)))); + frame.render_stateful_widget(table, area, table_state); +} + +fn active_block(title: &'static str, active: bool) -> Block<'static> { + let block = Block::default().borders(Borders::ALL).title(title); + if active { + block.border_style(active_border_style()) + } else { + block + } +} + +fn active_border_style() -> Style { + Style::new().fg(Color::Cyan).add_modifier(Modifier::BOLD) +} + +/// Returns the clamped log_scroll value actually used for rendering, so +/// the caller can persist it back into App state. 
+fn render_detail( + frame: &mut Frame, + area: ratatui::layout::Rect, + config: &Config, + snapshot: &[RunState], + view: DetailView, +) -> usize { + let label = format!("run-{:04}", view.selected_run.saturating_add(1)); + let run_dir = run_dir_for(config, view.selected_run); + let state = snapshot + .get(view.selected_run) + .copied() + .unwrap_or(RunState::Pending); + + let nodes = enumerate_nodes(&run_dir); + let mut tab_titles: Vec = Vec::with_capacity(1 + nodes.len()); + tab_titles.push("run.log".into()); + for n in &nodes { + if let Some(name) = n.file_name().and_then(|s| s.to_str()) { + tab_titles.push(name.to_string()); + } + } + + let tab_count = tab_titles.len(); + let active_tab = view.selected_tab.min(tab_count.saturating_sub(1)); + + let scroll_suffix = if view.log_scroll == 0 { + String::new() + } else { + format!(" [+{} lines]", view.log_scroll) + }; + let block = Block::default().borders(Borders::ALL).title(format!( + " {} β€” {}{} ", + label, + status_short(state), + scroll_suffix + )); + let block = if view.active { + block.border_style(active_border_style()) + } else { + block + }; + let inner = block.inner(area); + frame.render_widget(block, area); + + let [tabs_area, content_area] = + Layout::vertical([Constraint::Length(2), Constraint::Min(1)]).areas(inner); + + let tabs = Tabs::new( + tab_titles + .iter() + .map(|t| Line::from(t.as_str())) + .collect::>(), + ) + .select(active_tab) + .style(Style::new().fg(Color::Gray)) + .highlight_style(Style::new().fg(Color::Cyan).add_modifier(Modifier::BOLD)) + .divider(" β”‚ "); + frame.render_widget(tabs, tabs_area); + + let log_path = if active_tab == 0 { + run_dir.join("run.log") + } else { + let n = active_tab.saturating_sub(1); + nodes + .get(n) + .cloned() + .unwrap_or_else(|| run_dir.clone()) + .join("node.log") + }; + + let (body, used_scroll) = log_body( + &log_path, + state, + content_area.width, + content_area.height, + view.log_scroll, + ); + frame.render_widget(body, content_area); + 
used_scroll +} + +/// Renders the log pane body. Returns the actual scroll offset used +/// (clamped to available content) so the caller can persist it. +fn log_body( + path: &std::path::Path, + state: RunState, + width: u16, + height: u16, + scroll: usize, +) -> (Paragraph<'static>, usize) { + if matches!(state, RunState::Pending) { + let p = Paragraph::new(Line::from(Span::styled( + "(run not started yet)", + Style::new().fg(Color::DarkGray), + ))); + return (p, 0); + } + let raw = match read_tail(path, LOG_TAIL_BYTES) { + Some(s) if !s.is_empty() => s, + _ => { + let msg = if path.exists() { + "(log file is empty)" + } else if matches!(state, RunState::Pass { .. }) { + "(log pruned β€” passed run with KEEP_PASSED off)" + } else { + "(log file not found)" + }; + let p = Paragraph::new(Line::from(Span::styled( + msg, + Style::new().fg(Color::DarkGray), + ))); + return (p, 0); + } + }; + + let window = height.max(1) as usize; + let text = match raw.into_text() { + Ok(text) => text, + Err(_) => Text::from(raw), + }; + + // `scroll` is stored as "lines back from the tail" so 0 keeps live + // output pinned to the bottom. Ratatui's paragraph scroll is top-based, + // after wrapping, so convert the tail-relative value at render time. 
+ let total = wrapped_height(&text, width); + let max_scroll = total.saturating_sub(window); + let used_scroll = scroll.min(max_scroll); + let top_offset = max_scroll.saturating_sub(used_scroll); + let top_offset = u16::try_from(top_offset).unwrap_or(u16::MAX); + + ( + Paragraph::new(text) + .wrap(Wrap { trim: false }) + .scroll((top_offset, 0)), + used_scroll, + ) +} + +fn wrapped_height(text: &Text<'_>, width: u16) -> usize { + let width = usize::from(width.max(1)); + text.lines + .iter() + .map(|line| { + let rows = line.width().saturating_add(width.saturating_sub(1)) / width; + rows.max(1) + }) + .sum() +} + +fn run_row(id: usize, state: RunState, now: Instant) -> Row<'static> { + let label = format!("run-{:04}", id); + let (status_span, time_text) = match state { + RunState::Pending => ( + Span::styled("pending", Style::new().fg(Color::DarkGray)), + String::new(), + ), + RunState::Running { started_at } => { + let elapsed = now.saturating_duration_since(started_at).as_secs(); + ( + Span::styled("running", Style::new().fg(Color::Yellow)), + format!("{:>4}s", elapsed), + ) + } + RunState::Pass { duration_s } => ( + Span::styled( + "PASS", + Style::new().fg(Color::Green).add_modifier(Modifier::BOLD), + ), + format!("{:>4}s", duration_s), + ), + RunState::Fail { duration_s } => ( + Span::styled( + "FAIL", + Style::new().fg(Color::Red).add_modifier(Modifier::BOLD), + ), + format!("{:>4}s", duration_s), + ), + }; + Row::new(vec![ + Cell::from(label), + Cell::from(Line::from(status_span)), + Cell::from(time_text), + ]) +} + +fn status_short(state: RunState) -> &'static str { + match state { + RunState::Pending => "pending", + RunState::Running { .. } => "running", + RunState::Pass { .. } => "PASS", + RunState::Fail { .. 
} => "FAIL", + } +} + +fn footer( + counts: Counts, + active_pane: ActivePane, + manual: bool, + log_scroll: usize, + completed: bool, +) -> Paragraph<'static> { + let pane = match active_pane { + ActivePane::Runs => Span::styled(" pane:runs", Style::new().fg(Color::Cyan)), + ActivePane::Logs => Span::styled(" pane:logs", Style::new().fg(Color::Cyan)), + }; + let follow = if manual { + Span::styled(" manual", Style::new().fg(Color::Magenta)) + } else { + Span::styled(" auto", Style::new().fg(Color::DarkGray)) + }; + let scroll_hint = if log_scroll == 0 { + Span::styled(" tail", Style::new().fg(Color::DarkGray)) + } else { + Span::styled( + format!(" log:+{log_scroll}"), + Style::new().fg(Color::Magenta), + ) + }; + let done_hint = if completed { + Span::styled(" done q=exit", Style::new().fg(Color::Green)) + } else { + Span::raw("") + }; + let line = Line::from(vec![ + Span::styled( + "PASS ", + Style::new().fg(Color::Green).add_modifier(Modifier::BOLD), + ), + Span::raw(format!("{}", counts.passed)), + Span::raw(" "), + Span::styled( + "FAIL ", + Style::new().fg(Color::Red).add_modifier(Modifier::BOLD), + ), + Span::raw(format!("{}", counts.failed)), + Span::raw(" "), + Span::styled("run ", Style::new().fg(Color::Yellow)), + Span::raw(format!("{}", counts.running)), + Span::raw(" "), + Span::styled("pend ", Style::new().fg(Color::DarkGray)), + Span::raw(format!("{}", counts.pending)), + Span::raw(format!(" {}/{}", counts.done(), counts.total())), + pane, + follow, + scroll_hint, + done_hint, + ]); + Paragraph::new(line).block(Block::default().borders(Borders::ALL).title(" summary ")) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn wrapped_height_counts_empty_lines() { + let text = Text::from("one\n\nthree"); + + assert_eq!(wrapped_height(&text, 80), 3); + } + + #[test] + fn wrapped_height_counts_wrapped_rows() { + let text = Text::from("1234567890\nabc"); + + assert_eq!(wrapped_height(&text, 4), 4); + } +} diff --git 
a/tools/dkg-stress/src/worker.rs b/tools/dkg-stress/src/worker.rs new file mode 100644 index 00000000..7f88bab1 --- /dev/null +++ b/tools/dkg-stress/src/worker.rs @@ -0,0 +1,246 @@ +use anyhow::Result; +use std::collections::HashSet; +use std::fs; +use std::process::{Command, Stdio}; +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread::{self, JoinHandle}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +use crate::config::Config; +use crate::state::{App, RunState}; + +/// Set of process-group IDs (== PIDs since we put each child in its own +/// group) for in-flight run.sh invocations. The UI thread uses this on +/// shutdown to SIGTERM the whole tree per ceremony. +pub type Killers = Arc<Mutex<HashSet<u32>>>; + +pub fn spawn_workers( + config: Arc<Config>, + app: Arc<Mutex<App>>, + stop: Arc<AtomicBool>, + killers: Killers, +) -> Vec<JoinHandle<()>> { + let counter = Arc::new(AtomicU32::new(1)); + (0..config.workers) + .map(|_| { + let config = config.clone(); + let app = app.clone(); + let stop = stop.clone(); + let killers = killers.clone(); + let counter = counter.clone(); + thread::spawn(move || worker_loop(config, app, stop, killers, counter)) + }) + .collect() +} + +fn worker_loop( + config: Arc<Config>, + app: Arc<Mutex<App>>, + stop: Arc<AtomicBool>, + killers: Killers, + counter: Arc<AtomicU32>, +) { + loop { + if stop.load(Ordering::Relaxed) { + return; + } + let id = counter.fetch_add(1, Ordering::Relaxed); + if id > config.runs { + return; + } + if let Err(err) = run_one(id, &config, &app, &killers) { + // Worker errors (spawn failures, fs errors, etc.) are recorded as + // failures via the App update inside run_one's error path; this + // arm only fires when even that bookkeeping failed. Print to + // stderr so it shows up after the TUI is restored. 
+ eprintln!("[run-{:04}] worker error: {:#}", id, err); + } + } +} + +fn run_one(id: u32, config: &Config, app: &Mutex<App>, killers: &Killers) -> Result<()> { + let label = format!("run-{:04}", id); + let run_dir = config.work_dir.join(&label); + let _ = fs::remove_dir_all(&run_dir); + fs::create_dir_all(&run_dir)?; + + let log_path = run_dir.join("run.log"); + let log_file = fs::File::create(&log_path)?; + let log_clone = log_file.try_clone()?; + + let started = Instant::now(); + let started_unix = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + let started_iso = format_iso_utc(started_unix); + + set_state(app, id, RunState::Running { started_at: started }); + + let mut cmd = Command::new(&config.run_script); + cmd.env("WORK_DIR", &run_dir) + .env("CI", &config.worker_ci) + .stdout(Stdio::from(log_file)) + .stderr(Stdio::from(log_clone)) + .stdin(Stdio::null()); + + #[cfg(unix)] + { + use std::os::unix::process::CommandExt; + // Make the child a process-group leader so we can SIGTERM the whole + // tree (run.sh + its node children) with kill(-pgid, SIGTERM). 
+ cmd.process_group(0); + } + + let mut child = match cmd.spawn() { + Ok(c) => c, + Err(e) => { + set_state(app, id, RunState::Fail { duration_s: 0 }); + config.append_summary_line(&label, "fail", 0, &started_iso, &run_dir)?; + return Err(e.into()); + } + }; + + let pid = child.id(); + insert_killer(killers, pid); + + let wait_result = child.wait(); + remove_killer(killers, pid); + + let duration_s = started.elapsed().as_secs(); + let pass = wait_result.map(|s| s.success()).unwrap_or(false); + + let final_state = if pass { + RunState::Pass { duration_s } + } else { + RunState::Fail { duration_s } + }; + set_state(app, id, final_state); + + let status_str = if pass { "pass" } else { "fail" }; + config.append_summary_line(&label, status_str, duration_s, &started_iso, &run_dir)?; + + if pass && !config.keep_passed { + prune_node_dirs(&run_dir); + } + + Ok(()) +} + +fn set_state(app: &Mutex<App>, id: u32, state: RunState) { + if let Ok(mut a) = app.lock() { + let idx = (id as usize).saturating_sub(1); + if let Some(slot) = a.runs.get_mut(idx) { + *slot = state; + } + } +} + +fn insert_killer(killers: &Killers, pid: u32) { + if let Ok(mut k) = killers.lock() { + k.insert(pid); + } +} + +fn remove_killer(killers: &Killers, pid: u32) { + if let Ok(mut k) = killers.lock() { + k.remove(&pid); + } +} + +/// Drop node-*/ subdirectories of a passed run to keep disk usage bounded. +/// run.log and the cluster-lock outputs are kept for verification. +fn prune_node_dirs(run_dir: &std::path::Path) { + let Ok(entries) = fs::read_dir(run_dir) else { + return; + }; + for entry in entries.flatten() { + let name = entry.file_name(); + if name.to_string_lossy().starts_with("node-") { + let _ = fs::remove_dir_all(entry.path()); + } + } +} + +/// Send SIGTERM to every registered process group, then SIGKILL stragglers +/// after a short grace period. 
+pub fn kill_all(killers: &Killers, grace: Duration) { + let pids: Vec<u32> = killers.lock().map(|k| k.iter().copied().collect()).unwrap_or_default(); + if pids.is_empty() { + return; + } + for pid in &pids { + send_signal(*pid, libc::SIGTERM); + } + let deadline = Instant::now() + grace; + while Instant::now() < deadline { + let remaining = killers.lock().map(|k| k.len()).unwrap_or(0); + if remaining == 0 { + return; + } + thread::sleep(Duration::from_millis(100)); + } + let remaining: Vec<u32> = killers.lock().map(|k| k.iter().copied().collect()).unwrap_or_default(); + for pid in remaining { + send_signal(pid, libc::SIGKILL); + } +} + +#[cfg(unix)] +fn send_signal(pid: u32, sig: libc::c_int) { + // Negate the PID to address the whole process group. Each child was + // spawned with process_group(0), making it the group leader (so PID == + // PGID). Negative values to libc::kill mean "every process in this + // group". This is the kernel's standard mechanism for taking down a + // shell-launched subtree (run.sh + the four DKG nodes it forked). + // + // Cast safety: PIDs fit in i32 on every Unix we target. + let signed: i32 = pid.try_into().unwrap_or(0); + if signed > 0 { + // SAFETY: kill is a pure libc syscall with no aliasing or memory + // requirements; we pass a valid signal number. Out-of-range signed + // we already filtered above. Errors (ESRCH if the process is gone) + // are acceptable and ignored. + unsafe { + libc::kill(-signed, sig); + } + } +} + +#[cfg(not(unix))] +fn send_signal(_pid: u32, _sig: libc::c_int) { + // No-op on non-Unix; the tool only targets Unix anyway (run.sh is bash). +} + +fn format_iso_utc(unix_secs: u64) -> String { + // RFC3339 / ISO-8601 in UTC without external chrono dep. + // Range covers years 1970..9999 which is plenty for log timestamps. 
+ let secs = unix_secs as i64; + let days = secs.div_euclid(86_400); + let time = secs.rem_euclid(86_400); + let h = (time / 3600) as u32; + let m = ((time % 3600) / 60) as u32; + let s = (time % 60) as u32; + let (year, month, day) = days_to_ymd(days); + format!( + "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z", + year, month, day, h, m, s + ) +} + +/// Convert days since 1970-01-01 to (year, month, day) using the proleptic +/// Gregorian calendar (Howard Hinnant's algorithm). +fn days_to_ymd(days: i64) -> (i32, u32, u32) { + let z = days.saturating_add(719_468); + let era = if z >= 0 { z } else { z - 146_096 } / 146_097; + let doe = (z - era * 146_097) as u64; // [0, 146096] + let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146_096) / 365; // [0, 399] + let y = yoe as i64 + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // [0, 365] + let mp = (5 * doy + 2) / 153; // [0, 11] + let d = (doy - (153 * mp + 2) / 5 + 1) as u32; + let m = if mp < 10 { (mp + 3) as u32 } else { (mp - 9) as u32 }; + let year = if m <= 2 { y + 1 } else { y }; + (year as i32, m, d) +}