diff --git a/src/commands/common.rs b/src/commands/common.rs index fac68d86..a95ca969 100644 --- a/src/commands/common.rs +++ b/src/commands/common.rs @@ -1436,7 +1436,10 @@ pub async fn create_snapshot_core( // Firecracker resets all vsock connections during snapshot creation // (VIRTIO_VSOCK_EVENT_TRANSPORT_RESET). Bump restore-epoch in MMDS so fc-agent's - // background watcher detects this and remounts FUSE volumes. + // background watcher detects this and triggers handle_clone_restore() which + // re-registers the exec server's AsyncFd (stale after transport reset) and + // reconnects the output vsock. This MUST succeed — if the epoch isn't bumped, + // the exec server stays stale and health checks hang for ~60s. let restore_epoch = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .map(|d| d.as_secs()) @@ -1449,7 +1452,8 @@ pub async fn create_snapshot_core( })) .await { - warn!(error = %e, "failed to bump restore-epoch after snapshot (FUSE remount may be delayed)"); + let _ = tokio::fs::remove_dir_all(&temp_snapshot_dir).await; + return Err(e).context("bumping restore-epoch in MMDS after snapshot"); } if has_base {