From bffc6fcc3ea333d08fb20f415a52ae90de1d131e Mon Sep 17 00:00:00 2001
From: ejc3
Date: Mon, 23 Feb 2026 04:14:34 +0000
Subject: [PATCH 1/2] fix: upgrade patch_mmds to hard error after snapshot creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The restore-epoch MMDS update in create_snapshot_core() was best-effort
(if let Err → warn). This was dangerous because the entire exec rebind
chain depends on it: epoch watcher → handle_clone_restore() → exec
re_register → output reconnect. If patch_mmds failed silently, the exec
server's AsyncFd epoll stayed stale after vsock transport reset, causing
health checks to hang for ~60s.

The clone path already uses hard error (put_mmds().context()?). The
baseline pre-start snapshot path now matches.

Found by code review on PR #468.

Tested: make test-root FILTER=localhost_rootless_btrfs_snapshot_restore
- First run: snapshot created, patch_mmds succeeds (hard error path)
- Second run: restored from snapshot, exec stress 10/10 in 14ms
---
 src/commands/common.rs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/commands/common.rs b/src/commands/common.rs
index fac68d86..1dc30deb 100644
--- a/src/commands/common.rs
+++ b/src/commands/common.rs
@@ -1436,21 +1436,22 @@ pub async fn create_snapshot_core(
 
     // Firecracker resets all vsock connections during snapshot creation
     // (VIRTIO_VSOCK_EVENT_TRANSPORT_RESET). Bump restore-epoch in MMDS so fc-agent's
-    // background watcher detects this and remounts FUSE volumes.
+    // background watcher detects this and triggers handle_clone_restore() which
+    // re-registers the exec server's AsyncFd (stale after transport reset) and
+    // reconnects the output vsock. This MUST succeed — if the epoch isn't bumped,
+    // the exec server stays stale and health checks hang for ~60s.
     let restore_epoch = std::time::SystemTime::now()
         .duration_since(std::time::UNIX_EPOCH)
         .map(|d| d.as_secs())
         .unwrap_or(0);
-    if let Err(e) = client
+    client
         .patch_mmds(serde_json::json!({
             "latest": {
                 "restore-epoch": restore_epoch.to_string()
             }
         }))
         .await
-    {
-        warn!(error = %e, "failed to bump restore-epoch after snapshot (FUSE remount may be delayed)");
-    }
+        .context("bumping restore-epoch in MMDS after snapshot")?;
 
     if has_base {
         // Diff snapshot: copy base to temp, merge diff onto it, then atomic rename

From 2134b7fd39cee8b3c18d633610781367fb69c7e1 Mon Sep 17 00:00:00 2001
From: ejc3
Date: Mon, 23 Feb 2026 04:18:55 +0000
Subject: [PATCH 2/2] fix: clean up temp snapshot dir on MMDS epoch bump failure

The hard-error patch_mmds path returns early from create_snapshot_core()
but didn't remove temp_snapshot_dir first. This leaves a large .creating
directory (memory snapshot files) on disk, which can cause ENOSPC.

Match the cleanup pattern used by all other early-return error paths in
this function.

Found by code review on PR #470.
---
 src/commands/common.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/commands/common.rs b/src/commands/common.rs
index 1dc30deb..a95ca969 100644
--- a/src/commands/common.rs
+++ b/src/commands/common.rs
@@ -1444,14 +1444,17 @@ pub async fn create_snapshot_core(
         .duration_since(std::time::UNIX_EPOCH)
         .map(|d| d.as_secs())
         .unwrap_or(0);
-    client
+    if let Err(e) = client
         .patch_mmds(serde_json::json!({
             "latest": {
                 "restore-epoch": restore_epoch.to_string()
             }
         }))
         .await
-        .context("bumping restore-epoch in MMDS after snapshot")?;
+    {
+        let _ = tokio::fs::remove_dir_all(&temp_snapshot_dir).await;
+        return Err(e).context("bumping restore-epoch in MMDS after snapshot");
+    }
 
     if has_base {
         // Diff snapshot: copy base to temp, merge diff onto it, then atomic rename