From bffc6fcc3ea333d08fb20f415a52ae90de1d131e Mon Sep 17 00:00:00 2001
From: ejc3 <ejc3@users.noreply.github.com>
Date: Mon, 23 Feb 2026 04:14:34 +0000
Subject: [PATCH 1/2] fix: upgrade patch_mmds to hard error after snapshot
 creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The restore-epoch MMDS update in create_snapshot_core() was best-effort
(if let Err → warn). This was dangerous because the entire exec rebind
chain depends on it: epoch watcher → handle_clone_restore() → exec
re_register → output reconnect. If patch_mmds failed silently, the
exec server's AsyncFd epoll stayed stale after vsock transport reset,
causing health checks to hang for ~60s.

The clone path already treats this failure as a hard error
(put_mmds().context()?). The baseline pre-start snapshot path now matches.

Found by code review on PR #468.

Tested: make test-root FILTER=localhost_rootless_btrfs_snapshot_restore
  - First run: snapshot created, patch_mmds succeeds (hard error path)
  - Second run: restored from snapshot, exec stress 10/10 in 14ms
---
 src/commands/common.rs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/commands/common.rs b/src/commands/common.rs
index fac68d86..1dc30deb 100644
--- a/src/commands/common.rs
+++ b/src/commands/common.rs
@@ -1436,21 +1436,22 @@ pub async fn create_snapshot_core(
 
     // Firecracker resets all vsock connections during snapshot creation
     // (VIRTIO_VSOCK_EVENT_TRANSPORT_RESET). Bump restore-epoch in MMDS so fc-agent's
-    // background watcher detects this and remounts FUSE volumes.
+    // background watcher detects this and triggers handle_clone_restore() which
+    // re-registers the exec server's AsyncFd (stale after transport reset) and
+    // reconnects the output vsock. This MUST succeed — if the epoch isn't bumped,
+    // the exec server stays stale and health checks hang for ~60s.
     let restore_epoch = std::time::SystemTime::now()
         .duration_since(std::time::UNIX_EPOCH)
         .map(|d| d.as_secs())
         .unwrap_or(0);
-    if let Err(e) = client
+    client
         .patch_mmds(serde_json::json!({
             "latest": {
                 "restore-epoch": restore_epoch.to_string()
             }
         }))
         .await
-    {
-        warn!(error = %e, "failed to bump restore-epoch after snapshot (FUSE remount may be delayed)");
-    }
+        .context("bumping restore-epoch in MMDS after snapshot")?;
 
     if has_base {
         // Diff snapshot: copy base to temp, merge diff onto it, then atomic rename

From 2134b7fd39cee8b3c18d633610781367fb69c7e1 Mon Sep 17 00:00:00 2001
From: ejc3 <ejc3@users.noreply.github.com>
Date: Mon, 23 Feb 2026 04:18:55 +0000
Subject: [PATCH 2/2] fix: clean up temp snapshot dir on MMDS epoch bump
 failure

The hard-error patch_mmds path returned early from
create_snapshot_core() without removing temp_snapshot_dir first. This
left a large .creating directory (memory snapshot files) on disk, which
can cause ENOSPC.

Match the cleanup pattern used by all other early-return error paths in
this function.

Found by code review on PR #470.
---
 src/commands/common.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/commands/common.rs b/src/commands/common.rs
index 1dc30deb..a95ca969 100644
--- a/src/commands/common.rs
+++ b/src/commands/common.rs
@@ -1444,14 +1444,17 @@ pub async fn create_snapshot_core(
         .duration_since(std::time::UNIX_EPOCH)
         .map(|d| d.as_secs())
         .unwrap_or(0);
-    client
+    if let Err(e) = client
         .patch_mmds(serde_json::json!({
             "latest": {
                 "restore-epoch": restore_epoch.to_string()
             }
         }))
         .await
-        .context("bumping restore-epoch in MMDS after snapshot")?;
+    {
+        let _ = tokio::fs::remove_dir_all(&temp_snapshot_dir).await;
+        return Err(e).context("bumping restore-epoch in MMDS after snapshot");
+    }
 
     if has_base {
         // Diff snapshot: copy base to temp, merge diff onto it, then atomic rename