fix: keep VM paused during MMDS injection and resume afterwards so the config is applied

- Load snapshot with ResumeVM: false so MMDS data can be written while VM is paused
- Call ResumeVM explicitly after configureMmds succeeds
- Skip PUT /mmds/config on restored VMs (Firecracker rejects it with 400)
- Strip JSON quotes from MMDS values with tr -d '"' in net-init script
- Add a 169.254.169.2/32 link-local addr to eth0 before polling MMDS, and flush eth0 just before applying the new IP

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 15:11:14 +00:00
parent 5e23e0ab4e
commit bfc1f47287
4 changed files with 22 additions and 9 deletions

View File

@@ -118,7 +118,7 @@ func RunConsoleProxy(cfg Config, id int, tapName string) error {
}) })
} }
// --- Start VM (blocks until snapshot is loaded and VM is running) --- // --- Start VM (blocks until snapshot is loaded and VM is PAUSED) ---
start := time.Now() start := time.Now()
logger.Infof("restoring clone %d from snapshot ...", id) logger.Infof("restoring clone %d from snapshot ...", id)
if err := m.Start(ctx); err != nil { if err := m.Start(ctx); err != nil {
@@ -126,10 +126,10 @@ func RunConsoleProxy(cfg Config, id int, tapName string) error {
ptm.Close() ptm.Close()
return fmt.Errorf("restore clone %d: %w", id, err) return fmt.Errorf("restore clone %d: %w", id, err)
} }
elapsed := time.Since(start)
// Inject per-clone IP config via MMDS so the fc-net-init guest daemon // Inject per-clone IP config via MMDS so the fc-net-init guest daemon
// can configure eth0 without any manual steps inside the VM. // can configure eth0 without any manual steps inside the VM.
// This must happen while the VM is PAUSED (ResumeVM: false in snapshot load).
if cfg.AutoNetConfig && cfg.Bridge != "none" { if cfg.AutoNetConfig && cfg.Bridge != "none" {
guestIP := fmt.Sprintf("%s.%d/24", cfg.GuestPrefix, 10+id) guestIP := fmt.Sprintf("%s.%d/24", cfg.GuestPrefix, 10+id)
if err := configureMmds(ctx, sockPath, guestIP, cfg.GuestGW, "1.1.1.1"); err != nil { if err := configureMmds(ctx, sockPath, guestIP, cfg.GuestGW, "1.1.1.1"); err != nil {
@@ -139,6 +139,14 @@ func RunConsoleProxy(cfg Config, id int, tapName string) error {
} }
} }
// Now RESUME the VM to start execution!
if err := m.ResumeVM(ctx); err != nil {
pts.Close()
ptm.Close()
return fmt.Errorf("resume clone %d: %w", id, err)
}
elapsed := time.Since(start)
// Release our copy of the slave — firecracker holds its own fd now. // Release our copy of the slave — firecracker holds its own fd now.
// Closing here ensures we get EOF on ptm when firecracker exits. // Closing here ensures we get EOF on ptm when firecracker exits.
pts.Close() pts.Close()

View File

@@ -110,9 +110,12 @@ func configureMmds(ctx context.Context, sockPath, ip, gw, dns string) error {
return nil return nil
} }
// Store the network config the guest daemon will poll for. // 1. MMDS configuration (version, network_interfaces binding, etc.) is
// PUT /mmds/config (interface association) was already handled by the SDK // persisted in the golden snapshot, so we don't need to configure it here.
// via AllowMMDS: true on the NetworkInterface before the VM started. // In fact, Firecracker will reject PUT /mmds/config with a 400 error
// on a restored VM, which previously caused this function to abort early.
// 2. Store the network config the guest daemon will poll for.
return doJSON(http.MethodPut, "/mmds", map[string]string{ return doJSON(http.MethodPut, "/mmds", map[string]string{
"ip": ip, "ip": ip,
"gw": gw, "gw": gw,

View File

@@ -121,11 +121,13 @@ func (o *Orchestrator) buildRootfs() error {
netInitScript := `#!/bin/sh netInitScript := `#!/bin/sh
# Poll Firecracker MMDS for network config, apply it, then exit. # Poll Firecracker MMDS for network config, apply it, then exit.
# Runs in background; loops until MMDS responds (survives snapshot resume). # Runs in background; loops until MMDS responds (survives snapshot resume).
ip addr add 169.254.169.2/32 dev eth0 2>/dev/null
while true; do while true; do
ip=$(wget -q -T1 -O- http://169.254.169.254/ip 2>/dev/null) ip=$(wget -q -T1 -O- http://169.254.169.254/ip 2>/dev/null | tr -d '"')
[ -n "$ip" ] || { sleep 1; continue; } [ -n "$ip" ] || { sleep 1; continue; }
gw=$(wget -q -T1 -O- http://169.254.169.254/gw 2>/dev/null) gw=$(wget -q -T1 -O- http://169.254.169.254/gw 2>/dev/null | tr -d '"')
dns=$(wget -q -T1 -O- http://169.254.169.254/dns 2>/dev/null) dns=$(wget -q -T1 -O- http://169.254.169.254/dns 2>/dev/null | tr -d '"')
ip addr flush dev eth0 2>/dev/null
ip addr add "$ip" dev eth0 2>/dev/null ip addr add "$ip" dev eth0 2>/dev/null
ip route add default via "$gw" dev eth0 2>/dev/null ip route add default via "$gw" dev eth0 2>/dev/null
printf "nameserver %s\n" "$dns" > /etc/resolv.conf printf "nameserver %s\n" "$dns" > /etc/resolv.conf

View File

@@ -30,7 +30,7 @@ func loadSnapshotWithNetworkOverride(ctx context.Context, sockPath, memPath, vms
payload := snapshotLoadRequest{ payload := snapshotLoadRequest{
MemFilePath: memPath, MemFilePath: memPath,
SnapshotPath: vmstatePath, SnapshotPath: vmstatePath,
ResumeVM: true, ResumeVM: false, // Changed: We pause here so MMDS can be configured BEFORE Resume.
NetworkOverrides: []networkOverride{ NetworkOverrides: []networkOverride{
{IfaceID: "1", HostDevName: tapName}, {IfaceID: "1", HostDevName: tapName},
}, },