fix: pause VM before MMDS injection, resume after to ensure config is applied
- Load snapshot with ResumeVM: false so MMDS data can be written while the VM is paused
- Call ResumeVM explicitly after configureMmds succeeds
- Skip PUT /mmds/config on restored VMs (Firecracker rejects it with 400)
- Strip JSON quotes from MMDS values with tr -d '"' in the net-init script
- Add 169.254.169.2/32 link-local addr and flush eth0 before applying the new IP

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -118,7 +118,7 @@ func RunConsoleProxy(cfg Config, id int, tapName string) error {
|
||||
})
|
||||
}
|
||||
|
||||
// --- Start VM (blocks until snapshot is loaded and VM is running) ---
|
||||
// --- Start VM (blocks until snapshot is loaded and VM is PAUSED) ---
|
||||
start := time.Now()
|
||||
logger.Infof("restoring clone %d from snapshot ...", id)
|
||||
if err := m.Start(ctx); err != nil {
|
||||
@@ -126,10 +126,10 @@ func RunConsoleProxy(cfg Config, id int, tapName string) error {
|
||||
ptm.Close()
|
||||
return fmt.Errorf("restore clone %d: %w", id, err)
|
||||
}
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// Inject per-clone IP config via MMDS so the fc-net-init guest daemon
|
||||
// can configure eth0 without any manual steps inside the VM.
|
||||
// This must happen while the VM is PAUSED (ResumeVM: false in snapshot load).
|
||||
if cfg.AutoNetConfig && cfg.Bridge != "none" {
|
||||
guestIP := fmt.Sprintf("%s.%d/24", cfg.GuestPrefix, 10+id)
|
||||
if err := configureMmds(ctx, sockPath, guestIP, cfg.GuestGW, "1.1.1.1"); err != nil {
|
||||
@@ -139,6 +139,14 @@ func RunConsoleProxy(cfg Config, id int, tapName string) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Now RESUME the VM to start execution!
|
||||
if err := m.ResumeVM(ctx); err != nil {
|
||||
pts.Close()
|
||||
ptm.Close()
|
||||
return fmt.Errorf("resume clone %d: %w", id, err)
|
||||
}
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// Release our copy of the slave — firecracker holds its own fd now.
|
||||
// Closing here ensures we get EOF on ptm when firecracker exits.
|
||||
pts.Close()
|
||||
|
||||
@@ -110,9 +110,12 @@ func configureMmds(ctx context.Context, sockPath, ip, gw, dns string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Store the network config the guest daemon will poll for.
|
||||
// PUT /mmds/config (interface association) was already handled by the SDK
|
||||
// via AllowMMDS: true on the NetworkInterface before the VM started.
|
||||
// 1. MMDS configuration (version, network_interfaces binding, etc.) is
|
||||
// persisted in the golden snapshot, so we don't need to configure it here.
|
||||
// In fact, Firecracker will reject PUT /mmds/config with a 400 error
|
||||
// on a restored VM, which previously caused this function to abort early.
|
||||
|
||||
// 2. Store the network config the guest daemon will poll for.
|
||||
return doJSON(http.MethodPut, "/mmds", map[string]string{
|
||||
"ip": ip,
|
||||
"gw": gw,
|
||||
|
||||
@@ -121,11 +121,13 @@ func (o *Orchestrator) buildRootfs() error {
|
||||
netInitScript := `#!/bin/sh
|
||||
# Poll Firecracker MMDS for network config, apply it, then exit.
|
||||
# Runs in background; loops until MMDS responds (survives snapshot resume).
|
||||
ip addr add 169.254.169.2/32 dev eth0 2>/dev/null
|
||||
while true; do
|
||||
ip=$(wget -q -T1 -O- http://169.254.169.254/ip 2>/dev/null)
|
||||
ip=$(wget -q -T1 -O- http://169.254.169.254/ip 2>/dev/null | tr -d '"')
|
||||
[ -n "$ip" ] || { sleep 1; continue; }
|
||||
gw=$(wget -q -T1 -O- http://169.254.169.254/gw 2>/dev/null)
|
||||
dns=$(wget -q -T1 -O- http://169.254.169.254/dns 2>/dev/null)
|
||||
gw=$(wget -q -T1 -O- http://169.254.169.254/gw 2>/dev/null | tr -d '"')
|
||||
dns=$(wget -q -T1 -O- http://169.254.169.254/dns 2>/dev/null | tr -d '"')
|
||||
ip addr flush dev eth0 2>/dev/null
|
||||
ip addr add "$ip" dev eth0 2>/dev/null
|
||||
ip route add default via "$gw" dev eth0 2>/dev/null
|
||||
printf "nameserver %s\n" "$dns" > /etc/resolv.conf
|
||||
|
||||
@@ -30,7 +30,7 @@ func loadSnapshotWithNetworkOverride(ctx context.Context, sockPath, memPath, vms
|
||||
payload := snapshotLoadRequest{
|
||||
MemFilePath: memPath,
|
||||
SnapshotPath: vmstatePath,
|
||||
ResumeVM: true,
|
||||
ResumeVM: false, // Changed: We pause here so MMDS can be configured BEFORE Resume.
|
||||
NetworkOverrides: []networkOverride{
|
||||
{IfaceID: "1", HostDevName: tapName},
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user