From 04067f7e6b70805d1a67ee51cd9c0b62195203ef Mon Sep 17 00:00:00 2001 From: Honza Novak Date: Mon, 13 Apr 2026 09:07:35 +0000 Subject: [PATCH] feat: add structured logging, --dev flag, and snapshot network_overrides - Add --dev flag to main that enables logrus caller info (file:line) for easier debugging without changing production log output - Wire firecracker SDK logger (WithLogger) and FIFO log file to both the golden VM and each clone machine so Firecracker's own logs are surfaced - Log the exact shell commands being run (cp --reflink, ip tuntap, ip link, firecracker binary) at Info level before each syscall/exec, making it straightforward to reproduce steps manually - Extract snapshot.go with loadSnapshotWithNetworkOverride: a direct PUT /snapshot/load call over the Unix socket that includes network_overrides, remapping the stored tap to the per-clone tap name (Firecracker v1.15+ feature not yet exposed by SDK v1.0.0) - Use firecracker.WithSnapshot + a Handlers.FcInit.Swap to replace the SDK's LoadSnapshotHandler with the above when Bridge != "none" Co-Authored-By: Claude Sonnet 4.6 --- main.go | 30 ++++++++++++++- orchestrator/orchestrator.go | 49 +++++++++++++++++++++---- orchestrator/snapshot.go | 71 ++++++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+), 9 deletions(-) create mode 100644 orchestrator/snapshot.go diff --git a/main.go b/main.go index 0a1bc2a..41a08ad 100644 --- a/main.go +++ b/main.go @@ -18,11 +18,36 @@ package main import ( "fmt" "os" + "path/filepath" + "runtime" + + log "github.com/sirupsen/logrus" "github.com/kacerr/fc-orchestrator/orchestrator" ) func main() { + // strip --dev flag before subcommand routing + dev := false + filtered := os.Args[:1] + for _, a := range os.Args[1:] { + if a == "--dev" { + dev = true + } else { + filtered = append(filtered, a) + } + } + os.Args = filtered + + if dev { + log.SetReportCaller(true) + log.SetFormatter(&log.TextFormatter{ + CallerPrettyfier: func(f *runtime.Frame) (string, string) { + return "", fmt.Sprintf("%s:%d", filepath.Base(f.File), f.Line) + }, + }) + } + // figure out if we are running as root if os.Geteuid() == 0 { fmt.Println("Running with root/sudo privileges!") @@ -61,7 +86,10 @@ func main() { } func usage() { - fmt.Fprintf(os.Stderr, `Usage: %s [args] + fmt.Fprintf(os.Stderr, `Usage: %s [--dev] [args] + +Flags: + --dev log format with source file:line (e.g. file="orchestrator.go:123") Commands: init Download kernel + create Alpine rootfs diff --git a/orchestrator/orchestrator.go b/orchestrator/orchestrator.go index de746c5..42c650f 100644 --- a/orchestrator/orchestrator.go +++ b/orchestrator/orchestrator.go @@ -203,6 +203,9 @@ func (o *Orchestrator) Golden() error { }, }, NetworkInterfaces: netIfaces, + LogPath: sockPath + ".log", + LogLevel: "Debug", + FifoLogWriter: o.log.Writer(), } ctx, cancel := context.WithCancel(context.Background()) @@ -218,7 +221,10 @@ func (o *Orchestrator) Golden() error { WithSocketPath(sockPath). Build(ctx) - m, err := firecracker.NewMachine(ctx, fcCfg, firecracker.WithProcessRunner(cmd)) + m, err := firecracker.NewMachine(ctx, fcCfg, + firecracker.WithProcessRunner(cmd), + firecracker.WithLogger(o.log), + ) if err != nil { return fmt.Errorf("new machine: %w", err) } @@ -308,6 +314,7 @@ func (o *Orchestrator) spawnOne(id int) error { // --- COW rootfs --- cloneRootfs := filepath.Join(cloneDir, "rootfs.ext4") + o.log.Infof("clone %d: running: cp --reflink=always %s %s", id, filepath.Join(goldenDir, "rootfs.ext4"), cloneRootfs) if err := reflinkCopy(filepath.Join(goldenDir, "rootfs.ext4"), cloneRootfs); err != nil { return fmt.Errorf("copy rootfs: %w", err) } @@ -326,6 +333,9 @@ func (o *Orchestrator) spawnOne(id int) error { tapName := fmt.Sprintf("fctap%d", id) var netIfaces firecracker.NetworkInterfaces if o.cfg.Bridge != "none" { + o.log.Infof("clone %d: running: ip tuntap add dev %s mode tap", id, tapName) + o.log.Infof("clone %d: running: ip link set %s up", id, tapName) + o.log.Infof("clone %d: running: ip link set %s master %s", id, tapName, o.cfg.Bridge) if err := o.createTap(tapName); err != nil { return err } @@ -354,6 +364,8 @@ func (o *Orchestrator) spawnOne(id int) error { WithSocketPath(sockPath). Build(ctx) + o.log.Infof("clone %d: running: %s", id, strings.Join(cmd.Args, " ")) + vcpus := o.cfg.VCPUs mem := o.cfg.MemMiB @@ -364,20 +376,41 @@ func (o *Orchestrator) spawnOne(id int) error { MemSizeMib: &mem, }, NetworkInterfaces: netIfaces, - // Snapshot config: tells the SDK to restore instead of fresh boot. - Snapshot: firecracker.SnapshotConfig{ - MemFilePath: sharedMem, - SnapshotPath: cloneVmstate, - ResumeVM: true, - }, + LogPath: sockPath + ".log", + LogLevel: "Debug", + FifoLogWriter: o.log.Writer(), } - m, err := firecracker.NewMachine(ctx, fcCfg, firecracker.WithProcessRunner(cmd)) + m, err := firecracker.NewMachine(ctx, fcCfg, + firecracker.WithProcessRunner(cmd), + firecracker.WithLogger(o.log), + // WithSnapshot replaces the default handler set with snapshot-specific + // handlers: skips validate.Cfg (no KernelImagePath needed) and uses + // LoadSnapshotHandler instead of CreateBootSourceHandler. + firecracker.WithSnapshot(sharedMem, cloneVmstate, func(sc *firecracker.SnapshotConfig) { + sc.ResumeVM = true + }), + ) if err != nil { cancel() return fmt.Errorf("new machine: %w", err) } + // Firecracker v1.15+ supports network_overrides in PUT /snapshot/load to + // remap the tap backend stored in the snapshot. The SDK v1.0.0 doesn't + // expose this field, so we replace the SDK's LoadSnapshotHandler with a + // direct HTTP call that includes the per-clone tap name. + if o.cfg.Bridge != "none" { + m.Handlers.FcInit = m.Handlers.FcInit.Swap(firecracker.Handler{ + Name: firecracker.LoadSnapshotHandlerName, + Fn: func(ctx context.Context, m *firecracker.Machine) error { + return loadSnapshotWithNetworkOverride( + ctx, sockPath, sharedMem, cloneVmstate, tapName, + ) + }, + }) + } + start := time.Now() if err := m.Start(ctx); err != nil { cancel() diff --git a/orchestrator/snapshot.go b/orchestrator/snapshot.go new file mode 100644 index 0000000..f851b82 --- /dev/null +++ b/orchestrator/snapshot.go @@ -0,0 +1,71 @@ +package orchestrator + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net" + "net/http" +) + +type networkOverride struct { + IfaceID string `json:"iface_id"` + HostDevName string `json:"host_dev_name"` +} + +type snapshotLoadRequest struct { + MemFilePath string `json:"mem_file_path"` + SnapshotPath string `json:"snapshot_path"` + ResumeVM bool `json:"resume_vm,omitempty"` + NetworkOverrides []networkOverride `json:"network_overrides,omitempty"` +} + +// loadSnapshotWithNetworkOverride calls PUT /snapshot/load on the Firecracker +// Unix socket, remapping the first network interface to tapName. +// This bypasses the SDK's LoadSnapshotHandler which doesn't expose +// network_overrides (added in Firecracker v1.15, SDK v1.0.0 omits it). +func loadSnapshotWithNetworkOverride(ctx context.Context, sockPath, memPath, vmstatePath, tapName string) error { + payload := snapshotLoadRequest{ + MemFilePath: memPath, + SnapshotPath: vmstatePath, + ResumeVM: true, + NetworkOverrides: []networkOverride{ + {IfaceID: "1", HostDevName: tapName}, + }, + } + + data, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("marshal snapshot load params: %w", err) + } + + httpClient := &http.Client{ + Transport: &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + return net.Dial("unix", sockPath) + }, + }, + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPut, + "http://localhost/snapshot/load", bytes.NewReader(data)) + if err != nil { + return fmt.Errorf("build snapshot load request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := httpClient.Do(req) + if err != nil { + return fmt.Errorf("snapshot load request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusNoContent { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("snapshot load failed (%d): %s", resp.StatusCode, body) + } + return nil +}