feat: add structured logging, --dev flag, and snapshot network_overrides

- Add --dev flag to main that enables logrus caller info (file:line) for
  easier debugging without changing production log output
- Wire firecracker SDK logger (WithLogger) and FIFO log file to both the
  golden VM and each clone machine so Firecracker's own logs are surfaced
- Log the exact shell commands being run (cp --reflink, ip tuntap, ip link,
  firecracker binary) at Info level before each syscall/exec, making it
  straightforward to reproduce steps manually
- Extract snapshot.go with loadSnapshotWithNetworkOverride: a direct PUT
  /snapshot/load call over the Unix socket that includes network_overrides,
  remapping the stored tap to the per-clone tap name (Firecracker v1.15+
  feature not yet exposed by SDK v1.0.0)
- Use firecracker.WithSnapshot + a Handlers.FcInit.Swap to replace the SDK's
  LoadSnapshotHandler with the above when Bridge != "none"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 09:07:35 +00:00
parent b46d510cb7
commit 04067f7e6b
3 changed files with 141 additions and 9 deletions

30
main.go
View File

@@ -18,11 +18,36 @@ package main
import (
"fmt"
"os"
"path/filepath"
"runtime"
log "github.com/sirupsen/logrus"
"github.com/kacerr/fc-orchestrator/orchestrator"
)
func main() {
// strip --dev flag before subcommand routing
dev := false
filtered := os.Args[:1]
for _, a := range os.Args[1:] {
if a == "--dev" {
dev = true
} else {
filtered = append(filtered, a)
}
}
os.Args = filtered
if dev {
log.SetReportCaller(true)
log.SetFormatter(&log.TextFormatter{
CallerPrettyfier: func(f *runtime.Frame) (string, string) {
return "", fmt.Sprintf("%s:%d", filepath.Base(f.File), f.Line)
},
})
}
// figure out if we are running as root
if os.Geteuid() == 0 {
fmt.Println("Running with root/sudo privileges!")
@@ -61,7 +86,10 @@ func main() {
}
func usage() {
fmt.Fprintf(os.Stderr, `Usage: %s <command> [args]
fmt.Fprintf(os.Stderr, `Usage: %s [--dev] <command> [args]
Flags:
--dev log format with source file:line (e.g. file="orchestrator.go:123")
Commands:
init Download kernel + create Alpine rootfs

View File

@@ -203,6 +203,9 @@ func (o *Orchestrator) Golden() error {
},
},
NetworkInterfaces: netIfaces,
LogPath: sockPath + ".log",
LogLevel: "Debug",
FifoLogWriter: o.log.Writer(),
}
ctx, cancel := context.WithCancel(context.Background())
@@ -218,7 +221,10 @@ func (o *Orchestrator) Golden() error {
WithSocketPath(sockPath).
Build(ctx)
m, err := firecracker.NewMachine(ctx, fcCfg, firecracker.WithProcessRunner(cmd))
m, err := firecracker.NewMachine(ctx, fcCfg,
firecracker.WithProcessRunner(cmd),
firecracker.WithLogger(o.log),
)
if err != nil {
return fmt.Errorf("new machine: %w", err)
}
@@ -308,6 +314,7 @@ func (o *Orchestrator) spawnOne(id int) error {
// --- COW rootfs ---
cloneRootfs := filepath.Join(cloneDir, "rootfs.ext4")
o.log.Infof("clone %d: running: cp --reflink=always %s %s", id, filepath.Join(goldenDir, "rootfs.ext4"), cloneRootfs)
if err := reflinkCopy(filepath.Join(goldenDir, "rootfs.ext4"), cloneRootfs); err != nil {
return fmt.Errorf("copy rootfs: %w", err)
}
@@ -326,6 +333,9 @@ func (o *Orchestrator) spawnOne(id int) error {
tapName := fmt.Sprintf("fctap%d", id)
var netIfaces firecracker.NetworkInterfaces
if o.cfg.Bridge != "none" {
o.log.Infof("clone %d: running: ip tuntap add dev %s mode tap", id, tapName)
o.log.Infof("clone %d: running: ip link set %s up", id, tapName)
o.log.Infof("clone %d: running: ip link set %s master %s", id, tapName, o.cfg.Bridge)
if err := o.createTap(tapName); err != nil {
return err
}
@@ -354,6 +364,8 @@ func (o *Orchestrator) spawnOne(id int) error {
WithSocketPath(sockPath).
Build(ctx)
o.log.Infof("clone %d: running: %s", id, strings.Join(cmd.Args, " "))
vcpus := o.cfg.VCPUs
mem := o.cfg.MemMiB
@@ -364,20 +376,41 @@ func (o *Orchestrator) spawnOne(id int) error {
MemSizeMib: &mem,
},
NetworkInterfaces: netIfaces,
// Snapshot config: tells the SDK to restore instead of fresh boot.
Snapshot: firecracker.SnapshotConfig{
MemFilePath: sharedMem,
SnapshotPath: cloneVmstate,
ResumeVM: true,
},
LogPath: sockPath + ".log",
LogLevel: "Debug",
FifoLogWriter: o.log.Writer(),
}
m, err := firecracker.NewMachine(ctx, fcCfg, firecracker.WithProcessRunner(cmd))
m, err := firecracker.NewMachine(ctx, fcCfg,
firecracker.WithProcessRunner(cmd),
firecracker.WithLogger(o.log),
// WithSnapshot replaces the default handler set with snapshot-specific
// handlers: skips validate.Cfg (no KernelImagePath needed) and uses
// LoadSnapshotHandler instead of CreateBootSourceHandler.
firecracker.WithSnapshot(sharedMem, cloneVmstate, func(sc *firecracker.SnapshotConfig) {
sc.ResumeVM = true
}),
)
if err != nil {
cancel()
return fmt.Errorf("new machine: %w", err)
}
// Firecracker v1.15+ supports network_overrides in PUT /snapshot/load to
// remap the tap backend stored in the snapshot. The SDK v1.0.0 doesn't
// expose this field, so we replace the SDK's LoadSnapshotHandler with a
// direct HTTP call that includes the per-clone tap name.
if o.cfg.Bridge != "none" {
m.Handlers.FcInit = m.Handlers.FcInit.Swap(firecracker.Handler{
Name: firecracker.LoadSnapshotHandlerName,
Fn: func(ctx context.Context, m *firecracker.Machine) error {
return loadSnapshotWithNetworkOverride(
ctx, sockPath, sharedMem, cloneVmstate, tapName,
)
},
})
}
start := time.Now()
if err := m.Start(ctx); err != nil {
cancel()

71
orchestrator/snapshot.go Normal file
View File

@@ -0,0 +1,71 @@
package orchestrator
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
)
// networkOverride is one element of the "network_overrides" array sent in the
// snapshot load request. It pairs an interface ID with the host device name
// that interface should be remapped to.
type networkOverride struct {
// IfaceID is serialized as "iface_id" and selects the interface to remap.
IfaceID string `json:"iface_id"`
// HostDevName is serialized as "host_dev_name"; the caller in this file
// fills it with the per-clone tap name.
HostDevName string `json:"host_dev_name"`
}
// snapshotLoadRequest is the JSON body sent with PUT /snapshot/load by
// loadSnapshotWithNetworkOverride.
type snapshotLoadRequest struct {
// MemFilePath is serialized as "mem_file_path".
MemFilePath string `json:"mem_file_path"`
// SnapshotPath is serialized as "snapshot_path".
SnapshotPath string `json:"snapshot_path"`
// ResumeVM is serialized as "resume_vm" and is left out of the JSON when
// false (omitempty).
ResumeVM bool `json:"resume_vm,omitempty"`
// NetworkOverrides is serialized as "network_overrides" and is left out of
// the JSON when the slice is empty (omitempty).
NetworkOverrides []networkOverride `json:"network_overrides,omitempty"`
}
// loadSnapshotWithNetworkOverride calls PUT /snapshot/load on the Firecracker
// API served over the Unix socket at sockPath, asking it to restore the VM
// from memPath / vmstatePath, resume it, and remap network interface "1" to
// the host device tapName via network_overrides.
//
// It exists to bypass the SDK's LoadSnapshotHandler, which doesn't expose
// network_overrides (added in Firecracker v1.15, SDK v1.0.0 omits it).
//
// ctx bounds the whole exchange, including the connect to the Unix socket.
// The function returns nil only when the server answers 204 No Content; any
// other status is returned as an error carrying the status code and the
// response body. Transport-level failures are wrapped with %w.
func loadSnapshotWithNetworkOverride(ctx context.Context, sockPath, memPath, vmstatePath, tapName string) error {
	// Upper bound on how much of an error response is copied into the
	// returned error, so a misbehaving peer cannot make us buffer without limit.
	const maxErrBody = 64 << 10

	payload := snapshotLoadRequest{
		MemFilePath:  memPath,
		SnapshotPath: vmstatePath,
		ResumeVM:     true,
		NetworkOverrides: []networkOverride{
			// NOTE(review): assumes the interface stored in the snapshot has
			// ID "1" — confirm against how the golden VM was configured.
			{IfaceID: "1", HostDevName: tapName},
		},
	}
	data, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("marshal snapshot load params: %w", err)
	}

	transport := &http.Transport{
		// The network/address derived from the URL are ignored: every
		// connection goes to the Unix socket. The dial honours ctx so that
		// cancellation or a deadline also aborts the connect.
		DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
			var d net.Dialer
			return d.DialContext(ctx, "unix", sockPath)
		},
	}
	// The transport is private to this call; without this its keep-alive
	// connection (and the goroutines serving it) would outlive the function.
	defer transport.CloseIdleConnections()
	httpClient := &http.Client{Transport: transport}

	// The host part of the URL is a placeholder required by net/http.
	req, err := http.NewRequestWithContext(ctx, http.MethodPut,
		"http://localhost/snapshot/load", bytes.NewReader(data))
	if err != nil {
		return fmt.Errorf("build snapshot load request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "application/json")

	resp, err := httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("snapshot load request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusNoContent {
		body, readErr := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		if readErr != nil {
			// Keep whatever part of the body arrived and surface the read failure too.
			return fmt.Errorf("snapshot load failed (%d): %s (reading response body: %w)",
				resp.StatusCode, body, readErr)
		}
		return fmt.Errorf("snapshot load failed (%d): %s", resp.StatusCode, body)
	}
	return nil
}