package orchestrator

import (
	"context"
	"fmt"
	"io"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"sync"
	"syscall"
	"time"

	"github.com/creack/pty"
	firecracker "github.com/firecracker-microvm/firecracker-go-sdk"
	"github.com/firecracker-microvm/firecracker-go-sdk/client/models"
	log "github.com/sirupsen/logrus"
)

// RunConsoleProxy is the entrypoint for the "_console-proxy" internal subcommand.
// It restores a Firecracker clone from the golden snapshot, connecting its serial
// console (ttyS0) to a PTY, then serves the PTY master on a Unix socket at
// {cloneDir}/console.sock for the lifetime of the VM.
func RunConsoleProxy(cfg Config, id int, tapName string) error {
	logger := log.WithField("component", fmt.Sprintf("console-proxy[%d]", id))

	cloneDir := filepath.Join(cfg.BaseDir, "clones", strconv.Itoa(id))
	goldenDir := filepath.Join(cfg.BaseDir, "golden")
	sockPath := filepath.Join(cloneDir, "api.sock")
	consoleSockPath := filepath.Join(cloneDir, "console.sock")
	sharedMem := filepath.Join(goldenDir, "mem")
	cloneVmstate := filepath.Join(cloneDir, "vmstate")

	// --- Create PTY ---
	// ptm = master (we hold and proxy), pts = slave (firecracker's stdio)
	ptm, pts, err := pty.Open()
	if err != nil {
		return fmt.Errorf("open pty: %w", err)
	}
	pty.Setsize(ptm, &pty.Winsize{Rows: 24, Cols: 80}) //nolint:errcheck

	fcBin, err := exec.LookPath(cfg.FCBin)
	if err != nil {
		pts.Close()
		ptm.Close()
		return fmt.Errorf("firecracker not found: %w", err)
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// --- Build firecracker command with slave PTY as stdin/stdout/stderr ---
	// Setsid makes the slave PTY the controlling terminal for firecracker,
	// which is required for job control (Ctrl+C, SIGWINCH, etc.) to work.
	cmd := firecracker.VMCommandBuilder{}.
		WithBin(fcBin).
		WithSocketPath(sockPath).
		Build(ctx)
	cmd.Stdin = pts
	cmd.Stdout = pts
	cmd.Stderr = pts
	cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}

	// --- Build Machine config (mirrors spawnOne) ---
	vcpus := cfg.VCPUs
	mem := cfg.MemMiB
	fcCfg := firecracker.Config{
		SocketPath: sockPath,
		MachineCfg: models.MachineConfiguration{
			VcpuCount:  &vcpus,
			MemSizeMib: &mem,
		},
		LogPath:       sockPath + ".log",
		LogLevel:      "Debug",
		FifoLogWriter: logger.Writer(),
	}
	if cfg.Bridge != "none" && tapName != "" {
		// Locally-administered MAC derived from the clone id so each
		// clone is unique on the bridge.
		mac := fmt.Sprintf("AA:FC:00:00:%02X:%02X", id/256, id%256)
		fcCfg.NetworkInterfaces = firecracker.NetworkInterfaces{
			{
				StaticConfiguration: &firecracker.StaticNetworkConfiguration{
					MacAddress:  mac,
					HostDevName: tapName,
				},
			},
		}
	}

	m, err := firecracker.NewMachine(ctx, fcCfg,
		firecracker.WithProcessRunner(cmd),
		firecracker.WithLogger(logger),
		firecracker.WithSnapshot(sharedMem, cloneVmstate,
			func(sc *firecracker.SnapshotConfig) { sc.ResumeVM = true }),
	)
	if err != nil {
		pts.Close()
		ptm.Close()
		return fmt.Errorf("new machine: %w", err)
	}

	// Swap in network-override snapshot handler (same as spawnOne)
	if cfg.Bridge != "none" && tapName != "" {
		m.Handlers.FcInit = m.Handlers.FcInit.Swap(firecracker.Handler{
			Name: firecracker.LoadSnapshotHandlerName,
			Fn: func(ctx context.Context, m *firecracker.Machine) error {
				return loadSnapshotWithNetworkOverride(
					ctx, sockPath, sharedMem, cloneVmstate, tapName,
				)
			},
		})
	}

	// --- Start VM (blocks until snapshot is loaded and VM is running) ---
	start := time.Now()
	logger.Infof("restoring clone %d from snapshot ...", id)
	if err := m.Start(ctx); err != nil {
		pts.Close()
		ptm.Close()
		return fmt.Errorf("restore clone %d: %w", id, err)
	}
	elapsed := time.Since(start)

	// Release our copy of the slave — firecracker holds its own fd now.
	// Closing here ensures we get EOF on ptm when firecracker exits.
	pts.Close()

	// --- Write PID file ---
	// cmd.Process may be nil if the SDK replaced or never exposed the
	// process handle; guard every dereference (including the log line
	// below) rather than panicking.
	pidsDir := filepath.Join(cfg.BaseDir, "pids")
	os.MkdirAll(pidsDir, 0o755) //nolint:errcheck
	pid := -1
	if cmd.Process != nil {
		pid = cmd.Process.Pid
		os.WriteFile( //nolint:errcheck
			filepath.Join(pidsDir, fmt.Sprintf("clone-%d.pid", id)),
			[]byte(strconv.Itoa(pid)),
			0o644,
		)
	}

	logger.Infof("clone %d: restored in %s (pid=%d, tap=%s)",
		id, elapsed.Round(time.Millisecond), pid, tapName)

	// --- Create console socket ---
	os.Remove(consoleSockPath) //nolint:errcheck
	listener, err := net.Listen("unix", consoleSockPath)
	if err != nil {
		ptm.Close()
		return fmt.Errorf("listen on console socket: %w", err)
	}

	// --- Serve until VM exits ---
	vmDone := make(chan struct{})
	go func() {
		m.Wait(ctx) //nolint:errcheck
		close(vmDone)
	}()

	serveConsole(listener, ptm, vmDone, logger)

	listener.Close()
	ptm.Close()
	os.Remove(consoleSockPath) //nolint:errcheck
	if tapName != "" {
		destroyTap(tapName)
	}
	logger.Infof("clone %d: exiting", id)
	return nil
}

// atomicWriter wraps an io.Writer behind a read/write lock so the target can
// be swapped between connections. Write holds the read lock for the duration
// of the write, so concurrent writes may proceed together while set() blocks
// until in-flight writes finish.
type atomicWriter struct {
	mu sync.RWMutex
	w  io.Writer
}

// set replaces the destination writer. It blocks until any in-flight
// Write calls complete.
func (a *atomicWriter) set(w io.Writer) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.w = w
}

// Write forwards p to the current destination writer.
func (a *atomicWriter) Write(p []byte) (int, error) {
	a.mu.RLock()
	defer a.mu.RUnlock()
	return a.w.Write(p)
}

// serveConsole accepts connections on listener and proxies console I/O via ptm.
// A background goroutine reads from the PTY master continuously (discarding
// output when no client is connected so the VM never blocks on a full buffer).
// Only one client is served at a time; sessions are serialised.
func serveConsole(listener net.Listener, ptm *os.File, vmDone <-chan struct{}, logger *log.Entry) {
	aw := &atomicWriter{w: io.Discard}

	// Background PTY reader — runs for the full VM lifetime.
	go func() {
		buf := make([]byte, 4096)
		for {
			n, err := ptm.Read(buf)
			if n > 0 {
				aw.Write(buf[:n]) //nolint:errcheck
			}
			if err != nil {
				return // PTY closed (VM exited)
			}
		}
	}()

	// Unblock Accept() when the VM exits.
	go func() {
		<-vmDone
		listener.Close()
	}()

	for {
		conn, err := listener.Accept()
		if err != nil {
			return // listener closed
		}
		logger.Info("console client connected")

		// Route PTY output to this connection.
		aw.set(conn)

		// Forward client input to the PTY master.
		clientGone := make(chan struct{})
		go func() {
			defer close(clientGone)
			buf := make([]byte, 256)
			for {
				n, err := conn.Read(buf)
				if n > 0 {
					ptm.Write(buf[:n]) //nolint:errcheck
				}
				if err != nil {
					return
				}
			}
		}()

		// Block until client disconnects or VM dies.
		select {
		case <-clientGone:
			logger.Info("console client disconnected")
		case <-vmDone:
			logger.Info("VM exited while console client was connected")
		}

		// Stop routing output to the dead connection before closing it;
		// closing conn also unblocks the input-forwarding goroutine.
		aw.set(io.Discard)
		conn.Close()
	}
}