feat: add serial console access via PTY + Unix socket proxy
Each spawned clone now runs under a _console-proxy daemon that connects firecracker's ttyS0 (stdin/stdout) to a PTY and serves it on a Unix socket at clones/<id>/console.sock for the VM's lifetime. sudo ./fc-orch spawn 1 sudo ./fc-orch console 1 # Ctrl+] to detach spawnOne delegates VM startup to the proxy process (Setsid, detached) and waits for console.sock to appear before returning. Kill continues to work via PID files — proxy and firecracker PIDs are both recorded. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
258
orchestrator/console.go
Normal file
258
orchestrator/console.go
Normal file
@@ -0,0 +1,258 @@
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/creack/pty"
|
||||
firecracker "github.com/firecracker-microvm/firecracker-go-sdk"
|
||||
"github.com/firecracker-microvm/firecracker-go-sdk/client/models"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// RunConsoleProxy is the entrypoint for the "_console-proxy" internal subcommand.
|
||||
// It restores a Firecracker clone from the golden snapshot, connecting its serial
|
||||
// console (ttyS0) to a PTY, then serves the PTY master on a Unix socket at
|
||||
// {cloneDir}/console.sock for the lifetime of the VM.
|
||||
func RunConsoleProxy(cfg Config, id int, tapName string) error {
|
||||
logger := log.WithField("component", fmt.Sprintf("console-proxy[%d]", id))
|
||||
|
||||
cloneDir := filepath.Join(cfg.BaseDir, "clones", strconv.Itoa(id))
|
||||
goldenDir := filepath.Join(cfg.BaseDir, "golden")
|
||||
sockPath := filepath.Join(cloneDir, "api.sock")
|
||||
consoleSockPath := filepath.Join(cloneDir, "console.sock")
|
||||
sharedMem := filepath.Join(goldenDir, "mem")
|
||||
cloneVmstate := filepath.Join(cloneDir, "vmstate")
|
||||
|
||||
// --- Create PTY ---
|
||||
// ptm = master (we hold and proxy), pts = slave (firecracker's stdio)
|
||||
ptm, pts, err := pty.Open()
|
||||
if err != nil {
|
||||
return fmt.Errorf("open pty: %w", err)
|
||||
}
|
||||
pty.Setsize(ptm, &pty.Winsize{Rows: 24, Cols: 80}) //nolint:errcheck
|
||||
|
||||
fcBin, err := exec.LookPath(cfg.FCBin)
|
||||
if err != nil {
|
||||
pts.Close()
|
||||
ptm.Close()
|
||||
return fmt.Errorf("firecracker not found: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// --- Build firecracker command with slave PTY as stdin/stdout/stderr ---
|
||||
// Setsid makes the slave PTY the controlling terminal for firecracker,
|
||||
// which is required for job control (Ctrl+C, SIGWINCH, etc.) to work.
|
||||
cmd := firecracker.VMCommandBuilder{}.
|
||||
WithBin(fcBin).
|
||||
WithSocketPath(sockPath).
|
||||
Build(ctx)
|
||||
cmd.Stdin = pts
|
||||
cmd.Stdout = pts
|
||||
cmd.Stderr = pts
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
|
||||
|
||||
// --- Build Machine config (mirrors spawnOne) ---
|
||||
vcpus := cfg.VCPUs
|
||||
mem := cfg.MemMiB
|
||||
|
||||
fcCfg := firecracker.Config{
|
||||
SocketPath: sockPath,
|
||||
MachineCfg: models.MachineConfiguration{
|
||||
VcpuCount: &vcpus,
|
||||
MemSizeMib: &mem,
|
||||
},
|
||||
LogPath: sockPath + ".log",
|
||||
LogLevel: "Debug",
|
||||
FifoLogWriter: logger.Writer(),
|
||||
}
|
||||
|
||||
if cfg.Bridge != "none" && tapName != "" {
|
||||
mac := fmt.Sprintf("AA:FC:00:00:%02X:%02X", id/256, id%256)
|
||||
fcCfg.NetworkInterfaces = firecracker.NetworkInterfaces{
|
||||
{
|
||||
StaticConfiguration: &firecracker.StaticNetworkConfiguration{
|
||||
MacAddress: mac,
|
||||
HostDevName: tapName,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
m, err := firecracker.NewMachine(ctx, fcCfg,
|
||||
firecracker.WithProcessRunner(cmd),
|
||||
firecracker.WithLogger(logger),
|
||||
firecracker.WithSnapshot(sharedMem, cloneVmstate, func(sc *firecracker.SnapshotConfig) {
|
||||
sc.ResumeVM = true
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
pts.Close()
|
||||
ptm.Close()
|
||||
return fmt.Errorf("new machine: %w", err)
|
||||
}
|
||||
|
||||
// Swap in network-override snapshot handler (same as spawnOne)
|
||||
if cfg.Bridge != "none" && tapName != "" {
|
||||
m.Handlers.FcInit = m.Handlers.FcInit.Swap(firecracker.Handler{
|
||||
Name: firecracker.LoadSnapshotHandlerName,
|
||||
Fn: func(ctx context.Context, m *firecracker.Machine) error {
|
||||
return loadSnapshotWithNetworkOverride(
|
||||
ctx, sockPath, sharedMem, cloneVmstate, tapName,
|
||||
)
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// --- Start VM (blocks until snapshot is loaded and VM is running) ---
|
||||
start := time.Now()
|
||||
logger.Infof("restoring clone %d from snapshot ...", id)
|
||||
if err := m.Start(ctx); err != nil {
|
||||
pts.Close()
|
||||
ptm.Close()
|
||||
return fmt.Errorf("restore clone %d: %w", id, err)
|
||||
}
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// Release our copy of the slave — firecracker holds its own fd now.
|
||||
// Closing here ensures we get EOF on ptm when firecracker exits.
|
||||
pts.Close()
|
||||
|
||||
// --- Write PID file ---
|
||||
pidsDir := filepath.Join(cfg.BaseDir, "pids")
|
||||
os.MkdirAll(pidsDir, 0o755) //nolint:errcheck
|
||||
if cmd.Process != nil {
|
||||
os.WriteFile( //nolint:errcheck
|
||||
filepath.Join(pidsDir, fmt.Sprintf("clone-%d.pid", id)),
|
||||
[]byte(strconv.Itoa(cmd.Process.Pid)),
|
||||
0o644,
|
||||
)
|
||||
}
|
||||
logger.Infof("clone %d: restored in %s (pid=%d, tap=%s)",
|
||||
id, elapsed.Round(time.Millisecond), cmd.Process.Pid, tapName)
|
||||
|
||||
// --- Create console socket ---
|
||||
os.Remove(consoleSockPath) //nolint:errcheck
|
||||
listener, err := net.Listen("unix", consoleSockPath)
|
||||
if err != nil {
|
||||
ptm.Close()
|
||||
return fmt.Errorf("listen on console socket: %w", err)
|
||||
}
|
||||
|
||||
// --- Serve until VM exits ---
|
||||
vmDone := make(chan struct{})
|
||||
go func() {
|
||||
m.Wait(ctx) //nolint:errcheck
|
||||
close(vmDone)
|
||||
}()
|
||||
|
||||
serveConsole(listener, ptm, vmDone, logger)
|
||||
|
||||
listener.Close()
|
||||
ptm.Close()
|
||||
os.Remove(consoleSockPath) //nolint:errcheck
|
||||
|
||||
if tapName != "" {
|
||||
destroyTap(tapName)
|
||||
}
|
||||
|
||||
logger.Infof("clone %d: exiting", id)
|
||||
return nil
|
||||
}
|
||||
|
||||
// atomicWriter wraps an io.Writer behind a read/write lock so it can be
|
||||
// swapped between connections without holding the lock during writes.
|
||||
type atomicWriter struct {
|
||||
mu sync.RWMutex
|
||||
w io.Writer
|
||||
}
|
||||
|
||||
func (a *atomicWriter) set(w io.Writer) {
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
a.w = w
|
||||
}
|
||||
|
||||
func (a *atomicWriter) Write(p []byte) (int, error) {
|
||||
a.mu.RLock()
|
||||
defer a.mu.RUnlock()
|
||||
return a.w.Write(p)
|
||||
}
|
||||
|
||||
// serveConsole accepts connections on listener and proxies console I/O via ptm.
|
||||
// A background goroutine reads from the PTY master continuously (discarding
|
||||
// output when no client is connected so the VM never blocks on a full buffer).
|
||||
// Only one client is served at a time; sessions are serialised.
|
||||
func serveConsole(listener net.Listener, ptm *os.File, vmDone <-chan struct{}, logger *log.Entry) {
|
||||
aw := &atomicWriter{w: io.Discard}
|
||||
|
||||
// Background PTY reader — runs for the full VM lifetime.
|
||||
go func() {
|
||||
buf := make([]byte, 4096)
|
||||
for {
|
||||
n, err := ptm.Read(buf)
|
||||
if n > 0 {
|
||||
aw.Write(buf[:n]) //nolint:errcheck
|
||||
}
|
||||
if err != nil {
|
||||
return // PTY closed (VM exited)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Unblock Accept() when the VM exits.
|
||||
go func() {
|
||||
<-vmDone
|
||||
listener.Close()
|
||||
}()
|
||||
|
||||
for {
|
||||
conn, err := listener.Accept()
|
||||
if err != nil {
|
||||
return // listener closed
|
||||
}
|
||||
|
||||
logger.Info("console client connected")
|
||||
|
||||
// Route PTY output to this connection.
|
||||
aw.set(conn)
|
||||
|
||||
// Forward client input to the PTY master.
|
||||
clientGone := make(chan struct{})
|
||||
go func() {
|
||||
defer close(clientGone)
|
||||
buf := make([]byte, 256)
|
||||
for {
|
||||
n, err := conn.Read(buf)
|
||||
if n > 0 {
|
||||
ptm.Write(buf[:n]) //nolint:errcheck
|
||||
}
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Block until client disconnects or VM dies.
|
||||
select {
|
||||
case <-clientGone:
|
||||
logger.Info("console client disconnected")
|
||||
case <-vmDone:
|
||||
logger.Info("VM exited while console client was connected")
|
||||
}
|
||||
|
||||
aw.set(io.Discard)
|
||||
conn.Close()
|
||||
}
|
||||
}
|
||||
104
orchestrator/console_client.go
Normal file
104
orchestrator/console_client.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package orchestrator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/term"
|
||||
)
|
||||
|
||||
// ConnectConsole attaches the calling terminal to the serial console of the
|
||||
// clone with the given ID. The connection is made over the Unix socket at
|
||||
// {cloneDir}/console.sock served by the console-proxy process.
|
||||
//
|
||||
// Escape sequence: Ctrl+] (byte 0x1D) detaches without stopping the VM.
|
||||
func ConnectConsole(cfg Config, id int) error {
|
||||
sockPath := filepath.Join(cfg.BaseDir, "clones", strconv.Itoa(id), "console.sock")
|
||||
if _, err := os.Stat(sockPath); err != nil {
|
||||
return fmt.Errorf("clone %d has no console socket — is it running?", id)
|
||||
}
|
||||
|
||||
conn, err := net.Dial("unix", sockPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("connect to console: %w", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
if !term.IsTerminal(int(os.Stdin.Fd())) {
|
||||
return fmt.Errorf("console requires an interactive terminal")
|
||||
}
|
||||
|
||||
oldState, err := term.MakeRaw(int(os.Stdin.Fd()))
|
||||
if err != nil {
|
||||
return fmt.Errorf("set raw terminal: %w", err)
|
||||
}
|
||||
defer term.Restore(int(os.Stdin.Fd()), oldState) //nolint:errcheck
|
||||
|
||||
// Ensure the terminal is restored even on SIGTERM / SIGHUP.
|
||||
sigCh := make(chan os.Signal, 1)
|
||||
signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGHUP)
|
||||
go func() {
|
||||
<-sigCh
|
||||
term.Restore(int(os.Stdin.Fd()), oldState) //nolint:errcheck
|
||||
os.Exit(0)
|
||||
}()
|
||||
|
||||
fmt.Fprintf(os.Stderr, "Connected to clone %d console. Escape: Ctrl+]\r\n", id)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
// stdin → VM (with Ctrl+] escape detection)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
io.Copy(conn, &escapeReader{r: os.Stdin, cancel: cancel}) //nolint:errcheck
|
||||
// Half-close so the proxy knows we're done sending.
|
||||
if uc, ok := conn.(*net.UnixConn); ok {
|
||||
uc.CloseWrite() //nolint:errcheck
|
||||
}
|
||||
}()
|
||||
|
||||
// VM → stdout
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
io.Copy(os.Stdout, conn) //nolint:errcheck
|
||||
cancel() // VM exited or proxy closed the connection
|
||||
}()
|
||||
|
||||
<-ctx.Done()
|
||||
conn.Close() // unblock both goroutines
|
||||
wg.Wait()
|
||||
|
||||
fmt.Fprintf(os.Stderr, "\r\nDetached from clone %d.\r\n", id)
|
||||
return nil
|
||||
}
|
||||
|
||||
// escapeReader wraps an io.Reader and intercepts Ctrl+] (0x1D), calling cancel
|
||||
// and returning io.EOF when the escape byte is seen.
|
||||
type escapeReader struct {
|
||||
r io.Reader
|
||||
cancel context.CancelFunc
|
||||
}
|
||||
|
||||
func (e *escapeReader) Read(p []byte) (int, error) {
|
||||
n, err := e.r.Read(p)
|
||||
for i := 0; i < n; i++ {
|
||||
if p[i] == 0x1D { // Ctrl+]
|
||||
e.cancel()
|
||||
// Return only the bytes before the escape so nothing leaks to the VM.
|
||||
return i, io.EOF
|
||||
}
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
firecracker "github.com/firecracker-microvm/firecracker-go-sdk"
|
||||
@@ -319,10 +320,6 @@ func (o *Orchestrator) spawnOne(id int) error {
|
||||
return fmt.Errorf("copy rootfs: %w", err)
|
||||
}
|
||||
|
||||
// --- Memory: point at the shared golden mem file ---
|
||||
// Firecracker uses MAP_PRIVATE → kernel COW. No copy needed.
|
||||
sharedMem := filepath.Join(goldenDir, "mem")
|
||||
|
||||
// --- vmstate: small, cheap copy ---
|
||||
cloneVmstate := filepath.Join(cloneDir, "vmstate")
|
||||
if err := copyFile(filepath.Join(goldenDir, "vmstate"), cloneVmstate); err != nil {
|
||||
@@ -331,7 +328,6 @@ func (o *Orchestrator) spawnOne(id int) error {
|
||||
|
||||
// --- Networking ---
|
||||
tapName := fmt.Sprintf("fctap%d", id)
|
||||
var netIfaces firecracker.NetworkInterfaces
|
||||
if o.cfg.Bridge != "none" {
|
||||
o.log.Infof("clone %d: running: ip tuntap add dev %s mode tap", id, tapName)
|
||||
o.log.Infof("clone %d: running: ip link set %s up", id, tapName)
|
||||
@@ -339,106 +335,57 @@ func (o *Orchestrator) spawnOne(id int) error {
|
||||
if err := o.createTap(tapName); err != nil {
|
||||
return err
|
||||
}
|
||||
mac := fmt.Sprintf("AA:FC:00:00:%02X:%02X", id/256, id%256)
|
||||
netIfaces = firecracker.NetworkInterfaces{
|
||||
firecracker.NetworkInterface{
|
||||
StaticConfiguration: &firecracker.StaticNetworkConfiguration{
|
||||
MacAddress: mac,
|
||||
HostDevName: tapName,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// --- Restore from snapshot ---
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
fcBin, err := exec.LookPath(o.cfg.FCBin)
|
||||
// --- Launch console proxy (detached daemon) ---
|
||||
// The proxy owns the full VM lifecycle: it starts firecracker with a PTY,
|
||||
// loads the snapshot, and serves cloneDir/console.sock until the VM exits.
|
||||
selfExe, err := os.Executable()
|
||||
if err != nil {
|
||||
cancel()
|
||||
return fmt.Errorf("firecracker not found: %w", err)
|
||||
return fmt.Errorf("resolve self path: %w", err)
|
||||
}
|
||||
|
||||
cmd := firecracker.VMCommandBuilder{}.
|
||||
WithBin(fcBin).
|
||||
WithSocketPath(sockPath).
|
||||
Build(ctx)
|
||||
|
||||
o.log.Infof("clone %d: running: %s", id, strings.Join(cmd.Args, " "))
|
||||
|
||||
vcpus := o.cfg.VCPUs
|
||||
mem := o.cfg.MemMiB
|
||||
|
||||
fcCfg := firecracker.Config{
|
||||
SocketPath: sockPath,
|
||||
MachineCfg: models.MachineConfiguration{
|
||||
VcpuCount: &vcpus,
|
||||
MemSizeMib: &mem,
|
||||
},
|
||||
NetworkInterfaces: netIfaces,
|
||||
LogPath: sockPath + ".log",
|
||||
LogLevel: "Debug",
|
||||
FifoLogWriter: o.log.Writer(),
|
||||
}
|
||||
|
||||
m, err := firecracker.NewMachine(ctx, fcCfg,
|
||||
firecracker.WithProcessRunner(cmd),
|
||||
firecracker.WithLogger(o.log),
|
||||
// WithSnapshot replaces the default handler set with snapshot-specific
|
||||
// handlers: skips validate.Cfg (no KernelImagePath needed) and uses
|
||||
// LoadSnapshotHandler instead of CreateBootSourceHandler.
|
||||
firecracker.WithSnapshot(sharedMem, cloneVmstate, func(sc *firecracker.SnapshotConfig) {
|
||||
sc.ResumeVM = true
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
cancel()
|
||||
return fmt.Errorf("new machine: %w", err)
|
||||
}
|
||||
|
||||
// Firecracker v1.15+ supports network_overrides in PUT /snapshot/load to
|
||||
// remap the tap backend stored in the snapshot. The SDK v1.0.0 doesn't
|
||||
// expose this field, so we replace the SDK's LoadSnapshotHandler with a
|
||||
// direct HTTP call that includes the per-clone tap name.
|
||||
proxyArgs := []string{"_console-proxy", "--id", strconv.Itoa(id)}
|
||||
if o.cfg.Bridge != "none" {
|
||||
m.Handlers.FcInit = m.Handlers.FcInit.Swap(firecracker.Handler{
|
||||
Name: firecracker.LoadSnapshotHandlerName,
|
||||
Fn: func(ctx context.Context, m *firecracker.Machine) error {
|
||||
return loadSnapshotWithNetworkOverride(
|
||||
ctx, sockPath, sharedMem, cloneVmstate, tapName,
|
||||
)
|
||||
},
|
||||
})
|
||||
proxyArgs = append(proxyArgs, "--tap", tapName)
|
||||
}
|
||||
proxyCmd := exec.Command(selfExe, proxyArgs...)
|
||||
// New session: proxy is detached from our terminal and survives our exit.
|
||||
proxyCmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
|
||||
proxyCmd.Stdin = nil
|
||||
proxyCmd.Stdout = nil
|
||||
proxyCmd.Stderr = nil
|
||||
|
||||
if err := proxyCmd.Start(); err != nil {
|
||||
return fmt.Errorf("start console proxy: %w", err)
|
||||
}
|
||||
os.WriteFile(filepath.Join(o.pidsDir(), fmt.Sprintf("clone-%d.proxy.pid", id)),
|
||||
[]byte(strconv.Itoa(proxyCmd.Process.Pid)), 0o644) //nolint:errcheck
|
||||
|
||||
// Wait for the console socket to appear — it is created by the proxy once
|
||||
// the VM is running, so this also gates on successful snapshot restore.
|
||||
consoleSockPath := filepath.Join(cloneDir, "console.sock")
|
||||
if err := waitForSocket(consoleSockPath, 15*time.Second); err != nil {
|
||||
return fmt.Errorf("clone %d: %w", id, err)
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
if err := m.Start(ctx); err != nil {
|
||||
cancel()
|
||||
return fmt.Errorf("restore clone %d: %w", id, err)
|
||||
}
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// store PID
|
||||
if cmd.Process != nil {
|
||||
os.WriteFile(filepath.Join(o.pidsDir(), fmt.Sprintf("clone-%d.pid", id)),
|
||||
[]byte(strconv.Itoa(cmd.Process.Pid)), 0o644)
|
||||
}
|
||||
|
||||
o.mu.Lock()
|
||||
o.clones[id] = &cloneInfo{
|
||||
ID: id,
|
||||
Machine: m,
|
||||
Cancel: cancel,
|
||||
Tap: tapName,
|
||||
}
|
||||
o.mu.Unlock()
|
||||
|
||||
o.log.Infof("clone %d: restored in %s (pid=%d, tap=%s)",
|
||||
id, elapsed.Round(time.Millisecond), cmd.Process.Pid, tapName)
|
||||
|
||||
o.log.Infof("clone %d: ready (proxy pid=%d, tap=%s, console=%s)",
|
||||
id, proxyCmd.Process.Pid, tapName, consoleSockPath)
|
||||
return nil
|
||||
}
|
||||
|
||||
// waitForSocket polls path every 50 ms until it appears or timeout elapses.
|
||||
func waitForSocket(path string, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
for time.Now().Before(deadline) {
|
||||
if _, err := os.Stat(path); err == nil {
|
||||
return nil
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
return fmt.Errorf("timed out waiting for %s", path)
|
||||
}
|
||||
|
||||
// ——— Status ————————————————————————————————————————————————————————————
|
||||
|
||||
func (o *Orchestrator) Status() {
|
||||
|
||||
Reference in New Issue
Block a user