From f736d722e1c7f42e36e2088ab926cc212ac66c50 Mon Sep 17 00:00:00 2001 From: Sanchit2662 Date: Sun, 25 Jan 2026 05:51:50 +0530 Subject: [PATCH] fix: detect Execve failures after StartSuccess signal Signed-off-by: Sanchit2662 --- cmd/urunc/start.go | 9 +++-- pkg/unikontainers/ipc.go | 55 ++++++++++++++++++++++++++++++ pkg/unikontainers/unikontainers.go | 16 +++++---- 3 files changed, 70 insertions(+), 10 deletions(-) diff --git a/cmd/urunc/start.go b/cmd/urunc/start.go index a020b547..575f01df 100644 --- a/cmd/urunc/start.go +++ b/cmd/urunc/start.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" "os" + "time" "github.com/sirupsen/logrus" "github.com/urfave/cli/v3" @@ -26,6 +27,8 @@ import ( "github.com/urunc-dev/urunc/pkg/unikontainers" ) +const execveErrorCheckTimeout = 100 * time.Millisecond + var startCommand = &cli.Command{ Name: "start", Usage: "executes the user defined process in a created container", @@ -98,10 +101,10 @@ func startUnikontainer(cmd *cli.Command) error { } metrics.Capture(m.TS13) - // wait ContainerStarted message on start.sock from reexec process - err = unikontainer.AwaitMsg(unikontainers.StartSuccess) + // Wait for success, then briefly check for error (catches Execve failures) + err = unikontainer.AwaitMsgWithErrorCheck(unikontainers.StartSuccess, unikontainers.StartErr, execveErrorCheckTimeout) if err != nil { - err = fmt.Errorf("failed to get message from successful start from reexec: %w", err) + err = fmt.Errorf("failed to start container: %w", err) return err } diff --git a/pkg/unikontainers/ipc.go b/pkg/unikontainers/ipc.go index d7c19c45..f402329a 100644 --- a/pkg/unikontainers/ipc.go +++ b/pkg/unikontainers/ipc.go @@ -169,3 +169,58 @@ func AwaitMessage(listener *net.UnixListener, expectedMessage IPCMessage) error } return nil } + +// AwaitMessageWithErrorCheck waits for a success message, then briefly waits for +// a potential error message to catch Execve failures after StartSuccess is sent. +func AwaitMessageWithErrorCheck(listener *net.UnixListener, successMsg, errorMsg IPCMessage, errorCheckTimeout time.Duration) error { + conn, err := listener.AcceptUnix() + if err != nil { + return err + } + defer func() { + closeErr := conn.Close() + if closeErr != nil { + logrus.WithError(closeErr).Error("failed to close connection") + } + }() + + bufSize := len(successMsg) + if len(errorMsg) > bufSize { + bufSize = len(errorMsg) + } + + buf := make([]byte, bufSize) + n, err := conn.Read(buf) + if err != nil { + return fmt.Errorf("failed to read from socket: %w", err) + } + msg := string(buf[0:n]) + + if msg == string(errorMsg) { + return fmt.Errorf("container execution failed: received error signal from reexec") + } + + if msg != string(successMsg) { + return fmt.Errorf("received unexpected message: %s", msg) + } + + // Wait briefly for potential error after success (catches Execve failures) + if errorCheckTimeout > 0 { + if err = conn.SetReadDeadline(time.Now().Add(errorCheckTimeout)); err != nil { + logrus.WithError(err).Warn("failed to set read deadline for error check") + return nil + } + + errBuf := make([]byte, len(errorMsg)) + n, readErr := conn.Read(errBuf) + if readErr == nil && n > 0 { + errMsg := string(errBuf[0:n]) + if errMsg == string(errorMsg) { + return fmt.Errorf("VMM execution failed after signaling start") + } + logrus.WithField("message", errMsg).Warn("received unexpected message after success") + } + } + + return nil +} diff --git a/pkg/unikontainers/unikontainers.go b/pkg/unikontainers/unikontainers.go index 85dc1e2a..401c4cba 100644 --- a/pkg/unikontainers/unikontainers.go +++ b/pkg/unikontainers/unikontainers.go @@ -28,6 +28,7 @@ import ( "strings" "sync" "syscall" + "time" "github.com/urunc-dev/urunc/pkg/network" "github.com/urunc-dev/urunc/pkg/unikontainers/hypervisors" @@ -493,12 +494,8 @@ func (u *Unikontainer) Exec(metrics m.Writer) error { uniklog.Debug("calling vmm execve") metrics.Capture(m.TS18) - // metrics.Wait() - // TODO: We set the state to running and notify urunc Start that the monitor - // started, but we might encounter issues with the monitor execution. We need - // to revisit this and check if a failed monitor execution affects this approach. - // If it affects then we need to re-design the whole spawning of the monitor. - // Notify urunc start + // Notify urunc start. If Execve fails after this, urunc start will detect it + // via AwaitMsgWithErrorCheck which waits briefly for a potential StartErr. err = u.SendMessage(StartSuccess) if err != nil { return err @@ -1102,11 +1099,16 @@ func (u *Unikontainer) DestroyConn(isReexec bool) error { return nil } -// AwaitMessage waits for a specific message in the listener of unikontainer instance +// AwaitMsg waits for a specific message in the listener of unikontainer instance func (u *Unikontainer) AwaitMsg(msg IPCMessage) error { return AwaitMessage(u.Listener, msg) } +// AwaitMsgWithErrorCheck waits for success, then checks for a follow-up error message +func (u *Unikontainer) AwaitMsgWithErrorCheck(successMsg, errorMsg IPCMessage, errorTimeout time.Duration) error { + return AwaitMessageWithErrorCheck(u.Listener, successMsg, errorMsg, errorTimeout) +} + // SendMessage sends message over the active connection func (u *Unikontainer) SendMessage(message IPCMessage) error { conn := u.Conn