diff --git a/internal/linux/linux.go b/internal/linux/linux.go index 0fb8cc4c3f1..13713159328 100644 --- a/internal/linux/linux.go +++ b/internal/linux/linux.go @@ -2,6 +2,7 @@ package linux import ( "os" + "unsafe" "golang.org/x/sys/unix" ) @@ -65,6 +66,22 @@ func Recvfrom(fd int, p []byte, flags int) (n int, from unix.Sockaddr, err error return n, from, err } +// SchedSetaffinity wraps sched_setaffinity syscall without unix.CPUSet size limitation. +func SchedSetaffinity(pid int, buf []byte) error { + err := retryOnEINTR(func() error { + _, _, errno := unix.Syscall( + unix.SYS_SCHED_SETAFFINITY, + uintptr(pid), + uintptr(len(buf)), + uintptr((unsafe.Pointer)(&buf[0]))) + if errno != 0 { + return errno + } + return nil + }) + return os.NewSyscallError("sched_setaffinity", err) +} + // Sendmsg wraps [unix.Sendmsg]. func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error { err := retryOnEINTR(func() error { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 439c7c7341d..794bf4ec391 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "bytes" "context" "encoding/json" "errors" @@ -25,6 +26,7 @@ import ( "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fs2" + "github.com/opencontainers/runc/internal/linux" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/internal/userns" @@ -178,33 +180,22 @@ type setnsProcess struct { // tryResetCPUAffinity tries to reset the CPU affinity of the process // identified by pid to include all possible CPUs (notwithstanding cgroup -// cpuset restrictions and isolated CPUs). +// cpuset restrictions, isolated CPUs and CPU online status). func tryResetCPUAffinity(pid int) { - // When resetting the CPU affinity, we want to match the configured cgroup - // cpuset (or the default set of all CPUs, if no cpuset is configured) - // rather than some more restrictive affinity we were spawned in (such as - // one that may have been inherited from systemd). The cpuset cgroup used - // to reconfigure the cpumask automatically for joining processes, but - // kcommit da019032819a ("sched: Enforce user requested affinity") changed - // this behaviour in Linux 6.2. + // When resetting the CPU affinity, we want to allow all + // possible CPUs in the system, including those not in + // cpuset.cpus, online or even present (hot-plugged) at call + // time. Using a cpumask any tighter this that may disallow + // using those CPUs if they are added to cpuset.cpus later. // - // Parsing cpuset.cpus.effective is quite inefficient (and looking at - // things like /proc/stat would be wrong for most nested containers), but - // luckily sched_setaffinity(2) will implicitly: - // - // * Clamp the cpumask so that it matches the current number of CPUs on - // the system. - // * Mask out any CPUs that are not a member of the target task's - // configured cgroup cpuset. - // - // So we can just pass a very large array of set cpumask bits and the - // kernel will silently convert that to the correct value very cheaply. - var cpuset unix.CPUSet - cpuset.Fill() // set all bits - if err := unix.SchedSetaffinity(pid, &cpuset); err != nil { - logrus.WithError( - os.NewSyscallError("sched_setaffinity", err), - ).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid) + // Use similar huge buffer as go 1.25 runtime in getCPUCount() + // does for mask. This avoids reading and parsing + // /sys/devices/system/cpu/possible. + const maxCPUs = 64 * 1024 + buf := bytes.Repeat([]byte{0xff}, maxCPUs/8) + if err := linux.SchedSetaffinity(pid, buf); err != nil { + logrus.WithError(err).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid) + return } }