From 3307324bf450e46629522399e57b6ad07657908d Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Tue, 18 Nov 2025 14:13:20 +0200 Subject: [PATCH] libct: fix resetting CPU affinity unix.CPUSet is limited to 1024 CPUs. Calling unix.SchedSetaffinity(pid, cpuset) removes all CPUs starting from 1024 from the allowed CPUs of pid, even if cpuset is all ones. As a consequence, when runc tries to reset CPU affinity to "allow all" by default, it prevents all containers from using CPUs 1024 onwards. This change uses a huge CPU mask to play it safe and get all possible CPUs enabled with a single sched_setaffinity call. Fixes: #5023 Signed-off-by: Antti Kervinen --- internal/linux/linux.go | 17 +++++++++++++++ libcontainer/process_linux.go | 41 ++++++++++++++--------------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/internal/linux/linux.go b/internal/linux/linux.go index 0fb8cc4c3f1..13713159328 100644 --- a/internal/linux/linux.go +++ b/internal/linux/linux.go @@ -2,6 +2,7 @@ package linux import ( "os" + "unsafe" "golang.org/x/sys/unix" ) @@ -65,6 +66,22 @@ func Recvfrom(fd int, p []byte, flags int) (n int, from unix.Sockaddr, err error return n, from, err } +// SchedSetaffinity wraps the sched_setaffinity syscall without the unix.CPUSet size limitation. +func SchedSetaffinity(pid int, buf []byte) error { + err := retryOnEINTR(func() error { + _, _, errno := unix.Syscall( + unix.SYS_SCHED_SETAFFINITY, + uintptr(pid), + uintptr(len(buf)), + uintptr((unsafe.Pointer)(&buf[0]))) + if errno != 0 { + return errno + } + return nil + }) + return os.NewSyscallError("sched_setaffinity", err) +} + // Sendmsg wraps [unix.Sendmsg].
func Sendmsg(fd int, p, oob []byte, to unix.Sockaddr, flags int) error { err := retryOnEINTR(func() error { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 439c7c7341d..794bf4ec391 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "bytes" "context" "encoding/json" "errors" @@ -25,6 +26,7 @@ import ( "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fs2" + "github.com/opencontainers/runc/internal/linux" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/internal/userns" @@ -178,33 +180,22 @@ type setnsProcess struct { // tryResetCPUAffinity tries to reset the CPU affinity of the process // identified by pid to include all possible CPUs (notwithstanding cgroup -// cpuset restrictions and isolated CPUs). +// cpuset restrictions, isolated CPUs and CPU online status). func tryResetCPUAffinity(pid int) { - // When resetting the CPU affinity, we want to match the configured cgroup - // cpuset (or the default set of all CPUs, if no cpuset is configured) - // rather than some more restrictive affinity we were spawned in (such as - // one that may have been inherited from systemd). The cpuset cgroup used - // to reconfigure the cpumask automatically for joining processes, but - // kcommit da019032819a ("sched: Enforce user requested affinity") changed - // this behaviour in Linux 6.2. + // When resetting the CPU affinity, we want to allow all + // possible CPUs in the system, including those not in + // cpuset.cpus, online or even present (hot-plugged) at call + // time. Using a cpumask any tighter than that may disallow + // using those CPUs if they are added to cpuset.cpus later.
// - // Parsing cpuset.cpus.effective is quite inefficient (and looking at - // things like /proc/stat would be wrong for most nested containers), but - // luckily sched_setaffinity(2) will implicitly: - // - // * Clamp the cpumask so that it matches the current number of CPUs on - // the system. - // * Mask out any CPUs that are not a member of the target task's - // configured cgroup cpuset. - // - // So we can just pass a very large array of set cpumask bits and the - // kernel will silently convert that to the correct value very cheaply. - var cpuset unix.CPUSet - cpuset.Fill() // set all bits - if err := unix.SchedSetaffinity(pid, &cpuset); err != nil { - logrus.WithError( - os.NewSyscallError("sched_setaffinity", err), - ).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid) + // Use a similarly huge buffer as the Go 1.25 runtime uses for + // its mask in getCPUCount(). This avoids reading and parsing + // /sys/devices/system/cpu/possible. + const maxCPUs = 64 * 1024 + buf := bytes.Repeat([]byte{0xff}, maxCPUs/8) + if err := linux.SchedSetaffinity(pid, buf); err != nil { + logrus.WithError(err).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid) + return } }