diff --git a/cmd/urunc/create.go b/cmd/urunc/create.go
index 66ff8516..39916d6d 100644
--- a/cmd/urunc/create.go
+++ b/cmd/urunc/create.go
@@ -29,6 +29,7 @@ import (
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v3"
 	m "github.com/urunc-dev/urunc/internal/metrics"
+	"github.com/urunc-dev/urunc/pkg/cgroup"
 	"github.com/urunc-dev/urunc/pkg/unikontainers"
 	"golang.org/x/sys/unix"
 )
@@ -259,6 +260,16 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) (
 		return err
 	}
 
+	// Setup cgroups
+	err = setupCgroups(cmd, unikontainer, containerPid)
+	if err != nil {
+		// Clean up on cgroup creation failure
+		if unikontainer.CgroupMgr != nil {
+			_ = unikontainer.CgroupMgr.Delete()
+		}
+		return fmt.Errorf("failed to setup cgroups: %w", err)
+	}
+
 	// execute CreateRuntime hooks
 	err = unikontainer.ExecuteHooks("CreateRuntime")
 	if err != nil {
@@ -278,6 +289,51 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) (
 	return err
 }
 
+// setupCgroups creates and configures cgroups for the container
+func setupCgroups(cmd *cli.Command, u *unikontainers.Unikontainer, pid int) error {
+	// Check if cgroups are disabled
+	if u.Spec.Linux == nil || u.Spec.Linux.CgroupsPath == "" {
+		logrus.Debug("Cgroups disabled or no cgroup path specified")
+		return nil
+	}
+
+	// Check if systemd cgroup driver is enabled
+	useSystemd := cmd.Bool("systemd-cgroup")
+
+	// Create cgroup manager config
+	cgroupCfg := cgroup.Config{
+		CgroupPath:        u.Spec.Linux.CgroupsPath,
+		ContainerID:       u.State.ID,
+		Resources:         u.Spec.Linux.Resources,
+		SandboxCgroupOnly: u.UruncCfg.Cgroup.SandboxCgroupOnly,
+		OverheadPath:      u.UruncCfg.Cgroup.OverheadPath,
+		UseSystemd:        useSystemd,
+	}
+
+	// Create cgroup manager
+	cgroupMgr, err := cgroup.NewManager(cgroupCfg)
+	if err != nil {
+		return fmt.Errorf("failed to create cgroup manager: %w", err)
+	}
+
+	// Create cgroups and add reexec process
+	if err := cgroupMgr.Create(context.Background(), u.Spec.Linux.Resources, pid, useSystemd); err != nil {
+		return fmt.Errorf("failed to create cgroups: %w", err)
+	}
+
+	// Store manager in unikontainer
+	u.CgroupMgr = cgroupMgr
+
+	logrus.WithFields(logrus.Fields{
+		"cgroup_path":         u.Spec.Linux.CgroupsPath,
+		"sandbox_cgroup_only": u.UruncCfg.Cgroup.SandboxCgroupOnly,
+		"use_systemd":         useSystemd,
+		"pid":                 pid,
+	}).Info("Cgroups created successfully")
+
+	return nil
+}
+
 func createReexecCmd(initSock *os.File, logPipe *os.File) *exec.Cmd {
 	selfPath := "/proc/self/exe"
 	reexecCommand := &exec.Cmd{
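Note: for orientation, the call sequence that create.go (above) and start.go (below) drive on the new cgroup.Manager can be sketched as a standalone program. This is an illustrative sketch only, not part of the patch: the cgroup path, container ID, and PID are hypothetical, and running it requires root on a cgroup v2 host with a real process to confine.

package main

import (
	"context"
	"log"

	"github.com/urunc-dev/urunc/pkg/cgroup"
)

func main() {
	// Hypothetical values; in urunc these come from the OCI spec and state.
	const reexecPid = 12345

	mgr, err := cgroup.NewManager(cgroup.Config{
		CgroupPath:        "/urunc/demo",
		ContainerID:       "demo",
		SandboxCgroupOnly: false, // split policy: sandbox + overhead cgroups
		OverheadPath:      "/urunc_overhead",
	})
	if err != nil {
		log.Fatal(err)
	}

	// create: make the cgroups and place the reexec process in the sandbox.
	if err := mgr.Create(context.Background(), nil, reexecPid, false); err != nil {
		log.Fatal(err)
	}

	// start: once the VMM has spawned its threads, pin vCPU threads to the
	// sandbox cgroup; I/O threads remain in overhead.
	if err := mgr.MoveVCPUThreads(reexecPid); err != nil {
		log.Print(err)
	}

	// delete: tear down both cgroups.
	if err := mgr.Delete(); err != nil {
		log.Print(err)
	}
}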
diff --git a/cmd/urunc/start.go b/cmd/urunc/start.go
index a020b547..d82a04e1 100644
--- a/cmd/urunc/start.go
+++ b/cmd/urunc/start.go
@@ -19,6 +19,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"time"
 
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v3"
@@ -111,5 +112,16 @@ func startUnikontainer(cmd *cli.Command) error {
 		return err
 	}
 
+	if unikontainer.CgroupMgr != nil && unikontainer.CgroupMgr.UsingSplitPolicy() {
+		vmmPid := unikontainer.State.Pid
+		time.Sleep(200 * time.Millisecond)
+
+		if err := unikontainer.CgroupMgr.MoveVCPUThreads(vmmPid); err != nil {
+			logrus.WithError(err).Warn("Failed to move vCPU threads to sandbox cgroup")
+		} else {
+			logrus.Info("Successfully moved vCPU threads to sandbox cgroup")
+		}
+	}
+
 	return unikontainer.ExecuteHooks("Poststart")
 }
diff --git a/go.mod b/go.mod
index 831d381d..9f4cc0e9 100644
--- a/go.mod
+++ b/go.mod
@@ -8,6 +8,7 @@ require (
 	github.com/BurntSushi/toml v1.5.0
 	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
 	github.com/cavaliergopher/cpio v1.0.1
+	github.com/containerd/cgroups/v3 v3.1.0
 	github.com/containerd/containerd v1.7.29
 	github.com/creack/pty v1.1.24
 	github.com/elastic/go-seccomp-bpf v1.6.0
@@ -32,7 +33,6 @@ require (
 	github.com/Microsoft/go-winio v0.6.2 // indirect
 	github.com/Microsoft/hcsshim v0.13.0 // indirect
 	github.com/cilium/ebpf v0.20.0 // indirect
-	github.com/containerd/cgroups/v3 v3.1.0 // indirect
 	github.com/containerd/console v1.0.5 // indirect
 	github.com/containerd/containerd/api v1.10.0 // indirect
 	github.com/containerd/continuity v0.4.5 // indirect
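Note: containerd/cgroups/v3 moves from an indirect to a direct dependency because pkg/cgroup now calls its cgroup2 API (NewManager, NewSystemd, AddProc, Delete, Stat) directly. For orientation, a minimal standalone use of that API is sketched below; the group name "/urunc-demo" is hypothetical, and the program needs root on a cgroup v2 host.

package main

import (
	"log"

	cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
)

func main() {
	// Create an (unlimited) cgroup under the v2 unified hierarchy.
	mgr, err := cgroupsv2.NewManager("/sys/fs/cgroup", "/urunc-demo", &cgroupsv2.Resources{})
	if err != nil {
		log.Fatal(err)
	}
	// pkg/cgroup would now call mgr.AddProc(pid) to place a process here;
	// this demo simply removes the empty group again.
	if err := mgr.Delete(); err != nil {
		log.Fatal(err)
	}
}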
systemd cgroup path %s: %w", m.cgroupPath, err) + } + + cgroupLog.WithFields(logrus.Fields{ + "slice": slice, + "group": group, + "pid": pid, + }).Debug("Creating systemd cgroup") + + sandboxMgr, err = cgroupsv2.NewSystemd(slice, group, pid, cgroupResources) + if err != nil { + return fmt.Errorf("failed to create systemd sandbox cgroup %s:%s: %w", slice, group, err) + } + } else { + // Use filesystem-based cgroup manager + sandboxMgr, err = cgroupsv2.NewManager( + "/sys/fs/cgroup", + m.cgroupPath, + cgroupResources, + ) + if err != nil { + return fmt.Errorf("failed to create sandbox cgroup at %s: %w", m.cgroupPath, err) + } + + // Add process to sandbox cgroup (NewSystemd already does this) + if err := sandboxMgr.AddProc(uint64(pid)); err != nil { + _ = sandboxMgr.Delete() + return fmt.Errorf("failed to add pid %d to cgroup: %w", pid, err) + } + } + + m.sandboxCgroup = sandboxMgr + + cgroupLog.WithFields(logrus.Fields{ + "path": m.cgroupPath, + "pid": pid, + }).Info("Created sandbox cgroup and added process") + + // If split policy, create overhead cgroup + // Note: We always use filesystem-based cgroup for overhead, even with systemd, + // because it's an internal urunc feature and doesn't need systemd integration + if m.splitPolicy { + overheadPath := filepath.Join(m.overheadPath, m.containerID) + + // Overhead gets minimal resources (no limits) + overheadMgr, err := cgroupsv2.NewManager( + "/sys/fs/cgroup", + overheadPath, + &cgroupsv2.Resources{}, + ) + if err != nil { + _ = m.sandboxCgroup.Delete() + return fmt.Errorf("failed to create overhead cgroup at %s: %w", overheadPath, err) + } + m.overheadCgroup = overheadMgr + + cgroupLog.WithField("path", overheadPath).Info("Created overhead cgroup") + } + + return nil +} + +// MoveVCPUThreads identifies vCPU threads and moves them FROM overhead TO sandbox cgroup. +// This assumes the VMM process is already in the overhead cgroup (moved before exec via MoveToOverhead). +// All threads initially inherit the overhead cgroup, and we selectively move only vCPU threads +// to the sandbox (workload) cgroup. I/O threads stay in overhead. 
+
+// MoveVCPUThreads identifies vCPU threads and moves them FROM overhead TO sandbox cgroup.
+// This assumes the VMM process is already in the overhead cgroup (moved before exec via MoveToOverhead).
+// All threads initially inherit the overhead cgroup, and we selectively move only vCPU threads
+// to the sandbox (workload) cgroup. I/O threads stay in overhead.
+func (m *Manager) MoveVCPUThreads(vmmPid int) error {
+	if !m.splitPolicy {
+		// In sandbox_cgroup_only mode, all threads stay in sandbox cgroup
+		return nil
+	}
+
+	cgroupLog.WithField("vmm_pid", vmmPid).Debug("Identifying and moving vCPU threads from overhead to sandbox cgroup")
+
+	// Retry thread detection with exponential backoff
+	// VMM may not spawn all vCPU threads immediately
+	var vcpuThreads, ioThreads []int
+	maxAttempts := 5
+	var lastThreadCount int
+
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		// Read all threads of the VMM process
+		threadIDs, err := getProcessThreads(vmmPid)
+		if err != nil {
+			cgroupLog.WithError(err).WithField("attempt", attempt+1).Warn("Failed to get threads")
+			time.Sleep(50 * time.Millisecond * time.Duration(1<<attempt))
+			continue
+		}
+
+		// Classify threads by name
+		vcpuThreads = vcpuThreads[:0]
+		ioThreads = ioThreads[:0]
+		for _, tid := range threadIDs {
+			name, err := getThreadName(vmmPid, tid)
+			if err != nil {
+				continue
+			}
+			if isVCPUThread(name) {
+				vcpuThreads = append(vcpuThreads, tid)
+			} else {
+				ioThreads = append(ioThreads, tid)
+			}
+		}
+
+		// Stop once vCPU threads were found or the thread count has stabilized
+		if len(vcpuThreads) > 0 || (attempt > 0 && len(threadIDs) == lastThreadCount) {
+			break
+		}
+
+		lastThreadCount = len(threadIDs)
+
+		// Wait before retry with exponential backoff
+		sleepDuration := 50 * time.Millisecond * time.Duration(1<<attempt)
+		time.Sleep(sleepDuration)
+	}
+
+	// Move each identified vCPU thread into the sandbox cgroup; the remaining
+	// (main and I/O) threads stay in the overhead cgroup.
+	for _, tid := range vcpuThreads {
+		if err := m.sandboxCgroup.AddThread(uint64(tid)); err != nil {
+			return fmt.Errorf("failed to move vCPU thread %d to sandbox cgroup: %w", tid, err)
+		}
+	}
+
+	cgroupLog.WithFields(logrus.Fields{
+		"vcpu_threads": len(vcpuThreads),
+		"io_threads":   len(ioThreads),
+	}).Info("Moved vCPU threads to sandbox cgroup")
+
+	return nil
+}
+
+// UsingSplitPolicy reports whether vCPU and I/O threads are split between
+// the sandbox and overhead cgroups.
+func (m *Manager) UsingSplitPolicy() bool {
+	return m.splitPolicy
+}
+
+// MoveToOverhead moves a process into the overhead cgroup. It is called on the
+// reexec process right before exec-ing the VMM, so every VMM thread starts out
+// in the overhead cgroup.
+func (m *Manager) MoveToOverhead(pid int) error {
+	if m.overheadCgroup == nil {
+		return fmt.Errorf("overhead cgroup not initialized")
+	}
+	return m.overheadCgroup.AddProc(uint64(pid))
+}
+
+// Delete removes the cgroups managed by this manager
+func (m *Manager) Delete() error {
+	var errs []error
+
+	if m.sandboxCgroup != nil {
+		if err := m.sandboxCgroup.Delete(); err != nil {
+			errs = append(errs, err)
+		}
+	}
+
+	if m.overheadCgroup != nil {
+		if err := m.overheadCgroup.Delete(); err != nil {
+			errs = append(errs, err)
+		}
+	}
+
+	if len(errs) > 0 {
+		return fmt.Errorf("cgroup deletion errors: %v", errs)
+	}
+
+	return nil
+}
+
+// GetStats returns cgroup statistics
+func (m *Manager) GetStats() (*stats.Metrics, error) {
+	if m.sandboxCgroup == nil {
+		return nil, fmt.Errorf("sandbox cgroup not initialized")
+	}
+
+	return m.sandboxCgroup.Stat()
+}
+
+// normalizeCgroupPath handles OCI cgroup path formats
+func normalizeCgroupPath(cgroupPath, containerID string) string {
+	if cgroupPath == "" {
+		return containerID
+	}
+
+	// If it starts with /, it's an absolute path
+	if strings.HasPrefix(cgroupPath, "/") {
+		return cgroupPath
+	}
+
+	// Otherwise, it's a relative path
+	return cgroupPath
+}
+
+// isSystemdPath checks if a cgroup path is in systemd format (slice:prefix:name)
+func isSystemdPath(path string) bool {
+	return strings.Contains(path, ":")
+}
+
+// parseSystemdPath parses a systemd cgroup path format
+// Input: "slice:prefix:name" (e.g., "system.slice:docker:containerID")
+// Output: slice ("system.slice"), group ("docker-containerID.scope")
+func parseSystemdPath(path string) (string, string, error) {
+	parts := strings.Split(path, ":")
+	if len(parts) < 2 {
+		return "", "", fmt.Errorf("invalid systemd path format: %s", path)
+	}
+
+	slice := parts[0]
+
+	// Construct group name from remaining parts
+	// For "system.slice:docker:containerID" -> "docker-containerID.scope"
+	group := strings.Join(parts[1:], "-")
+	if !strings.HasSuffix(group, ".scope") {
+		group = group + ".scope"
+	}
+
+	cgroupLog.WithFields(logrus.Fields{
+		"input": path,
+		"slice": slice,
+		"group": group,
+	}).Debug("Parsed systemd cgroup path")
+
+	return slice, group, nil
+}
+
+// getProcessThreads returns all thread IDs for a process
+func getProcessThreads(pid int) ([]int, error) {
+	taskDir := fmt.Sprintf("/proc/%d/task", pid)
+	entries, err := os.ReadDir(taskDir)
+	if err != nil {
+		return nil, err
+	}
+
+	threads := make([]int, 0, len(entries))
+	for _, entry := range entries {
+		if !entry.IsDir() {
+			continue
+		}
+
+		tid, err := strconv.Atoi(entry.Name())
+		if err != nil {
+			continue
+		}
+		threads = append(threads, tid)
+	}
+
+	return threads, nil
+}
+
+// getThreadName reads the thread name from /proc/<pid>/task/<tid>/comm
+func getThreadName(pid, tid int) (string, error) {
+	commPath := fmt.Sprintf("/proc/%d/task/%d/comm", pid, tid)
+	data, err := os.ReadFile(commPath)
+	if err != nil {
+		return "", err
+	}
+
+	return strings.TrimSpace(string(data)), nil
+}
+
+// isVCPUThread determines if a thread is a vCPU thread based on its name
+func isVCPUThread(name string) bool {
+	// QEMU vCPU threads
+	if strings.HasPrefix(name, "CPU ") || strings.Contains(name, "/KVM") {
+		return true
+	}
+	// Firecracker vCPU threads
+	if strings.HasPrefix(name, "fc_vcpu") {
+		return true
+	}
+	// Generic vcpu naming
+	if strings.HasPrefix(name, "vcpu") {
+		return true
+	}
+
+	return false
+}
+
+// specToCgroupResources converts OCI resources to cgroup v2 resources
+func specToCgroupResources(spec *specs.LinuxResources) (*cgroupsv2.Resources, error) {
+	if spec == nil {
+		return &cgroupsv2.Resources{}, nil
+	}
+
+	res := &cgroupsv2.Resources{}
+
+	// CPU resources
+	if spec.CPU != nil {
+		res.CPU = &cgroupsv2.CPU{}
+
+		if spec.CPU.Shares != nil {
+			weight := sharesToWeight(*spec.CPU.Shares)
+			res.CPU.Weight = &weight
+		}
+
+		if spec.CPU.Quota != nil && spec.CPU.Period != nil {
+			res.CPU.Max = cgroupsv2.NewCPUMax(spec.CPU.Quota, spec.CPU.Period)
+		}
+
+		if spec.CPU.Cpus != "" {
+			res.CPU.Cpus = spec.CPU.Cpus
+		}
+
+		if spec.CPU.Mems != "" {
+			res.CPU.Mems = spec.CPU.Mems
+		}
+	}
+
+	// Memory resources
+	if spec.Memory != nil {
+		res.Memory = &cgroupsv2.Memory{}
+
+		if spec.Memory.Limit != nil {
+			res.Memory.Max = spec.Memory.Limit
+		}
+
+		if spec.Memory.Swap != nil {
+			res.Memory.Swap = spec.Memory.Swap
+		}
+
+		if spec.Memory.Reservation != nil {
+			res.Memory.Low = spec.Memory.Reservation
+		}
+	}
+
+	// I/O resources
+	if spec.BlockIO != nil {
+		res.IO = &cgroupsv2.IO{}
+
+		if spec.BlockIO.Weight != nil {
+			res.IO.BFQ.Weight = uint16(*spec.BlockIO.Weight)
+		}
+	}
+
+	// PID resources
+	if spec.Pids != nil {
+		res.Pids = &cgroupsv2.Pids{}
+
+		if spec.Pids.Limit > 0 {
+			res.Pids.Max = spec.Pids.Limit
+		}
+	}
+
+	return res, nil
+}
+
+// sharesToWeight converts CPU shares (OCI) to CPU weight (cgroup v2)
+// OCI shares range: 2-262144, default 1024
+// cgroup v2 weight range: 1-10000, default 100
+func sharesToWeight(shares uint64) uint64 {
+	if shares == 0 {
+		return 100 // default weight
+	}
+
+	// Convert shares to weight
+	// Formula: weight = (shares * 100) / 1024
+	weight := (shares * 100) / 1024
+
+	if weight < 1 {
+		weight = 1
+	}
+	if weight > 10000 {
+		weight = 10000
+	}
+
+	return weight
+}
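Note: getProcessThreads and getThreadName are unexported, but the /proc interface they rely on is easy to exercise on its own. The sketch below mirrors their logic (it is not code from the patch) and prints the thread names of the current process; on a QEMU VMM the vCPU threads would appear with names like "CPU 0/KVM", which is exactly what isVCPUThread matches on.

package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	pid := os.Getpid()
	entries, err := os.ReadDir(fmt.Sprintf("/proc/%d/task", pid))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	for _, entry := range entries {
		// Each directory under /proc/<pid>/task is a thread ID; its comm
		// file holds the (possibly truncated) thread name.
		comm, err := os.ReadFile(fmt.Sprintf("/proc/%d/task/%s/comm", pid, entry.Name()))
		if err != nil {
			continue
		}
		fmt.Printf("tid=%s name=%q\n", entry.Name(), strings.TrimSpace(string(comm)))
	}
}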
diff --git a/pkg/cgroup/manager_test.go b/pkg/cgroup/manager_test.go
new file mode 100644
index 00000000..d5e81e09
--- /dev/null
+++ b/pkg/cgroup/manager_test.go
@@ -0,0 +1,327 @@
+// Copyright (c) 2023-2025, Nubificus LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cgroup
+
+import (
+	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestNewManager(t *testing.T) {
+	tests := []struct {
+		name    string
+		cfg     Config
+		wantErr bool
+	}{
+		{
+			name: "valid config - sandbox only",
+			cfg: Config{
+				CgroupPath:        "/test/cgroup",
+				ContainerID:       "test123",
+				SandboxCgroupOnly: true,
+				OverheadPath:      "/urunc_overhead",
+			},
+			wantErr: false,
+		},
+		{
+			name: "valid config - split policy",
+			cfg: Config{
+				CgroupPath:        "/test/cgroup",
+				ContainerID:       "test456",
+				SandboxCgroupOnly: false,
+				OverheadPath:      "/urunc_overhead",
+			},
+			wantErr: false,
+		},
+		{
+			name: "empty cgroup path",
+			cfg: Config{
+				CgroupPath:  "",
+				ContainerID: "test789",
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mgr, err := NewManager(tt.cfg)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("NewManager() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if !tt.wantErr && mgr == nil {
+				t.Error("NewManager() returned nil manager")
+			}
+		})
+	}
+}
+
+func TestSharesToWeight(t *testing.T) {
+	tests := []struct {
+		name   string
+		shares uint64
+		want   uint64
+	}{
+		{
+			name:   "default shares (1024)",
+			shares: 1024,
+			want:   100,
+		},
+		{
+			name:   "minimum shares (2)",
+			shares: 2,
+			want:   1, // (2 * 100) / 1024 = 0.195 -> clamped to 1
+		},
+		{
+			name:   "maximum shares (262144)",
+			shares: 262144,
+			want:   10000, // (262144 * 100) / 1024 = 25600 -> clamped to 10000
+		},
+		{
+			name:   "zero shares",
+			shares: 0,
+			want:   100, // default
+		},
+		{
+			name:   "half default (512)",
+			shares: 512,
+			want:   50,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := sharesToWeight(tt.shares)
+			if got != tt.want {
+				t.Errorf("sharesToWeight(%d) = %d, want %d", tt.shares, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestNormalizeCgroupPath(t *testing.T) {
+	tests := []struct {
+		name        string
+		cgroupPath  string
+		containerID string
+		want        string
+	}{
+		{
+			name:        "absolute path",
+			cgroupPath:  "/kubepods/pod123/container456",
+			containerID: "container456",
+			want:        "/kubepods/pod123/container456",
+		},
+		{
+			name:        "relative path",
+			cgroupPath:  "kubepods/pod123/container456",
+			containerID: "container456",
+			want:        "kubepods/pod123/container456",
+		},
+		{
+			name:        "empty path uses container ID",
+			cgroupPath:  "",
+			containerID: "container789",
+			want:        "container789",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := normalizeCgroupPath(tt.cgroupPath, tt.containerID)
+			if got != tt.want {
+				t.Errorf("normalizeCgroupPath(%q, %q) = %q, want %q",
+					tt.cgroupPath, tt.containerID, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestIsVCPUThread(t *testing.T) {
+	tests := []struct {
+		name       string
+		threadName string
+		want       bool
+	}{
+		{
+			name:       "QEMU vCPU thread with KVM",
+			threadName: "CPU 0/KVM",
+			want:       true,
+		},
+		{
+			name:       "QEMU vCPU thread simple",
+			threadName: "CPU 1/KVM",
+			want:       true,
+		},
+		{
+			name:       "generic vcpu thread",
+			threadName: "vcpu0",
+			want:       true,
+		},
+		{
+			name:       "Firecracker vCPU thread",
+			threadName: "fc_vcpu0",
+			want:       true,
+		},
+		{
+			name:       "Firecracker vCPU thread 2",
+			threadName: "fc_vcpu1",
+			want:       true,
+		},
+		{
+			name:       "I/O thread",
+			threadName: "IO 0",
+			want:       false,
+		},
+		{
+			name:       "main thread",
+			threadName: "qemu-system-x86",
+			want:       false,
+		},
+		{
+			name:       "worker thread",
+			threadName: "worker0",
+			want:       false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := isVCPUThread(tt.threadName)
+			if got != tt.want {
+				t.Errorf("isVCPUThread(%q) = %v, want %v", tt.threadName, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestSpecToCgroupResources(t *testing.T) {
+	// Test CPU shares conversion
+	shares := uint64(2048)
+	quota := int64(50000)
+	period := uint64(100000)
+
+	spec := &specs.LinuxResources{
+		CPU: &specs.LinuxCPU{
+			Shares: &shares,
+			Quota:  &quota,
+			Period: &period,
+			Cpus:   "0-1",
+			Mems:   "0",
+		},
+	}
+
+	res, err := specToCgroupResources(spec)
+	if err != nil {
+		t.Fatalf("specToCgroupResources() error = %v", err)
+	}
+
+	if res.CPU == nil {
+		t.Fatal("CPU resources not set")
+	}
+
+	if res.CPU.Weight == nil {
+		t.Fatal("CPU weight not set")
+	}
+
+	expectedWeight := sharesToWeight(shares)
+	if *res.CPU.Weight != expectedWeight {
+		t.Errorf("CPU weight = %d, want %d", *res.CPU.Weight, expectedWeight)
+	}
+
+	if res.CPU.Cpus != "0-1" {
+		t.Errorf("CPU cpus = %q, want %q", res.CPU.Cpus, "0-1")
+	}
+
+	if res.CPU.Mems != "0" {
+		t.Errorf("CPU mems = %q, want %q", res.CPU.Mems, "0")
+	}
+}
+
+func TestSpecToCgroupResources_Memory(t *testing.T) {
+	limit := int64(536870912)       // 512MB
+	swap := int64(1073741824)       // 1GB
+	reservation := int64(268435456) // 256MB
+
+	spec := &specs.LinuxResources{
+		Memory: &specs.LinuxMemory{
+			Limit:       &limit,
+			Swap:        &swap,
+			Reservation: &reservation,
+		},
+	}
+
+	res, err := specToCgroupResources(spec)
+	if err != nil {
+		t.Fatalf("specToCgroupResources() error = %v", err)
+	}
+
+	if res.Memory == nil {
+		t.Fatal("Memory resources not set")
+	}
+
+	if res.Memory.Max == nil || *res.Memory.Max != limit {
+		t.Errorf("Memory max = %v, want %d", res.Memory.Max, limit)
+	}
+
+	if res.Memory.Swap == nil || *res.Memory.Swap != swap {
+		t.Errorf("Memory swap = %v, want %d", res.Memory.Swap, swap)
+	}
+
+	if res.Memory.Low == nil || *res.Memory.Low != reservation {
+		t.Errorf("Memory low = %v, want %d", res.Memory.Low, reservation)
+	}
+}
+
+func TestSpecToCgroupResources_Pids(t *testing.T) {
+	pidsLimit := int64(1024)
+
+	spec := &specs.LinuxResources{
+		Pids: &specs.LinuxPids{
+			Limit: pidsLimit,
+		},
+	}
+
+	res, err := specToCgroupResources(spec)
+	if err != nil {
+		t.Fatalf("specToCgroupResources() error = %v", err)
+	}
+
+	if res.Pids == nil {
+		t.Fatal("Pids resources not set")
+	}
+
+	if res.Pids.Max != pidsLimit {
+		t.Errorf("Pids max = %d, want %d", res.Pids.Max, pidsLimit)
+	}
+}
+
+func TestSpecToCgroupResources_NilResources(t *testing.T) {
+	res, err := specToCgroupResources(nil)
+	if err != nil {
+		t.Fatalf("specToCgroupResources(nil) error = %v", err)
+	}
+
+	if res == nil {
+		t.Fatal("Expected non-nil result for nil input")
+	}
+
+	// All fields should be nil/empty
+	if res.CPU != nil || res.Memory != nil || res.Pids != nil || res.IO != nil {
+		t.Error("Expected all resource fields to be nil for nil input")
+	}
+}
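Note: all of the tests above exercise pure functions over in-memory data (NewManager does not touch cgroupfs), so they run unprivileged with:

go test ./pkg/cgroup/...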
diff --git a/pkg/unikontainers/unikontainers.go b/pkg/unikontainers/unikontainers.go
index 85dc1e2a..8e94b5e2 100644
--- a/pkg/unikontainers/unikontainers.go
+++ b/pkg/unikontainers/unikontainers.go
@@ -29,6 +29,7 @@ import (
 	"sync"
 	"syscall"
 
+	"github.com/urunc-dev/urunc/pkg/cgroup"
 	"github.com/urunc-dev/urunc/pkg/network"
 	"github.com/urunc-dev/urunc/pkg/unikontainers/hypervisors"
 	"github.com/urunc-dev/urunc/pkg/unikontainers/initrd"
@@ -55,13 +56,14 @@ var ErrNotExistingNS = errors.New("the namespace does not exist")
 
 // Unikontainer holds the data necessary to create, manage and delete unikernel containers
 type Unikontainer struct {
-	State    *specs.State
-	Spec     *specs.Spec
-	BaseDir  string
-	RootDir  string
-	UruncCfg *UruncConfig
-	Listener *net.UnixListener
-	Conn     *net.UnixConn
+	State     *specs.State
+	Spec      *specs.Spec
+	BaseDir   string
+	RootDir   string
+	UruncCfg  *UruncConfig
+	Listener  *net.UnixListener
+	Conn      *net.UnixConn
+	CgroupMgr *cgroup.Manager
 }
 
 // New parses the bundle and creates a new Unikontainer object
@@ -504,6 +506,14 @@ func (u *Unikontainer) Exec(metrics m.Writer) error {
 		return err
 	}
 
+	if u.CgroupMgr != nil && u.CgroupMgr.UsingSplitPolicy() {
+		myPid := os.Getpid()
+		if err := u.CgroupMgr.MoveToOverhead(myPid); err != nil {
+			return fmt.Errorf("failed to move reexec to overhead cgroup: %w", err)
+		}
+		uniklog.Info("Moved reexec to overhead cgroup")
+	}
+
 	return vmm.Execve(vmmArgs, unikernel)
 }
 
@@ -578,6 +588,14 @@ func (u *Unikontainer) Delete() error {
 		return fmt.Errorf("cannot delete running container: %s", u.State.ID)
 	}
 
+	// Delete cgroups
+	if u.CgroupMgr != nil {
+		if err := u.CgroupMgr.Delete(); err != nil {
+			uniklog.WithError(err).Error("Failed to delete cgroups")
+			// Don't fail delete - just log
+		}
+	}
+
 	// get a monitor instance of the running monitor
 	vmmType := u.State.Annotations[annotHypervisor]
 	vmm, err := hypervisors.NewVMM(hypervisors.VmmType(vmmType), u.UruncCfg.Monitors)
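Note: with the split policy active, the effect of MoveToOverhead (called in Exec, just before exec-ing the VMM) followed by MoveVCPUThreads (called at start) can be checked from the host by comparing the two cgroups' thread lists. The paths below assume the default overhead_path and a filesystem-driver cgroupsPath; substitute the real container ID and cgroup path:

cat /sys/fs/cgroup/urunc_overhead/<container-id>/cgroup.threads
cat /sys/fs/cgroup/<cgroups-path>/cgroup.threads

vCPU thread IDs should appear only in the second list, while the VMM main and I/O threads remain in the first.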
diff --git a/pkg/unikontainers/urunc_config.go b/pkg/unikontainers/urunc_config.go
index 8b8686de..6f8eb537 100644
--- a/pkg/unikontainers/urunc_config.go
+++ b/pkg/unikontainers/urunc_config.go
@@ -34,9 +34,15 @@ type UruncTimestamps struct {
 	Destination string `toml:"destination"` // Used to specify a file for timestamps
 }
 
+type UruncCgroup struct {
+	SandboxCgroupOnly bool   `toml:"sandbox_cgroup_only"`
+	OverheadPath      string `toml:"overhead_path"`
+}
+
 type UruncConfig struct {
 	Log        UruncLog        `toml:"log"`
 	Timestamps UruncTimestamps `toml:"timestamps"`
+	Cgroup     UruncCgroup     `toml:"cgroup"`
 	Monitors   map[string]types.MonitorConfig  `toml:"monitors"`
 	ExtraBins  map[string]types.ExtraBinConfig `toml:"extra_binaries"`
 }
@@ -78,6 +84,13 @@ func defaultTimestampsConfig() UruncTimestamps {
 	}
 }
 
+func defaultCgroupConfig() UruncCgroup {
+	return UruncCgroup{
+		SandboxCgroupOnly: true,
+		OverheadPath:      "/urunc_overhead",
+	}
+}
+
 func defaultMonitorsConfig() map[string]types.MonitorConfig {
 	return map[string]types.MonitorConfig{
 		"qemu": {DefaultMemoryMB: 256, DefaultVCPUs: 1},
@@ -97,6 +110,7 @@ func defaultUruncConfig() *UruncConfig {
 	return &UruncConfig{
 		Log:        defaultLogConfig(),
 		Timestamps: defaultTimestampsConfig(),
+		Cgroup:     defaultCgroupConfig(),
 		Monitors:   defaultMonitorsConfig(),
 		ExtraBins:  defaultExtraBinConfig(),
 	}
@@ -119,6 +133,10 @@ func (p *UruncConfig) Map() map[string]string {
 	// them to this map. this map will be used to save the rest of the urunc config to state.json
 	cfgMap := make(map[string]string)
 
+	// Cgroup config
+	cfgMap["urunc_config.cgroup.sandbox_cgroup_only"] = strconv.FormatBool(p.Cgroup.SandboxCgroupOnly)
+	cfgMap["urunc_config.cgroup.overhead_path"] = p.Cgroup.OverheadPath
+
 	for hv, hvCfg := range p.Monitors {
 		prefix := "urunc_config.monitors." + hv + "."
 		cfgMap[prefix+"default_memory_mb"] = strconv.FormatUint(uint64(hvCfg.DefaultMemoryMB), 10)
@@ -138,10 +156,21 @@ func UruncConfigFromMap(cfgMap map[string]string) *UruncConfig {
 	// since log and timestamps are loaded at the start of urunc, we will not be reading
 	// them from this map. this map will be used to parse the rest of the urunc config from state.json
 	cfg := &UruncConfig{
+		Cgroup:    defaultCgroupConfig(),
 		Monitors:  defaultMonitorsConfig(),
 		ExtraBins: defaultExtraBinConfig(),
 	}
 
+	// Parse cgroup config
+	if val, ok := cfgMap["urunc_config.cgroup.sandbox_cgroup_only"]; ok {
+		if boolVal, err := strconv.ParseBool(val); err == nil {
+			cfg.Cgroup.SandboxCgroupOnly = boolVal
+		}
+	}
+	if val, ok := cfgMap["urunc_config.cgroup.overhead_path"]; ok {
+		cfg.Cgroup.OverheadPath = val
+	}
+
 	for key, val := range cfgMap {
 		if !strings.HasPrefix(key, "urunc_config.monitors.") {
 			continue
diff --git a/pkg/unikontainers/urunc_config_test.go b/pkg/unikontainers/urunc_config_test.go
index 9c2be68a..528a7ea7 100644
--- a/pkg/unikontainers/urunc_config_test.go
+++ b/pkg/unikontainers/urunc_config_test.go
@@ -392,28 +392,36 @@ func TestUruncConfigMap(t *testing.T) {
 		assert.Equal(t, config.ExtraBins["custom"].Options, cfgMap["urunc_config.extra_binaries.custom.options"])
 	})
 
-	t.Run("empty monitors map produces empty result", func(t *testing.T) {
+	t.Run("empty monitors map produces only cgroup config", func(t *testing.T) {
 		t.Parallel()
 		config := &UruncConfig{
+			Cgroup:   defaultCgroupConfig(),
 			Monitors: map[string]types.MonitorConfig{},
 		}
 		cfgMap := config.Map()
 		assert.NotNil(t, cfgMap)
-		assert.Empty(t, cfgMap)
+		// Only cgroup fields should be present
+		assert.Len(t, cfgMap, 2)
+		assert.Contains(t, cfgMap, "urunc_config.cgroup.sandbox_cgroup_only")
+		assert.Contains(t, cfgMap, "urunc_config.cgroup.overhead_path")
 	})
 
-	t.Run("empty extra binaries map produces empty result", func(t *testing.T) {
+	t.Run("empty extra binaries map produces only cgroup config", func(t *testing.T) {
 		t.Parallel()
 		config := &UruncConfig{
+			Cgroup:    defaultCgroupConfig(),
 			ExtraBins: map[string]types.ExtraBinConfig{},
 		}
 		cfgMap := config.Map()
 		assert.NotNil(t, cfgMap)
-		assert.Empty(t, cfgMap)
+		// Only cgroup fields should be present
+		assert.Len(t, cfgMap, 2)
+		assert.Contains(t, cfgMap, "urunc_config.cgroup.sandbox_cgroup_only")
+		assert.Contains(t, cfgMap, "urunc_config.cgroup.overhead_path")
 	})
 }
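Note: the new UruncCgroup struct maps to a [cgroup] table in urunc's TOML configuration. With the defaults from defaultCgroupConfig it would read as follows (values shown are the defaults, so omitting the table behaves identically):

[cgroup]
sandbox_cgroup_only = true
overhead_path = "/urunc_overhead"

Setting sandbox_cgroup_only = false enables the split policy: a per-container overhead cgroup is created under overhead_path, the reexec process is moved there before exec-ing the VMM, and only vCPU threads are moved back into the sandbox cgroup at start.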