From dbb9c935981cba8e37b57ec344606f2218d48bc8 Mon Sep 17 00:00:00 2001
From: sidneychang <2190206983@qq.com>
Date: Mon, 26 Jan 2026 18:36:53 -0500
Subject: [PATCH] fix(network): add cleanupOrphanTaps() to remove orphan TAPs
 by carrier state

- Call new function cleanupOrphanTaps() at the start of
  DynamicNetwork.NetworkSetup().
- Add cleanupOrphanTaps(): scan netns for interfaces matching
  ^tap.*_urunc$ and use kernel carrier/operational state as the sole
  criterion:
  - NO-CARRIER => delete orphan (remove TC/qdisc, then delete link)
  - LOWER_UP / operational up / FlagRunning / raw IFF_RUNNING => treat as
    in-use and abort before anything is deleted
- Do not scan /proc or check /dev/net/tun; do not attempt to reuse TAPs.
- Skip cleanup when no container interface (e.g. no eth0) is present.
- Remove PID/FD based checks and netns flock; document the single
  unikernel-per-netns assumption.
- Preserve networkSetup() create-only semantics and ensure TC/qdisc
  cleanup before link deletion.

This resolves an issue on Kubernetes where restarting urunc left orphan
TAP devices in the pod network namespace and prevented subsequent
network setup.

Signed-off-by: sidneychang <2190206983@qq.com>
---
 pkg/network/network.go         | 77 ++++++++++++++++++++++++++++++++++
 pkg/network/network_dynamic.go |  5 +++
 2 files changed, 82 insertions(+)

diff --git a/pkg/network/network.go b/pkg/network/network.go
index 6aa55e22..ceeb2b94 100644
--- a/pkg/network/network.go
+++ b/pkg/network/network.go
@@ -18,6 +18,8 @@ import (
 	"errors"
 	"fmt"
 	"net"
+	"regexp"
 	"strings"
+	"syscall"
 
 	"github.com/jackpal/gateway"
@@ -389,3 +391,78 @@ func deleteTapDevice(device netlink.Link) error {
 	}
 	return nil
 }
+
+// uruncTapRe matches the names of the TAP devices that urunc creates. It is
+// compiled once at package scope instead of on every network setup.
+var uruncTapRe = regexp.MustCompile(`^tap.*_urunc$`)
+
+// linkHasCarrier reports whether the kernel marks the link as running.
+//
+// RawFlags is checked in addition to Flags because the net.Flags value that
+// netlink derives from the kernel flags may not include FlagRunning, in which
+// case the Flags test alone would never fire.
+func linkHasCarrier(attrs *netlink.LinkAttrs) bool {
+	return attrs.RawFlags&syscall.IFF_RUNNING != 0 ||
+		attrs.Flags&net.FlagRunning != 0 ||
+		attrs.OperState == netlink.OperUp
+}
+
+// cleanupOrphanTaps deletes urunc TAP devices that were left behind in the
+// current network namespace, e.g. after urunc was restarted.
+//
+// Carrier state is the only criterion. A urunc TAP without carrier is treated
+// as an orphan: its TC filters and qdiscs are removed and then the link itself
+// is deleted. If any urunc TAP has carrier, a unikernel may still be attached
+// to it, so an error is returned before anything is deleted.
+//
+// The design assumes at most one unikernel per network namespace; no
+// inter-process lock is taken. When the container interface cannot be looked
+// up the cleanup is skipped and nil is returned.
+func cleanupOrphanTaps() error {
+	netlog.Debug("running cleanupOrphanTaps (carrier-state based)")
+
+	// Without the container interface (e.g. no eth0) the netns is either not
+	// ready yet or belongs to another runtime (ctr), so leave its taps alone.
+	if _, err := netlink.LinkByName(DefaultInterface); err != nil {
+		netlog.Debugf("no container interface found in namespace (%v); skipping orphan TAP cleanup", err)
+		return nil
+	}
+
+	links, err := netlink.LinkList()
+	if err != nil {
+		return fmt.Errorf("failed to list links: %w", err)
+	}
+
+	// First pass: classify without touching anything, so that finding a live
+	// tap late in the list cannot leave the namespace half cleaned.
+	var orphans []netlink.Link
+	for _, link := range links {
+		attrs := link.Attrs()
+		if attrs == nil || !uruncTapRe.MatchString(attrs.Name) {
+			continue
+		}
+		if linkHasCarrier(attrs) {
+			return fmt.Errorf("found tap %s with carrier/oper state UP: aborting cleanup (unikernel may be running)", attrs.Name)
+		}
+		orphans = append(orphans, link)
+	}
+
+	// Second pass: every remaining candidate reports no carrier. TC filters
+	// and qdiscs go first, then the link.
+	for _, link := range orphans {
+		name := link.Attrs().Name
+		netlog.Debugf("deleting orphan tap %s (no carrier)", name)
+		if err := deleteAllTCFilters(link); err != nil {
+			return fmt.Errorf("failed to delete tc filters for %s: %w", name, err)
+		}
+		if err := deleteAllQDiscs(link); err != nil {
+			return fmt.Errorf("failed to delete qdiscs for %s: %w", name, err)
+		}
+		if err := deleteTapDevice(link); err != nil {
+			return fmt.Errorf("failed to delete tap %s: %w", name, err)
+		}
+		netlog.Debugf("deleted orphan tap %s", name)
+	}
+
+	return nil
+}
diff --git a/pkg/network/network_dynamic.go b/pkg/network/network_dynamic.go
index f4264bbe..58990549 100644
--- a/pkg/network/network_dynamic.go
+++ b/pkg/network/network_dynamic.go
@@ -35,6 +35,11 @@ type DynamicNetwork struct {
 // for multiple unikernels in the same pod/network namespace.
 // See: https://github.com/urunc-dev/urunc/issues/13
 func (n DynamicNetwork) NetworkSetup(uid uint32, gid uint32) (*UnikernelNetworkInfo, error) {
+	// Attempt to clean up orphan TAPs created by urunc in this netns
+	if err := cleanupOrphanTaps(); err != nil {
+		return nil, fmt.Errorf("cleanupOrphanTaps failed: %w", err)
+	}
+
 	tapIndex, err := getTapIndex()
 	if err != nil {
 		return nil, fmt.Errorf("getTapIndex failed: %w", err)