diff --git a/docs/HACKING.md b/docs/HACKING.md index aa226445e7b1d..f2c21cb16ec6b 100644 --- a/docs/HACKING.md +++ b/docs/HACKING.md @@ -36,6 +36,10 @@ or via your distribution's package manager. Note that systemd regularly adopts newer mkosi features that are not in an official release yet so there's a good chance that your distribution's packaged version of mkosi will be too old. +Make sure to read the "Unprivileged User Namespaces" section in the mkosi documentation +(run `mkosi documentation` to view the mkosi docs) and apply any necessary instructions +to make sure unprivileged user namespaces work on your system. + Then, you can build, run and test systemd executables as follows: ```sh diff --git a/src/core/bpf-bind-iface.c b/src/core/bpf-bind-iface.c index ea439d307ba1d..6b11e83543f00 100644 --- a/src/core/bpf-bind-iface.c +++ b/src/core/bpf-bind-iface.c @@ -49,25 +49,18 @@ int bpf_bind_network_interface_supported(void) { return (supported = bpf_can_link_program(obj->progs.sd_bind_interface)); } -int bpf_bind_network_interface_install(Unit *u) { +static int bind_network_interface_install_impl(Unit *u, CGroupRuntime *crt) { _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; _cleanup_(bind_iface_bpf_freep) struct bind_iface_bpf *obj = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; _cleanup_free_ char *cgroup_path = NULL; _cleanup_close_ int cgroup_fd = -EBADF; - CGroupContext *cc; - CGroupRuntime *crt; int r, ifindex; assert(u); + assert(crt); - cc = unit_get_cgroup_context(u); - if (!cc) - return 0; - - crt = unit_get_cgroup_runtime(u); - if (!crt) - return 0; + CGroupContext *cc = ASSERT_PTR(unit_get_cgroup_context(u)); if (isempty(cc->bind_network_interface)) return 0; @@ -118,6 +111,21 @@ int bpf_bind_network_interface_install(Unit *u) { return 0; } +int bpf_bind_network_interface_install(Unit *u) { + CGroupRuntime *crt; + int r; + + assert(u); + + crt = unit_get_cgroup_runtime(u); + if (!crt) + return 0; + + r = bind_network_interface_install_impl(u, crt); + crt->initial_bind_network_interface_link_fd = safe_close(crt->initial_bind_network_interface_link_fd); + return r; +} + int bpf_bind_network_interface_serialize(Unit *u, FILE *f, FDSet *fds) { CGroupRuntime *crt; @@ -127,7 +135,7 @@ int bpf_bind_network_interface_serialize(Unit *u, FILE *f, FDSet *fds) { if (!crt) return 0; - return bpf_serialize_link(f, fds, "bind-interface-fd", crt->bpf_bind_network_interface_link); + return bpf_serialize_link(f, fds, "bind-iface-bpf-fd", crt->bpf_bind_network_interface_link); } #else /* ! BPF_FRAMEWORK */ diff --git a/src/core/bpf-restrict-ifaces.c b/src/core/bpf-restrict-ifaces.c index 29df72f2b06dc..a1bac8301be34 100644 --- a/src/core/bpf-restrict-ifaces.c +++ b/src/core/bpf-restrict-ifaces.c @@ -98,22 +98,17 @@ int bpf_restrict_ifaces_supported(void) { return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i)); } -static int restrict_ifaces_install_impl(Unit *u) { +static int restrict_ifaces_install_impl(Unit *u, CGroupRuntime *crt) { _cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL; _cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL; _cleanup_free_ char *cgroup_path = NULL; _cleanup_close_ int cgroup_fd = -EBADF; - CGroupContext *cc; - CGroupRuntime *crt; int r; - cc = unit_get_cgroup_context(u); - if (!cc) - return 0; + assert(u); + assert(crt); - crt = unit_get_cgroup_runtime(u); - if (!crt) - return 0; + CGroupContext *cc = ASSERT_PTR(unit_get_cgroup_context(u)); r = cg_get_path(crt->cgroup_path, /* suffix= */ NULL, &cgroup_path); if (r < 0) @@ -159,7 +154,7 @@ int bpf_restrict_ifaces_install(Unit *u) { if (!crt) return 0; - r = restrict_ifaces_install_impl(u); + r = restrict_ifaces_install_impl(u, crt); fdset_close(crt->initial_restrict_ifaces_link_fds, /* async= */ false); return r; } diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 33d0ab5adde39..8925772437945 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -571,7 +571,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { fprintf(f, "%sDelegateSubgroup: %s\n", prefix, c->delegate_subgroup); - if (!isempty(c->bind_network_interface)) + if (c->bind_network_interface) fprintf(f, "%sBindNetworkInterface: %s\n", prefix, c->bind_network_interface); @@ -1698,7 +1698,7 @@ static bool unit_get_needs_bind_network_interface(Unit *u) { if (!c) return false; - return !isempty(c->bind_network_interface); + return c->bind_network_interface; } static CGroupMask unit_get_cgroup_mask(Unit *u) { @@ -3050,9 +3050,7 @@ int unit_check_oom(Unit *u) { if (!crt || !crt->cgroup_path) return 0; - CGroupContext *ctx = unit_get_cgroup_context(u); - if (!ctx) - return 0; + CGroupContext *ctx = ASSERT_PTR(unit_get_cgroup_context(u)); /* If memory.oom.group=1, then look up the oom_group_kill field, which reports how many times the * kernel killed every process recursively in this cgroup and its descendants, similar to @@ -4201,6 +4199,8 @@ CGroupRuntime* cgroup_runtime_new(void) { .ipv4_deny_map_fd = -EBADF, .ipv6_deny_map_fd = -EBADF, + .initial_bind_network_interface_link_fd = -EBADF, + .cgroup_invalidated_mask = _CGROUP_MASK_ALL, .deserialized_cgroup_realized = -1, @@ -4235,6 +4235,7 @@ CGroupRuntime* cgroup_runtime_free(CGroupRuntime *crt) { #endif fdset_free(crt->initial_restrict_ifaces_link_fds); + safe_close(crt->initial_bind_network_interface_link_fd); bpf_firewall_close(crt); @@ -4461,34 +4462,24 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-invalidated-mask", key, value, cg_mask_from_string, cgroup_invalidated_mask)) return 1; - if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) { - int fd; - - fd = deserialize_fd(fds, value); - if (fd >= 0) - (void) bpf_socket_bind_add_initial_link_fd(u, fd); - - return 1; - } - if (STR_IN_SET(key, - "ip-bpf-ingress-installed", "ip-bpf-egress-installed", "bpf-device-control-installed", + "ip-bpf-ingress-installed", "ip-bpf-egress-installed", "ip-bpf-custom-ingress-installed", "ip-bpf-custom-egress-installed")) { CGroupRuntime *crt = unit_setup_cgroup_runtime(u); if (!crt) log_oom_debug(); else { + if (streq(key, "bpf-device-control-installed")) + (void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed); + if (streq(key, "ip-bpf-ingress-installed")) (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_ingress_installed); if (streq(key, "ip-bpf-egress-installed")) (void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_egress_installed); - if (streq(key, "bpf-device-control-installed")) - (void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed); - if (streq(key, "ip-bpf-custom-ingress-installed")) (void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_ingress_installed); @@ -4499,12 +4490,47 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value, return 1; } + /* We keep the previous bpf link fds stashed until we reattach anew, to close the window where + * the cgroup restrictions would otherwise be lifted. */ + + if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) { + _cleanup_close_ int fd = -EBADF; + + fd = deserialize_fd(fds, value); + if (fd >= 0) { + r = bpf_socket_bind_add_initial_link_fd(u, fd); + if (r >= 0) + TAKE_FD(fd); + } + + return 1; + } + if (streq(key, "restrict-ifaces-bpf-fd")) { - int fd; + _cleanup_close_ int fd = -EBADF; fd = deserialize_fd(fds, value); - if (fd >= 0) - (void) bpf_restrict_ifaces_add_initial_link_fd(u, fd); + if (fd >= 0) { + r = bpf_restrict_ifaces_add_initial_link_fd(u, fd); + if (r >= 0) + TAKE_FD(fd); + } + + return 1; + } + + if (streq(key, "bind-iface-bpf-fd")) { + _cleanup_close_ int fd = -EBADF; + + fd = deserialize_fd(fds, value); + if (fd >= 0) { + CGroupRuntime *crt = unit_setup_cgroup_runtime(u); + if (!crt) + log_oom_debug(); + else + close_and_replace(crt->initial_bind_network_interface_link_fd, fd); + } + return 1; } diff --git a/src/core/cgroup.h b/src/core/cgroup.h index de091605d4269..4f1a77392f315 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -325,6 +325,13 @@ typedef struct CGroupRuntime { struct bpf_link *restrict_ifaces_egress_bpf_link; #endif +#if BPF_FRAMEWORK + /* BPF link to BPF programs attached to cgroup/sock_create hooks and + * responsible for binding created sockets to a given VRF interface. */ + struct bpf_link *bpf_bind_network_interface_link; +#endif + int initial_bind_network_interface_link_fd; + bool cgroup_members_mask_valid:1; /* Reset cgroup accounting next time we fork something off */ @@ -334,12 +341,6 @@ typedef struct CGroupRuntime { bool warned_clamping_cpu_quota_period:1; int deserialized_cgroup_realized; /* tristate, for backwards compat */ - -#if BPF_FRAMEWORK - /* BPF link to BPF programs attached to cgroup/sock_create hooks and - * responsible for binding created sockets to a given VRF interface. */ - struct bpf_link *bpf_bind_network_interface_link; -#endif } CGroupRuntime; uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 29b59ea7057e2..92c59cd4ecf59 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -1893,6 +1893,7 @@ int bus_cgroup_set_property( return 1; } + if (streq(name, "RestrictNetworkInterfaces")) { int is_allow_list; _cleanup_strv_free_ char **l = NULL; @@ -1958,19 +1959,15 @@ int bus_cgroup_set_property( if (r < 0) return r; - if (!ifname_valid_full(s, IFNAME_VALID_ALTERNATIVE)) + if (!isempty(s) && !ifname_valid_full(s, IFNAME_VALID_ALTERNATIVE)) return sd_bus_error_setf(reterr_error, SD_BUS_ERROR_INVALID_ARGS, "Invalid interface name: %s", s); if (!UNIT_WRITE_FLAGS_NOOP(flags)) { - if (isempty(s)) - c->bind_network_interface = mfree(c->bind_network_interface); - else { - r = free_and_strdup_warn(&c->bind_network_interface, s); - if (r < 0) - return r; - } - - unit_write_settingf(u, flags, name, "BindNetworkInterface=%s", strempty(s)); + r = free_and_strdup_warn(&c->bind_network_interface, empty_to_null(s)); + if (r < 0) + return r; + if (r > 0) + unit_write_settingf(u, flags, name, "BindNetworkInterface=%s", s); } return 1; diff --git a/src/core/unit.c b/src/core/unit.c index 7161c1cb6c44d..4287179903e37 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -5141,12 +5141,11 @@ int unit_setup_exec_runtime(Unit *u) { return r; } -CGroupRuntime *unit_setup_cgroup_runtime(Unit *u) { - size_t offset; - +CGroupRuntime* unit_setup_cgroup_runtime(Unit *u) { assert(u); + assert(UNIT_HAS_CGROUP_CONTEXT(u)); - offset = UNIT_VTABLE(u)->cgroup_runtime_offset; + size_t offset = UNIT_VTABLE(u)->cgroup_runtime_offset; assert(offset > 0); CGroupRuntime **rt = (CGroupRuntime**) ((uint8_t*) u + offset); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index c0d665a404a12..0b9e6ba073b67 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -2370,6 +2370,7 @@ static const BusProperty cgroup_properties[] = { { "SocketBindDeny", bus_append_socket_filter }, { "MemoryPressureThresholdSec", bus_append_parse_sec_rename }, { "NFTSet", bus_append_nft_set }, + { "BindNetworkInterface", bus_append_string }, /* While infinity is disallowed in unit file, infinity is allowed in D-Bus API which * means use the default memory pressure duration from oomd.conf. */ @@ -2551,7 +2552,6 @@ static const BusProperty execute_properties[] = { { "StateDirectoryAccounting", bus_append_parse_boolean }, { "CacheDirectoryAccounting", bus_append_parse_boolean }, { "LogsDirectoryAccounting", bus_append_parse_boolean }, - { "BindNetworkInterface", bus_append_string }, { NULL, bus_try_append_resource_limit, dump_resource_limits }, {}