Skip to content
Merged
4 changes: 4 additions & 0 deletions docs/HACKING.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ or via your distribution's package manager. Note that systemd regularly adopts
newer mkosi features that are not in an official release yet so there's a good
chance that your distribution's packaged version of mkosi will be too old.

Make sure to read the "Unprivileged User Namespaces" section of the mkosi documentation
(run `mkosi documentation` to view it) and follow any instructions there that are needed
to get unprivileged user namespaces working on your system.

Then, you can build, run and test systemd executables as follows:

```sh
Expand Down
30 changes: 19 additions & 11 deletions src/core/bpf-bind-iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,25 +49,18 @@ int bpf_bind_network_interface_supported(void) {
return (supported = bpf_can_link_program(obj->progs.sd_bind_interface));
}

int bpf_bind_network_interface_install(Unit *u) {
static int bind_network_interface_install_impl(Unit *u, CGroupRuntime *crt) {
_cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
_cleanup_(bind_iface_bpf_freep) struct bind_iface_bpf *obj = NULL;
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
_cleanup_free_ char *cgroup_path = NULL;
_cleanup_close_ int cgroup_fd = -EBADF;
CGroupContext *cc;
CGroupRuntime *crt;
int r, ifindex;

assert(u);
assert(crt);

cc = unit_get_cgroup_context(u);
if (!cc)
return 0;

crt = unit_get_cgroup_runtime(u);
if (!crt)
return 0;
CGroupContext *cc = ASSERT_PTR(unit_get_cgroup_context(u));

if (isempty(cc->bind_network_interface))
return 0;
Expand Down Expand Up @@ -118,6 +111,21 @@ int bpf_bind_network_interface_install(Unit *u) {
return 0;
}

/* Installs the bind-network-interface BPF program for the unit's cgroup.
 *
 * Returns 0 without doing anything if the unit has no cgroup runtime. Otherwise returns whatever
 * bind_network_interface_install_impl() returns. */
int bpf_bind_network_interface_install(Unit *u) {
        assert(u);

        CGroupRuntime *runtime = unit_get_cgroup_runtime(u);
        if (!runtime)
                return 0;

        int ret = bind_network_interface_install_impl(u, runtime);

        /* Whether or not the install attempt succeeded, the stashed initial link fd is closed here
         * and the field is reset to the value safe_close() hands back. */
        runtime->initial_bind_network_interface_link_fd =
                        safe_close(runtime->initial_bind_network_interface_link_fd);

        return ret;
}

int bpf_bind_network_interface_serialize(Unit *u, FILE *f, FDSet *fds) {
CGroupRuntime *crt;

Expand All @@ -127,7 +135,7 @@ int bpf_bind_network_interface_serialize(Unit *u, FILE *f, FDSet *fds) {
if (!crt)
return 0;

return bpf_serialize_link(f, fds, "bind-interface-fd", crt->bpf_bind_network_interface_link);
return bpf_serialize_link(f, fds, "bind-iface-bpf-fd", crt->bpf_bind_network_interface_link);
}

#else /* ! BPF_FRAMEWORK */
Expand Down
15 changes: 5 additions & 10 deletions src/core/bpf-restrict-ifaces.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,22 +98,17 @@ int bpf_restrict_ifaces_supported(void) {
return (supported = bpf_can_link_program(obj->progs.sd_restrictif_i));
}

static int restrict_ifaces_install_impl(Unit *u) {
static int restrict_ifaces_install_impl(Unit *u, CGroupRuntime *crt) {
_cleanup_(bpf_link_freep) struct bpf_link *egress_link = NULL, *ingress_link = NULL;
_cleanup_(restrict_ifaces_bpf_freep) struct restrict_ifaces_bpf *obj = NULL;
_cleanup_free_ char *cgroup_path = NULL;
_cleanup_close_ int cgroup_fd = -EBADF;
CGroupContext *cc;
CGroupRuntime *crt;
int r;

cc = unit_get_cgroup_context(u);
if (!cc)
return 0;
assert(u);
assert(crt);

crt = unit_get_cgroup_runtime(u);
if (!crt)
return 0;
CGroupContext *cc = ASSERT_PTR(unit_get_cgroup_context(u));

r = cg_get_path(crt->cgroup_path, /* suffix= */ NULL, &cgroup_path);
if (r < 0)
Expand Down Expand Up @@ -159,7 +154,7 @@ int bpf_restrict_ifaces_install(Unit *u) {
if (!crt)
return 0;

r = restrict_ifaces_install_impl(u);
r = restrict_ifaces_install_impl(u, crt);
fdset_close(crt->initial_restrict_ifaces_link_fds, /* async= */ false);
return r;
}
Expand Down
70 changes: 48 additions & 22 deletions src/core/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
fprintf(f, "%sDelegateSubgroup: %s\n",
prefix, c->delegate_subgroup);

if (!isempty(c->bind_network_interface))
if (c->bind_network_interface)
fprintf(f, "%sBindNetworkInterface: %s\n",
prefix, c->bind_network_interface);

Expand Down Expand Up @@ -1698,7 +1698,7 @@ static bool unit_get_needs_bind_network_interface(Unit *u) {
if (!c)
return false;

return !isempty(c->bind_network_interface);
return c->bind_network_interface;
}

static CGroupMask unit_get_cgroup_mask(Unit *u) {
Expand Down Expand Up @@ -3050,9 +3050,7 @@ int unit_check_oom(Unit *u) {
if (!crt || !crt->cgroup_path)
return 0;

CGroupContext *ctx = unit_get_cgroup_context(u);
if (!ctx)
return 0;
CGroupContext *ctx = ASSERT_PTR(unit_get_cgroup_context(u));

/* If memory.oom.group=1, then look up the oom_group_kill field, which reports how many times the
* kernel killed every process recursively in this cgroup and its descendants, similar to
Expand Down Expand Up @@ -4201,6 +4199,8 @@ CGroupRuntime* cgroup_runtime_new(void) {
.ipv4_deny_map_fd = -EBADF,
.ipv6_deny_map_fd = -EBADF,

.initial_bind_network_interface_link_fd = -EBADF,

.cgroup_invalidated_mask = _CGROUP_MASK_ALL,

.deserialized_cgroup_realized = -1,
Expand Down Expand Up @@ -4235,6 +4235,7 @@ CGroupRuntime* cgroup_runtime_free(CGroupRuntime *crt) {
#endif

fdset_free(crt->initial_restrict_ifaces_link_fds);
safe_close(crt->initial_bind_network_interface_link_fd);

bpf_firewall_close(crt);

Expand Down Expand Up @@ -4461,34 +4462,24 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value,
if (MATCH_DESERIALIZE_IMMEDIATE(u, "cgroup-invalidated-mask", key, value, cg_mask_from_string, cgroup_invalidated_mask))
return 1;

if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
int fd;

fd = deserialize_fd(fds, value);
if (fd >= 0)
(void) bpf_socket_bind_add_initial_link_fd(u, fd);

return 1;
}

if (STR_IN_SET(key,
"ip-bpf-ingress-installed", "ip-bpf-egress-installed",
"bpf-device-control-installed",
"ip-bpf-ingress-installed", "ip-bpf-egress-installed",
"ip-bpf-custom-ingress-installed", "ip-bpf-custom-egress-installed")) {

CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
if (!crt)
log_oom_debug();
else {
if (streq(key, "bpf-device-control-installed"))
(void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed);

if (streq(key, "ip-bpf-ingress-installed"))
(void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_ingress_installed);

if (streq(key, "ip-bpf-egress-installed"))
(void) bpf_program_deserialize_attachment(value, fds, &crt->ip_bpf_egress_installed);

if (streq(key, "bpf-device-control-installed"))
(void) bpf_program_deserialize_attachment(value, fds, &crt->bpf_device_control_installed);

if (streq(key, "ip-bpf-custom-ingress-installed"))
(void) bpf_program_deserialize_attachment_set(value, fds, &crt->ip_bpf_custom_ingress_installed);

Expand All @@ -4499,12 +4490,47 @@ int cgroup_runtime_deserialize_one(Unit *u, const char *key, const char *value,
return 1;
}

/* We keep the previous bpf link fds stashed until we reattach anew, to close the window where
* the cgroup restrictions would otherwise be lifted. */

if (STR_IN_SET(key, "ipv4-socket-bind-bpf-link-fd", "ipv6-socket-bind-bpf-link-fd")) {
_cleanup_close_ int fd = -EBADF;

fd = deserialize_fd(fds, value);
if (fd >= 0) {
r = bpf_socket_bind_add_initial_link_fd(u, fd);
if (r >= 0)
TAKE_FD(fd);
}

return 1;
}

if (streq(key, "restrict-ifaces-bpf-fd")) {
int fd;
_cleanup_close_ int fd = -EBADF;

fd = deserialize_fd(fds, value);
if (fd >= 0)
(void) bpf_restrict_ifaces_add_initial_link_fd(u, fd);
if (fd >= 0) {
r = bpf_restrict_ifaces_add_initial_link_fd(u, fd);
if (r >= 0)
TAKE_FD(fd);
}

return 1;
}

if (streq(key, "bind-iface-bpf-fd")) {
_cleanup_close_ int fd = -EBADF;

fd = deserialize_fd(fds, value);
if (fd >= 0) {
CGroupRuntime *crt = unit_setup_cgroup_runtime(u);
if (!crt)
log_oom_debug();
else
close_and_replace(crt->initial_bind_network_interface_link_fd, fd);
}

return 1;
}

Expand Down
13 changes: 7 additions & 6 deletions src/core/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,13 @@ typedef struct CGroupRuntime {
struct bpf_link *restrict_ifaces_egress_bpf_link;
#endif

#if BPF_FRAMEWORK
/* BPF link to BPF programs attached to cgroup/sock_create hooks and
* responsible for binding created sockets to a given VRF interface. */
struct bpf_link *bpf_bind_network_interface_link;
#endif
int initial_bind_network_interface_link_fd;

bool cgroup_members_mask_valid:1;

/* Reset cgroup accounting next time we fork something off */
Expand All @@ -334,12 +341,6 @@ typedef struct CGroupRuntime {
bool warned_clamping_cpu_quota_period:1;

int deserialized_cgroup_realized; /* tristate, for backwards compat */

#if BPF_FRAMEWORK
/* BPF link to BPF programs attached to cgroup/sock_create hooks and
* responsible for binding created sockets to a given VRF interface. */
struct bpf_link *bpf_bind_network_interface_link;
#endif
} CGroupRuntime;

uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
Expand Down
17 changes: 7 additions & 10 deletions src/core/dbus-cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -1893,6 +1893,7 @@ int bus_cgroup_set_property(

return 1;
}

if (streq(name, "RestrictNetworkInterfaces")) {
int is_allow_list;
_cleanup_strv_free_ char **l = NULL;
Expand Down Expand Up @@ -1958,19 +1959,15 @@ int bus_cgroup_set_property(
if (r < 0)
return r;

if (!ifname_valid_full(s, IFNAME_VALID_ALTERNATIVE))
if (!isempty(s) && !ifname_valid_full(s, IFNAME_VALID_ALTERNATIVE))
return sd_bus_error_setf(reterr_error, SD_BUS_ERROR_INVALID_ARGS, "Invalid interface name: %s", s);

if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
if (isempty(s))
c->bind_network_interface = mfree(c->bind_network_interface);
else {
r = free_and_strdup_warn(&c->bind_network_interface, s);
if (r < 0)
return r;
}

unit_write_settingf(u, flags, name, "BindNetworkInterface=%s", strempty(s));
r = free_and_strdup_warn(&c->bind_network_interface, empty_to_null(s));
if (r < 0)
return r;
if (r > 0)
unit_write_settingf(u, flags, name, "BindNetworkInterface=%s", s);
}

return 1;
Expand Down
7 changes: 3 additions & 4 deletions src/core/unit.c
Original file line number Diff line number Diff line change
Expand Up @@ -5141,12 +5141,11 @@ int unit_setup_exec_runtime(Unit *u) {
return r;
}

CGroupRuntime *unit_setup_cgroup_runtime(Unit *u) {
size_t offset;

CGroupRuntime* unit_setup_cgroup_runtime(Unit *u) {
assert(u);
assert(UNIT_HAS_CGROUP_CONTEXT(u));

offset = UNIT_VTABLE(u)->cgroup_runtime_offset;
size_t offset = UNIT_VTABLE(u)->cgroup_runtime_offset;
assert(offset > 0);

CGroupRuntime **rt = (CGroupRuntime**) ((uint8_t*) u + offset);
Expand Down
2 changes: 1 addition & 1 deletion src/shared/bus-unit-util.c
Original file line number Diff line number Diff line change
Expand Up @@ -2370,6 +2370,7 @@ static const BusProperty cgroup_properties[] = {
{ "SocketBindDeny", bus_append_socket_filter },
{ "MemoryPressureThresholdSec", bus_append_parse_sec_rename },
{ "NFTSet", bus_append_nft_set },
{ "BindNetworkInterface", bus_append_string },

/* While infinity is disallowed in unit file, infinity is allowed in D-Bus API which
* means use the default memory pressure duration from oomd.conf. */
Expand Down Expand Up @@ -2551,7 +2552,6 @@ static const BusProperty execute_properties[] = {
{ "StateDirectoryAccounting", bus_append_parse_boolean },
{ "CacheDirectoryAccounting", bus_append_parse_boolean },
{ "LogsDirectoryAccounting", bus_append_parse_boolean },
{ "BindNetworkInterface", bus_append_string },

{ NULL, bus_try_append_resource_limit, dump_resource_limits },
{}
Expand Down
Loading