You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
489 lines
20 KiB
489 lines
20 KiB
From 92b12c7dc013c95bd0d35bae99ff6df023ce0e1f Mon Sep 17 00:00:00 2001 |
|
From: Lennart Poettering <lennart@poettering.net> |
|
Date: Wed, 4 May 2016 20:43:23 +0200 |
|
Subject: [PATCH] core: use an AF_UNIX/SOCK_DGRAM socket for cgroup agent |
|
notification |
|
|
|
dbus-daemon currently uses a backlog of 30 on its D-Bus system bus socket. On |
|
overloaded systems this means that only 30 connections may be queued without |
|
dbus-daemon processing them before further connection attempts fail. Our |
|
cgroups-agent binary so far used D-Bus for its messaging, and hitting this |
|
limit hence may result in us losing cgroup empty messages. |
|
|
|
This patch adds a separate cgroup agent socket of type AF_UNIX/SOCK_DGRAM. |
|
Since sockets of these types need no connection set up, no listen() backlog |
|
applies. Our cgroup-agent binary will hence simply block as long as it can't |
|
enqueue its datagram message, so that we won't lose cgroup empty messages as |
|
likely anymore. |
|
|
|
This also rearranges the ordering of the processing of SIGCHLD signals, service |
|
notification messages (sd_notify()...) and the two types of cgroup |
|
notifications (inotify for the unified hierarchy support, and agent for the |
|
classic hierarchy support). We now always process events for these in the |
|
following order: |
|
|
|
1. service notification messages (SD_EVENT_PRIORITY_NORMAL-7) |
|
2. SIGCHLD signals (SD_EVENT_PRIORITY_NORMAL-6) |
|
3. cgroup inotify and cgroup agent (SD_EVENT_PRIORITY_NORMAL-5) |
|
|
|
This is because when receiving SIGCHLD we invalidate PID information, which we |
|
need to process the service notification messages which are bound to PIDs. |
|
Hence the order between the first two items. And we want to process SIGCHLD |
|
metadata to detect whether a service is gone, before using cgroup |
|
notifications, to decide when a service is gone, since the former carries more |
|
useful metadata. |
|
|
|
Related to this: |
|
https://bugs.freedesktop.org/show_bug.cgi?id=95264 |
|
https://github.com/systemd/systemd/issues/1961 |
|
|
|
Cherry-picked from: d8fdc62037b5b0a9fd603ad5efd6b49f956f86b5 |
|
Resolves: #1305608 |
|
--- |
|
src/cgroups-agent/cgroups-agent.c | 48 ++++++------ |
|
src/core/cgroup.c | 2 + |
|
src/core/dbus.c | 56 +++++++------- |
|
src/core/dbus.h | 2 + |
|
src/core/manager.c | 149 ++++++++++++++++++++++++++++++++++++-- |
|
src/core/manager.h | 3 + |
|
6 files changed, 198 insertions(+), 62 deletions(-) |
|
|
|
diff --git a/src/cgroups-agent/cgroups-agent.c b/src/cgroups-agent/cgroups-agent.c |
|
index 529e84303..2fe65830e 100644 |
|
--- a/src/cgroups-agent/cgroups-agent.c |
|
+++ b/src/cgroups-agent/cgroups-agent.c |
|
@@ -1,5 +1,3 @@ |
|
-/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ |
|
- |
|
/*** |
|
This file is part of systemd. |
|
|
|
@@ -20,14 +18,21 @@ |
|
***/ |
|
|
|
#include <stdlib.h> |
|
+#include <sys/socket.h> |
|
|
|
-#include "sd-bus.h" |
|
#include "log.h" |
|
-#include "bus-util.h" |
|
+#include "socket-util.h" |
|
|
|
int main(int argc, char *argv[]) { |
|
- _cleanup_bus_close_unref_ sd_bus *bus = NULL; |
|
- int r; |
|
+ |
|
+ static const union sockaddr_union sa = { |
|
+ .un.sun_family = AF_UNIX, |
|
+ .un.sun_path = "/run/systemd/cgroups-agent", |
|
+ }; |
|
+ |
|
+ _cleanup_close_ int fd = -1; |
|
+ ssize_t n; |
|
+ size_t l; |
|
|
|
if (argc != 2) { |
|
log_error("Incorrect number of arguments."); |
|
@@ -38,27 +43,22 @@ int main(int argc, char *argv[]) { |
|
log_parse_environment(); |
|
log_open(); |
|
|
|
- /* We send this event to the private D-Bus socket and then the |
|
- * system instance will forward this to the system bus. We do |
|
- * this to avoid an activation loop when we start dbus when we |
|
- * are called when the dbus service is shut down. */ |
|
- |
|
- r = bus_open_system_systemd(&bus); |
|
- if (r < 0) { |
|
- /* If we couldn't connect we assume this was triggered |
|
- * while systemd got restarted/transitioned from |
|
- * initrd to the system, so let's ignore this */ |
|
- log_debug_errno(r, "Failed to get D-Bus connection: %m"); |
|
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); |
|
+ if (fd < 0) { |
|
+ log_debug_errno(errno, "Failed to allocate socket: %m"); |
|
+ return EXIT_FAILURE; |
|
+ } |
|
+ |
|
+ l = strlen(argv[1]); |
|
+ |
|
+ n = sendto(fd, argv[1], l, 0, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path)); |
|
+ if (n < 0) { |
|
+ log_debug_errno(errno, "Failed to send cgroups agent message: %m"); |
|
return EXIT_FAILURE; |
|
} |
|
|
|
- r = sd_bus_emit_signal(bus, |
|
- "/org/freedesktop/systemd1/agent", |
|
- "org.freedesktop.systemd1.Agent", |
|
- "Released", |
|
- "s", argv[1]); |
|
- if (r < 0) { |
|
- log_debug_errno(r, "Failed to send signal message on private connection: %m"); |
|
+ if ((size_t) n != l) { |
|
+ log_debug("Datagram size mismatch"); |
|
return EXIT_FAILURE; |
|
} |
|
|
|
diff --git a/src/core/cgroup.c b/src/core/cgroup.c |
|
index 10fdcc998..b7f08fb42 100644 |
|
--- a/src/core/cgroup.c |
|
+++ b/src/core/cgroup.c |
|
@@ -1028,6 +1028,8 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { |
|
assert(m); |
|
assert(cgroup); |
|
|
|
+ log_debug("Got cgroup empty notification for: %s", cgroup); |
|
+ |
|
u = manager_get_unit_by_cgroup(m, cgroup); |
|
if (u) { |
|
r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true); |
|
diff --git a/src/core/dbus.c b/src/core/dbus.c |
|
index 85b517486..29524d49a 100644 |
|
--- a/src/core/dbus.c |
|
+++ b/src/core/dbus.c |
|
@@ -72,12 +72,37 @@ int bus_send_queued_message(Manager *m) { |
|
return 0; |
|
} |
|
|
|
+int bus_forward_agent_released(Manager *m, const char *path) { |
|
+ int r; |
|
+ |
|
+ assert(m); |
|
+ assert(path); |
|
+ |
|
+ if (!m->running_as == SYSTEMD_SYSTEM) |
|
+ return 0; |
|
+ |
|
+ if (!m->system_bus) |
|
+ return 0; |
|
+ |
|
+ /* If we are running a system instance we forward the agent message on the system bus, so that the user |
|
+ * instances get notified about this, too */ |
|
+ |
|
+ r = sd_bus_emit_signal(m->system_bus, |
|
+ "/org/freedesktop/systemd1/agent", |
|
+ "org.freedesktop.systemd1.Agent", |
|
+ "Released", |
|
+ "s", path); |
|
+ if (r < 0) |
|
+ return log_warning_errno(r, "Failed to propagate agent release message: %m"); |
|
+ |
|
+ return 1; |
|
+} |
|
+ |
|
static int signal_agent_released(sd_bus *bus, sd_bus_message *message, void *userdata, sd_bus_error *error) { |
|
Manager *m = userdata; |
|
const char *cgroup; |
|
int r; |
|
|
|
- assert(bus); |
|
assert(message); |
|
assert(m); |
|
|
|
@@ -88,16 +113,6 @@ static int signal_agent_released(sd_bus *bus, sd_bus_message *message, void *use |
|
} |
|
|
|
manager_notify_cgroup_empty(m, cgroup); |
|
- |
|
- if (m->running_as == SYSTEMD_SYSTEM && m->system_bus) { |
|
- /* If we are running as system manager, forward the |
|
- * message to the system bus */ |
|
- |
|
- r = sd_bus_send(m->system_bus, message, NULL); |
|
- if (r < 0) |
|
- log_warning_errno(r, "Failed to forward Released message: %m"); |
|
- } |
|
- |
|
return 0; |
|
} |
|
|
|
@@ -679,25 +694,6 @@ static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void |
|
return 0; |
|
} |
|
|
|
- if (m->running_as == SYSTEMD_SYSTEM) { |
|
- /* When we run as system instance we get the Released |
|
- * signal via a direct connection */ |
|
- |
|
- r = sd_bus_add_match( |
|
- bus, |
|
- NULL, |
|
- "type='signal'," |
|
- "interface='org.freedesktop.systemd1.Agent'," |
|
- "member='Released'," |
|
- "path='/org/freedesktop/systemd1/agent'", |
|
- signal_agent_released, m); |
|
- |
|
- if (r < 0) { |
|
- log_warning_errno(r, "Failed to register Released match on new connection bus: %m"); |
|
- return 0; |
|
- } |
|
- } |
|
- |
|
r = bus_setup_disconnected_match(m, bus); |
|
if (r < 0) |
|
return 0; |
|
diff --git a/src/core/dbus.h b/src/core/dbus.h |
|
index d04f5326c..c27d136e3 100644 |
|
--- a/src/core/dbus.h |
|
+++ b/src/core/dbus.h |
|
@@ -40,3 +40,5 @@ int bus_verify_manage_unit_async(Manager *m, sd_bus_message *call, sd_bus_error |
|
int bus_verify_manage_unit_async_for_kill(Manager *m, sd_bus_message *call, sd_bus_error *error); |
|
int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error); |
|
int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error); |
|
+ |
|
+int bus_forward_agent_released(Manager *m, const char *path); |
|
diff --git a/src/core/manager.c b/src/core/manager.c |
|
index ee456fb79..370c8cbbe 100644 |
|
--- a/src/core/manager.c |
|
+++ b/src/core/manager.c |
|
@@ -83,8 +83,10 @@ |
|
#define JOBS_IN_PROGRESS_WAIT_USEC (5*USEC_PER_SEC) |
|
#define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3) |
|
#define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3 |
|
+#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024) |
|
|
|
static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); |
|
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); |
|
static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); |
|
static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); |
|
static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); |
|
@@ -456,11 +458,11 @@ static int manager_setup_signals(Manager *m) { |
|
if (r < 0) |
|
return r; |
|
|
|
- /* Process signals a bit earlier than the rest of things, but |
|
- * later than notify_fd processing, so that the notify |
|
- * processing can still figure out to which process/service a |
|
- * message belongs, before we reap the process. */ |
|
- r = sd_event_source_set_priority(m->signal_event_source, -5); |
|
+ /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the |
|
+ * notify processing can still figure out to which process/service a message belongs, before we reap the |
|
+ * process. Also, process this before handling cgroup notifications, so that we always collect child exit |
|
+ * status information before detecting that there's no process in a cgroup. */ |
|
+ r = sd_event_source_set_priority(m->signal_event_source, -6); |
|
if (r < 0) |
|
return r; |
|
|
|
@@ -541,7 +543,7 @@ int manager_new(SystemdRunningAs running_as, bool test_run, Manager **_m) { |
|
|
|
m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1; |
|
|
|
- m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1; |
|
+ m->pin_cgroupfs_fd = m->notify_fd = m->cgroups_agent_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1; |
|
m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */ |
|
|
|
m->ask_password_inotify_fd = -1; |
|
@@ -689,8 +691,8 @@ static int manager_setup_notify(Manager *m) { |
|
if (r < 0) |
|
return log_error_errno(r, "Failed to allocate notify event source: %m"); |
|
|
|
- /* Process signals a bit earlier than SIGCHLD, so that we can |
|
- * still identify to which service an exit message belongs */ |
|
+ /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which |
|
+ * service an exit message belongs. */ |
|
r = sd_event_source_set_priority(m->notify_event_source, -7); |
|
if (r < 0) |
|
return log_error_errno(r, "Failed to set priority of notify event source: %m"); |
|
@@ -699,6 +701,77 @@ static int manager_setup_notify(Manager *m) { |
|
return 0; |
|
} |
|
|
|
+static int manager_setup_cgroups_agent(Manager *m) { |
|
+ |
|
+ static const union sockaddr_union sa = { |
|
+ .un.sun_family = AF_UNIX, |
|
+ .un.sun_path = "/run/systemd/cgroups-agent", |
|
+ }; |
|
+ int r; |
|
+ |
|
+ /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering |
|
+ * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and |
|
+ * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on |
|
+ * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number |
|
+ * of D-Bus connections may be queued until the kernel will start dropping further incoming connections, |
|
+ * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX |
|
+ * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and |
|
+ * we thus won't lose messages. |
|
+ * |
|
+ * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen |
|
+ * to it. The system instance hence listens on this special socket, but the user instances listen on the system |
|
+ * bus for these messages. */ |
|
+ |
|
+ if (m->test_run) |
|
+ return 0; |
|
+ |
|
+ if (!m->running_as == SYSTEMD_SYSTEM) |
|
+ return 0; |
|
+ |
|
+ if (m->cgroups_agent_fd < 0) { |
|
+ _cleanup_close_ int fd = -1; |
|
+ |
|
+ /* First free all secondary fields */ |
|
+ m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); |
|
+ |
|
+ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); |
|
+ if (fd < 0) |
|
+ return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m"); |
|
+ |
|
+ fd_inc_rcvbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE); |
|
+ |
|
+ (void) unlink(sa.un.sun_path); |
|
+ |
|
+ /* Only allow root to connect to this socket */ |
|
+ RUN_WITH_UMASK(0077) |
|
+ r = bind(fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path)); |
|
+ if (r < 0) |
|
+ return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); |
|
+ |
|
+ m->cgroups_agent_fd = fd; |
|
+ fd = -1; |
|
+ } |
|
+ |
|
+ if (!m->cgroups_agent_event_source) { |
|
+ r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m); |
|
+ if (r < 0) |
|
+ return log_error_errno(r, "Failed to allocate cgroups agent event source: %m"); |
|
+ |
|
+ /* Process cgroups notifications early, but after having processed service notification messages or |
|
+ * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification, |
|
+ * and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of |
|
+ * cgroup inotify for the unified cgroup stuff. */ |
|
+ r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-5); |
|
+ if (r < 0) |
|
+ return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); |
|
+ |
|
+ (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent"); |
|
+ } |
|
+ |
|
+ return 0; |
|
+} |
|
+ |
|
+ |
|
static int manager_setup_kdbus(Manager *m) { |
|
#ifdef ENABLE_KDBUS |
|
_cleanup_free_ char *p = NULL; |
|
@@ -912,6 +985,7 @@ Manager* manager_free(Manager *m) { |
|
|
|
sd_event_source_unref(m->signal_event_source); |
|
sd_event_source_unref(m->notify_event_source); |
|
+ sd_event_source_unref(m->cgroups_agent_event_source); |
|
sd_event_source_unref(m->time_change_event_source); |
|
sd_event_source_unref(m->jobs_in_progress_event_source); |
|
sd_event_source_unref(m->idle_pipe_event_source); |
|
@@ -919,6 +993,7 @@ Manager* manager_free(Manager *m) { |
|
|
|
safe_close(m->signal_fd); |
|
safe_close(m->notify_fd); |
|
+ safe_close(m->cgroups_agent_fd); |
|
safe_close(m->time_change_fd); |
|
safe_close(m->kdbus_fd); |
|
|
|
@@ -1167,6 +1242,10 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds) { |
|
if (q < 0 && r == 0) |
|
r = q; |
|
|
|
+ q = manager_setup_cgroups_agent(m); |
|
+ if (q < 0 && r == 0) |
|
+ r = q; |
|
+ |
|
/* We might have deserialized the kdbus control fd, but if we |
|
* didn't, then let's create the bus now. */ |
|
manager_setup_kdbus(m); |
|
@@ -1492,6 +1571,35 @@ static unsigned manager_dispatch_dbus_queue(Manager *m) { |
|
return n; |
|
} |
|
|
|
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { |
|
+ Manager *m = userdata; |
|
+ char buf[PATH_MAX+1]; |
|
+ ssize_t n; |
|
+ |
|
+ n = recv(fd, buf, sizeof(buf), 0); |
|
+ if (n < 0) |
|
+ return log_error_errno(errno, "Failed to read cgroups agent message: %m"); |
|
+ if (n == 0) { |
|
+ log_error("Got zero-length cgroups agent message, ignoring."); |
|
+ return 0; |
|
+ } |
|
+ if ((size_t) n >= sizeof(buf)) { |
|
+ log_error("Got overly long cgroups agent message, ignoring."); |
|
+ return 0; |
|
+ } |
|
+ |
|
+ if (memchr(buf, 0, n)) { |
|
+ log_error("Got cgroups agent message with embedded NUL byte, ignoring."); |
|
+ return 0; |
|
+ } |
|
+ buf[n] = 0; |
|
+ |
|
+ manager_notify_cgroup_empty(m, buf); |
|
+ bus_forward_agent_released(m, buf); |
|
+ |
|
+ return 0; |
|
+} |
|
+ |
|
static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, char *buf, size_t n, FDSet *fds) { |
|
_cleanup_strv_free_ char **tags = NULL; |
|
|
|
@@ -2304,6 +2412,16 @@ int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root) { |
|
fprintf(f, "notify-socket=%s\n", m->notify_socket); |
|
} |
|
|
|
+ if (m->cgroups_agent_fd >= 0) { |
|
+ int copy; |
|
+ |
|
+ copy = fdset_put_dup(fds, m->cgroups_agent_fd); |
|
+ if (copy < 0) |
|
+ return copy; |
|
+ |
|
+ fprintf(f, "cgroups-agent-fd=%i\n", copy); |
|
+ } |
|
+ |
|
if (m->kdbus_fd >= 0) { |
|
int copy; |
|
|
|
@@ -2473,6 +2591,17 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { |
|
free(m->notify_socket); |
|
m->notify_socket = n; |
|
|
|
+ } else if (startswith(l, "cgroups-agent-fd=")) { |
|
+ int fd; |
|
+ |
|
+ if (safe_atoi(l + 17, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) |
|
+ log_debug("Failed to parse cgroups agent fd: %s", l + 10); |
|
+ else { |
|
+ m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); |
|
+ safe_close(m->cgroups_agent_fd); |
|
+ m->cgroups_agent_fd = fdset_remove(fds, fd); |
|
+ } |
|
+ |
|
} else if (startswith(l, "kdbus-fd=")) { |
|
int fd; |
|
|
|
@@ -2599,6 +2728,10 @@ int manager_reload(Manager *m) { |
|
if (q < 0 && r >= 0) |
|
r = q; |
|
|
|
+ q = manager_setup_cgroups_agent(m); |
|
+ if (q < 0 && r >= 0) |
|
+ r = q; |
|
+ |
|
/* Third, fire things up! */ |
|
q = manager_coldplug(m); |
|
if (q < 0 && r >= 0) |
|
diff --git a/src/core/manager.h b/src/core/manager.h |
|
index d3971f168..3e855db46 100644 |
|
--- a/src/core/manager.h |
|
+++ b/src/core/manager.h |
|
@@ -137,6 +137,9 @@ struct Manager { |
|
int notify_fd; |
|
sd_event_source *notify_event_source; |
|
|
|
+ int cgroups_agent_fd; |
|
+ sd_event_source *cgroups_agent_event_source; |
|
+ |
|
int signal_fd; |
|
sd_event_source *signal_event_source; |
|
|
|
|