You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
449 lines
16 KiB
449 lines
16 KiB
From c67c643418b1df5b5705b3a72eba1e6755830dc5 Mon Sep 17 00:00:00 2001 |
|
From: Jan Synacek <jsynacek@redhat.com> |
|
Date: Tue, 21 Nov 2017 12:46:28 +0100 |
|
Subject: [PATCH] journald: never block when sending messages on NOTIFY_SOCKET |
|
socket |
|
|
|
Otherwise we might run into deadlocks, when journald blocks on the |
|
notify socket on PID 1, and PID 1 blocks on IPC to dbus-daemon and |
|
dbus-daemon blocks on logging to journald. Break this cycle by making |
|
sure that journald never ever blocks on PID 1. |
|
|
|
Note that this change disables support for event loop watchdog support, |
|
as these messages are sent in blocking style by sd-event. That should |
|
not be a big loss though, as people reported frequent problems with the |
|
watchdog hitting journald on excessively slow IO. |
|
|
|
Fixes: #1505. |
|
(cherry-picked from commit e22aa3d3284709234f086ebebc13a905a295b7a7) |
|
|
|
Resolves: #1511565 |
|
--- |
|
src/journal/journald-server.c | 130 +++++++++++++++++++++++++++++++++++++- |
|
src/journal/journald-server.h | 13 ++-- |
|
src/journal/journald-stream.c | 68 ++++++++++++++++++-- |
|
src/journal/journald-stream.h | 3 + |
|
src/journal/journald.c | 8 --- |
|
units/systemd-journald.service.in | 1 - |
|
6 files changed, 201 insertions(+), 22 deletions(-) |
|
|
|
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c |
|
index daeecd519..a810829b2 100644 |
|
--- a/src/journal/journald-server.c |
|
+++ b/src/journal/journald-server.c |
|
@@ -67,6 +67,8 @@ |
|
|
|
#define RECHECK_AVAILABLE_SPACE_USEC (30*USEC_PER_SEC) |
|
|
|
+#define NOTIFY_SNDBUF_SIZE (8*1024*1024) |
|
+ |
|
/* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room |
|
+ * for a bit of additional metadata. */ |
|
#define DEFAULT_LINE_MAX (48*1024) |
|
@@ -1556,6 +1558,126 @@ static int server_open_hostname(Server *s) { |
|
return 0; |
|
} |
|
|
|
+static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) { |
|
+ Server *s = userdata; |
|
+ int r; |
|
+ |
|
+ assert(s); |
|
+ assert(s->notify_event_source == es); |
|
+ assert(s->notify_fd == fd); |
|
+ |
|
+ if (revents != EPOLLOUT) { |
|
+ log_error("Invalid events on notify file descriptor."); |
|
+ return -EINVAL; |
|
+ } |
|
+ |
|
+ /* The $NOTIFY_SOCKET is writable again, now send exactly one |
|
+ * message on it. Either it's the initial READY=1 event or an |
|
+ * stdout stream event. If there's nothing to write anymore, |
|
+ * turn our event source off. The next time there's something |
|
+ * to send it will be turned on again. */ |
|
+ |
|
+ if (!s->sent_notify_ready) { |
|
+ static const char p[] = |
|
+ "READY=1\n" |
|
+ "STATUS=Processing requests..."; |
|
+ ssize_t l; |
|
+ |
|
+ l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT); |
|
+ if (l < 0) { |
|
+ if (errno == EAGAIN) |
|
+ return 0; |
|
+ |
|
+ return log_error_errno(errno, "Failed to send READY=1 notification message: %m"); |
|
+ } |
|
+ |
|
+ s->sent_notify_ready = true; |
|
+ log_debug("Sent READY=1 notification."); |
|
+ |
|
+ } else if (s->stdout_streams_notify_queue) |
|
+ /* Dispatch one stream notification event */ |
|
+ stdout_stream_send_notify(s->stdout_streams_notify_queue); |
|
+ |
|
+ /* Leave us enabled if there's still more to to do. */ |
|
+ if (s->stdout_streams_notify_queue) |
|
+ return 0; |
|
+ |
|
+ /* There was nothing to do anymore, let's turn ourselves off. */ |
|
+ r = sd_event_source_set_enabled(es, SD_EVENT_OFF); |
|
+ if (r < 0) |
|
+ return log_error_errno(r, "Failed to turn off notify event source: %m"); |
|
+ |
|
+ return 0; |
|
+} |
|
+ |
|
+static int server_connect_notify(Server *s) { |
|
+ union sockaddr_union sa = { |
|
+ .un.sun_family = AF_UNIX, |
|
+ }; |
|
+ const char *e; |
|
+ int r; |
|
+ |
|
+ assert(s); |
|
+ assert(s->notify_fd < 0); |
|
+ assert(!s->notify_event_source); |
|
+ |
|
+ /* |
|
+ So here's the problem: we'd like to send notification |
|
+ messages to PID 1, but we cannot do that via sd_notify(), |
|
+ since that's synchronous, and we might end up blocking on |
|
+ it. Specifically: given that PID 1 might block on |
|
+ dbus-daemon during IPC, and dbus-daemon is logging to us, |
|
+ and might hence block on us, we might end up in a deadlock |
|
+ if we block on sending PID 1 notification messages -- by |
|
+ generating a full blocking circle. To avoid this, let's |
|
+ create a non-blocking socket, and connect it to the |
|
+ notification socket, and then wait for POLLOUT before we |
|
+ send anything. This should efficiently avoid any deadlocks, |
|
+ as we'll never block on PID 1, hence PID 1 can safely block |
|
+ on dbus-daemon which can safely block on us again. |
|
+ |
|
+ Don't think that this issue is real? It is, see: |
|
+ https://github.com/systemd/systemd/issues/1505 |
|
+ */ |
|
+ |
|
+ e = getenv("NOTIFY_SOCKET"); |
|
+ if (!e) |
|
+ return 0; |
|
+ |
|
+ if ((e[0] != '@' && e[0] != '/') || e[1] == 0) { |
|
+ log_error("NOTIFY_SOCKET set to an invalid value: %s", e); |
|
+ return -EINVAL; |
|
+ } |
|
+ |
|
+ if (strlen(e) > sizeof(sa.un.sun_path)) { |
|
+ log_error("NOTIFY_SOCKET path too long: %s", e); |
|
+ return -EINVAL; |
|
+ } |
|
+ |
|
+ s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); |
|
+ if (s->notify_fd < 0) |
|
+ return log_error_errno(errno, "Failed to create notify socket: %m"); |
|
+ |
|
+ (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE); |
|
+ |
|
+ strncpy(sa.un.sun_path, e, sizeof(sa.un.sun_path)); |
|
+ if (sa.un.sun_path[0] == '@') |
|
+ sa.un.sun_path[0] = 0; |
|
+ |
|
+ r = connect(s->notify_fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(e)); |
|
+ if (r < 0) |
|
+ return log_error_errno(errno, "Failed to connect to notify socket: %m"); |
|
+ |
|
+ r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s); |
|
+ if (r < 0) |
|
+ return log_error_errno(r, "Failed to watch notification socket: %m"); |
|
+ |
|
+ /* This should fire pretty soon, which we'll use to send the |
|
+ * READY=1 event. */ |
|
+ |
|
+ return 0; |
|
+} |
|
+ |
|
int server_init(Server *s) { |
|
_cleanup_fdset_free_ FDSet *fds = NULL; |
|
int n, r, fd; |
|
@@ -1563,7 +1685,7 @@ int server_init(Server *s) { |
|
assert(s); |
|
|
|
zero(*s); |
|
- s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = -1; |
|
+ s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = s->notify_fd = -1; |
|
s->compress = true; |
|
s->seal = true; |
|
|
|
@@ -1611,8 +1733,6 @@ int server_init(Server *s) { |
|
if (r < 0) |
|
return log_error_errno(r, "Failed to create event loop: %m"); |
|
|
|
- sd_event_set_watchdog(s->event, true); |
|
- |
|
n = sd_listen_fds(true); |
|
if (n < 0) |
|
return log_error_errno(n, "Failed to read listening file descriptors from environment: %m"); |
|
@@ -1718,6 +1838,8 @@ int server_init(Server *s) { |
|
server_cache_boot_id(s); |
|
server_cache_machine_id(s); |
|
|
|
+ (void) server_connect_notify(s); |
|
+ |
|
r = system_journal_open(s, false); |
|
if (r < 0) |
|
return r; |
|
@@ -1770,6 +1892,7 @@ void server_done(Server *s) { |
|
sd_event_source_unref(s->sigterm_event_source); |
|
sd_event_source_unref(s->sigint_event_source); |
|
sd_event_source_unref(s->hostname_event_source); |
|
+ sd_event_source_unref(s->notify_event_source); |
|
sd_event_unref(s->event); |
|
|
|
safe_close(s->syslog_fd); |
|
@@ -1778,6 +1901,7 @@ void server_done(Server *s) { |
|
safe_close(s->dev_kmsg_fd); |
|
safe_close(s->audit_fd); |
|
safe_close(s->hostname_fd); |
|
+ safe_close(s->notify_fd); |
|
|
|
if (s->rate_limit) |
|
journal_rate_limit_free(s->rate_limit); |
|
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h |
|
index b29410778..e59ff35e2 100644 |
|
--- a/src/journal/journald-server.h |
|
+++ b/src/journal/journald-server.h |
|
@@ -27,12 +27,15 @@ |
|
#include <sys/types.h> |
|
#include <sys/socket.h> |
|
|
|
+typedef struct Server Server; |
|
+ |
|
#include "sd-event.h" |
|
#include "journal-file.h" |
|
#include "hashmap.h" |
|
#include "util.h" |
|
#include "audit.h" |
|
#include "journald-rate-limit.h" |
|
+#include "journald-stream.h" |
|
#include "list.h" |
|
|
|
typedef enum Storage { |
|
@@ -52,15 +55,14 @@ typedef enum SplitMode { |
|
_SPLIT_INVALID = -1 |
|
} SplitMode; |
|
|
|
-typedef struct StdoutStream StdoutStream; |
|
- |
|
-typedef struct Server { |
|
+struct Server { |
|
int syslog_fd; |
|
int native_fd; |
|
int stdout_fd; |
|
int dev_kmsg_fd; |
|
int audit_fd; |
|
int hostname_fd; |
|
+ int notify_fd; |
|
|
|
sd_event *event; |
|
|
|
@@ -75,6 +77,7 @@ typedef struct Server { |
|
sd_event_source *sigterm_event_source; |
|
sd_event_source *sigint_event_source; |
|
sd_event_source *hostname_event_source; |
|
+ sd_event_source *notify_event_source; |
|
|
|
JournalFile *runtime_journal; |
|
JournalFile *system_journal; |
|
@@ -114,6 +117,7 @@ typedef struct Server { |
|
usec_t oldest_file_usec; |
|
|
|
LIST_HEAD(StdoutStream, stdout_streams); |
|
+ LIST_HEAD(StdoutStream, stdout_streams_notify_queue); |
|
unsigned n_stdout_streams; |
|
|
|
char *tty_path; |
|
@@ -135,6 +139,7 @@ typedef struct Server { |
|
|
|
struct udev *udev; |
|
|
|
+ bool sent_notify_ready; |
|
bool sync_scheduled; |
|
|
|
char machine_id_field[sizeof("_MACHINE_ID=") + 32]; |
|
@@ -145,7 +150,7 @@ typedef struct Server { |
|
char *cgroup_root; |
|
|
|
size_t line_max; |
|
-} Server; |
|
+}; |
|
|
|
#define N_IOVEC_META_FIELDS 20 |
|
#define N_IOVEC_KERNEL_FIELDS 64 |
|
diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c |
|
index 4d6b7ad18..9118d1a31 100644 |
|
--- a/src/journal/journald-stream.c |
|
+++ b/src/journal/journald-stream.c |
|
@@ -80,6 +80,7 @@ struct StdoutStream { |
|
bool forward_to_console:1; |
|
|
|
bool fdstore:1; |
|
+ bool in_notify_queue:1; |
|
|
|
char *buffer; |
|
size_t length; |
|
@@ -90,6 +91,7 @@ struct StdoutStream { |
|
char *state_file; |
|
|
|
LIST_FIELDS(StdoutStream, stdout_stream); |
|
+ LIST_FIELDS(StdoutStream, stdout_stream_notify_queue); |
|
|
|
char id_field[sizeof("_STREAM_ID=")-1 + SD_ID128_STRING_MAX]; |
|
}; |
|
@@ -102,6 +104,9 @@ void stdout_stream_free(StdoutStream *s) { |
|
assert(s->server->n_stdout_streams > 0); |
|
s->server->n_stdout_streams --; |
|
LIST_REMOVE(stdout_stream, s->server->stdout_streams, s); |
|
+ |
|
+ if (s->in_notify_queue) |
|
+ LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); |
|
} |
|
|
|
if (s->event_source) { |
|
@@ -126,7 +131,7 @@ static void stdout_stream_destroy(StdoutStream *s) { |
|
return; |
|
|
|
if (s->state_file) |
|
- unlink(s->state_file); |
|
+ (void) unlink(s->state_file); |
|
|
|
stdout_stream_free(s); |
|
} |
|
@@ -210,11 +215,15 @@ static int stdout_stream_save(StdoutStream *s) { |
|
free(temp_path); |
|
temp_path = NULL; |
|
|
|
- /* Store the connection fd in PID 1, so that we get it passed |
|
- * in again on next start */ |
|
- if (!s->fdstore) { |
|
- sd_pid_notify_with_fds(0, false, "FDSTORE=1", &s->fd, 1); |
|
- s->fdstore = true; |
|
+ if (!s->fdstore && !s->in_notify_queue) { |
|
+ LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); |
|
+ s->in_notify_queue = true; |
|
+ |
|
+ if (s->server->notify_event_source) { |
|
+ r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON); |
|
+ if (r < 0) |
|
+ log_warning_errno(r, "Failed to enable notify event source: %m"); |
|
+ } |
|
} |
|
|
|
finish: |
|
@@ -801,3 +810,50 @@ int server_open_stdout_socket(Server *s, FDSet *fds) { |
|
|
|
return 0; |
|
} |
|
+ |
|
+void stdout_stream_send_notify(StdoutStream *s) { |
|
+ struct iovec iovec = { |
|
+ .iov_base = (char*) "FDSTORE=1", |
|
+ .iov_len = strlen("FDSTORE=1"), |
|
+ }; |
|
+ struct msghdr msghdr = { |
|
+ .msg_iov = &iovec, |
|
+ .msg_iovlen = 1, |
|
+ }; |
|
+ struct cmsghdr *cmsg; |
|
+ ssize_t l; |
|
+ |
|
+ assert(s); |
|
+ assert(!s->fdstore); |
|
+ assert(s->in_notify_queue); |
|
+ assert(s->server); |
|
+ assert(s->server->notify_fd >= 0); |
|
+ |
|
+ /* Store the connection fd in PID 1, so that we get it passed |
|
+ * in again on next start */ |
|
+ |
|
+ msghdr.msg_controllen = CMSG_SPACE(sizeof(int)); |
|
+ msghdr.msg_control = alloca0(msghdr.msg_controllen); |
|
+ |
|
+ cmsg = CMSG_FIRSTHDR(&msghdr); |
|
+ cmsg->cmsg_level = SOL_SOCKET; |
|
+ cmsg->cmsg_type = SCM_RIGHTS; |
|
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int)); |
|
+ |
|
+ memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int)); |
|
+ |
|
+ l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL); |
|
+ if (l < 0) { |
|
+ if (errno == EAGAIN) |
|
+ return; |
|
+ |
|
+ log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m"); |
|
+ } else { |
|
+ log_debug("Successfully sent stream file descriptor to service manager."); |
|
+ s->fdstore = 1; |
|
+ } |
|
+ |
|
+ LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); |
|
+ s->in_notify_queue = false; |
|
+ |
|
+} |
|
diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h |
|
index 94bf955d7..fd8c94fb6 100644 |
|
--- a/src/journal/journald-stream.h |
|
+++ b/src/journal/journald-stream.h |
|
@@ -21,9 +21,12 @@ |
|
along with systemd; If not, see <http://www.gnu.org/licenses/>. |
|
***/ |
|
|
|
+typedef struct StdoutStream StdoutStream; |
|
+ |
|
#include "fdset.h" |
|
#include "journald-server.h" |
|
|
|
int server_open_stdout_socket(Server *s, FDSet *fds); |
|
|
|
void stdout_stream_free(StdoutStream *s); |
|
+void stdout_stream_send_notify(StdoutStream *s); |
|
diff --git a/src/journal/journald.c b/src/journal/journald.c |
|
index 15bbcbe3d..b7ba2b6ec 100644 |
|
--- a/src/journal/journald.c |
|
+++ b/src/journal/journald.c |
|
@@ -64,10 +64,6 @@ int main(int argc, char *argv[]) { |
|
log_debug("systemd-journald running as pid "PID_FMT, getpid()); |
|
server_driver_message(&server, SD_MESSAGE_JOURNAL_START, "Journal started"); |
|
|
|
- sd_notify(false, |
|
- "READY=1\n" |
|
- "STATUS=Processing requests..."); |
|
- |
|
for (;;) { |
|
usec_t t = USEC_INFINITY, n; |
|
|
|
@@ -120,10 +116,6 @@ int main(int argc, char *argv[]) { |
|
server_driver_message(&server, SD_MESSAGE_JOURNAL_STOP, "Journal stopped"); |
|
|
|
finish: |
|
- sd_notify(false, |
|
- "STOPPING=1\n" |
|
- "STATUS=Shutting down..."); |
|
- |
|
server_done(&server); |
|
|
|
return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS; |
|
diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in |
|
index 0d1ea61fe..c94c0bfba 100644 |
|
--- a/units/systemd-journald.service.in |
|
+++ b/units/systemd-journald.service.in |
|
@@ -22,7 +22,6 @@ RestartSec=0 |
|
StandardOutput=null |
|
FileDescriptorStoreMax=4224 |
|
CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE |
|
-WatchdogSec=3min |
|
|
|
# Increase the default a bit in order to allow many simultaneous |
|
# services being run since we keep one fd open per service. Also, when
|
|
|