You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
170 lines
6.6 KiB
170 lines
6.6 KiB
6 years ago
|
From db57bf73d3e5e650b261834a0c39c9d368f9eeea Mon Sep 17 00:00:00 2001
|
||
|
From: Kyle Walker <kwalker@redhat.com>
|
||
|
Date: Thu, 14 Dec 2017 11:46:03 -0500
|
||
|
Subject: [PATCH] core: Implement sync_with_progress()
|
||
|
|
||
|
In similar fashion to the previous change, sync() operations can stall
|
||
|
endlessly if cache is unable to be written out. In order to avoid an
|
||
|
unbounded hang, the sync takes place within a child process. Every 10
|
||
|
seconds (SYNC_TIMEOUT_USEC), the value of /proc/meminfo "Dirty" is checked
|
||
|
to verify it is smaller than the last iteration. If the sync is not making
|
||
|
progress for 3 successive iterations (SYNC_PROGRESS_ATTEMPTS), a SIGKILL is
|
||
|
sent to the sync process and the shutdown continues.
|
||
|
|
||
|
(cherry picked from commit 73ad712fcfea5d8ba475044698d31d2c15d4180d)
|
||
|
|
||
|
Related: #1571098
|
||
|
---
|
||
|
src/core/shutdown.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++---
|
||
|
1 file changed, 111 insertions(+), 5 deletions(-)
|
||
|
|
||
|
diff --git a/src/core/shutdown.c b/src/core/shutdown.c
|
||
|
index 71f001ac1..0b0a54a7d 100644
|
||
|
--- a/src/core/shutdown.c
|
||
|
+++ b/src/core/shutdown.c
|
||
|
@@ -53,6 +53,9 @@
|
||
|
|
||
|
#define FINALIZE_ATTEMPTS 50
|
||
|
|
||
|
+#define SYNC_PROGRESS_ATTEMPTS 3
|
||
|
+#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)
|
||
|
+
|
||
|
static char* arg_verb;
|
||
|
|
||
|
static int parse_argv(int argc, char *argv[]) {
|
||
|
@@ -152,6 +155,102 @@ static int switch_root_initramfs(void) {
|
||
|
return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
|
||
|
}
|
||
|
|
||
|
+/* Read the following fields from /proc/meminfo:
|
||
|
+ *
|
||
|
+ * NFS_Unstable
|
||
|
+ * Writeback
|
||
|
+ * Dirty
|
||
|
+ *
|
||
|
+ * Return true if the sum of these fields is smaller than the previous
|
||
|
+ * value input. For all other issues, report the failure and indicate that
|
||
|
+ * the sync is not making progress.
|
||
|
+ */
|
||
|
+static bool sync_making_progress(unsigned long long *prev_dirty) {
|
||
|
+ _cleanup_fclose_ FILE *f = NULL;
|
||
|
+ char line[LINE_MAX];
|
||
|
+ bool r = false;
|
||
|
+ unsigned long long val = 0;
|
||
|
+
|
||
|
+ f = fopen("/proc/meminfo", "re");
|
||
|
+ if (!f)
|
||
|
+ return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");
|
||
|
+
|
||
|
+ FOREACH_LINE(line, f, log_warning_errno(errno, "Failed to parse /proc/meminfo: %m")) {
|
||
|
+ unsigned long long ull = 0;
|
||
|
+
|
||
|
+ if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
|
||
|
+ continue;
|
||
|
+
|
||
|
+ errno = 0;
|
||
|
+ if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
|
||
|
+ if (errno != 0)
|
||
|
+ log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
|
||
|
+ else
|
||
|
+ log_warning("Failed to parse /proc/meminfo");
|
||
|
+
|
||
|
+ return false;
|
||
|
+ }
|
||
|
+
|
||
|
+ val += ull;
|
||
|
+ }
|
||
|
+
|
||
|
+ r = *prev_dirty > val;
|
||
|
+
|
||
|
+ *prev_dirty = val;
|
||
|
+
|
||
|
+ return r;
|
||
|
+}
|
||
|
+
|
||
|
+static void sync_with_progress(void) {
|
||
|
+ unsigned checks;
|
||
|
+ pid_t pid;
|
||
|
+ int r;
|
||
|
+ unsigned long long dirty = ULONG_LONG_MAX;
|
||
|
+
|
||
|
+ BLOCK_SIGNALS(SIGCHLD);
|
||
|
+
|
||
|
+ /* Due to the possibility of the sync operation hanging, we fork
|
||
|
+ * a child process and monitor the progress. If the timeout
|
||
|
+ * lapses, the assumption is that that particular sync stalled. */
|
||
|
+ pid = fork();
|
||
|
+ if (pid < 0) {
|
||
|
+ log_error_errno(errno, "Failed to fork: %m");
|
||
|
+ return;
|
||
|
+ }
|
||
|
+
|
||
|
+ if (pid == 0) {
|
||
|
+ /* Start the sync operation here in the child */
|
||
|
+ sync();
|
||
|
+ _exit(EXIT_SUCCESS);
|
||
|
+ }
|
||
|
+
|
||
|
+ log_info("Syncing filesystems and block devices.");
|
||
|
+
|
||
|
+ /* Start monitoring the sync operation. If more than
|
||
|
+ * SYNC_PROGRESS_ATTEMPTS lapse without progress being made,
|
||
|
+ * we assume that the sync is stalled */
|
||
|
+ for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) {
|
||
|
+ r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
|
||
|
+ if (r == 0)
|
||
|
+ /* Sync finished without error.
|
||
|
+ * (The sync itself does not return an error code) */
|
||
|
+ return;
|
||
|
+ else if (r == -ETIMEDOUT) {
|
||
|
+ /* Reset the check counter if the "Dirty" value is
|
||
|
+ * decreasing */
|
||
|
+ if (sync_making_progress(&dirty))
|
||
|
+ checks = 0;
|
||
|
+ } else {
|
||
|
+ log_error_errno(r, "Failed to sync filesystems and block devices: %m");
|
||
|
+ return;
|
||
|
+ }
|
||
|
+ }
|
||
|
+
|
||
|
+ /* Only reached in the event of a timeout. We should issue a kill
|
||
|
+ * to the stray process. */
|
||
|
+ log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
|
||
|
+ (void) kill(pid, SIGKILL);
|
||
|
+}
|
||
|
|
||
|
int main(int argc, char *argv[]) {
|
||
|
bool need_umount, need_swapoff, need_loop_detach, need_dm_detach;
|
||
|
@@ -202,6 +301,13 @@ int main(int argc, char *argv[]) {
|
||
|
/* lock us into memory */
|
||
|
mlockall(MCL_CURRENT|MCL_FUTURE);
|
||
|
|
||
|
+ /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
|
||
|
+ * slow IO is processed here already and the final process killing spree is not impacted by processes
|
||
|
+ * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
|
||
|
+ * result. */
|
||
|
+ if (!in_container)
|
||
|
+ sync_with_progress();
|
||
|
+
|
||
|
log_info("Sending SIGTERM to remaining processes...");
|
||
|
broadcast_signal(SIGTERM, true, true);
|
||
|
|
||
|
@@ -338,12 +444,12 @@ int main(int argc, char *argv[]) {
|
||
|
need_loop_detach ? " loop devices," : "",
|
||
|
need_dm_detach ? " DM devices," : "");
|
||
|
|
||
|
- /* The kernel will automaticall flush ATA disks and suchlike
|
||
|
- * on reboot(), but the file systems need to be synce'd
|
||
|
- * explicitly in advance. So let's do this here, but not
|
||
|
- * needlessly slow down containers. */
|
||
|
+ /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
|
||
|
+ * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
|
||
|
+ * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
|
||
|
+ * let's do it once more. Do not remove this sync, data corruption will result. */
|
||
|
if (!in_container)
|
||
|
- sync();
|
||
|
+ sync_with_progress();
|
||
|
|
||
|
switch (cmd) {
|
||
|
|