You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
109 lines
3.6 KiB
109 lines
3.6 KiB
From ae7d61e35ec2ab6361c3e509a8db00698ef3396f Mon Sep 17 00:00:00 2001 |
|
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com> |
|
Date: Tue, 7 May 2019 16:08:47 +0200 |
|
Subject: [RHEL7.8 PATCH V2 25/47] mdmon: fix wrong array state when disk fails |
|
during mdmon startup |
|
|
|
If a member drive disappears and is set faulty by the kernel during |
|
mdmon startup, after ss->load_container() but before manage_new(), mdmon |
|
will try to re-add the faulty drive to the array and start rebuilding. |
|
Metadata on the active drive is updated, but the faulty drive is not |
|
removed from the array and is left in a "blocked" state and any write |
|
request to the array will block. If the faulty drive reappears in the |
|
system, e.g. after a reboot, the array will not assemble because metadata |
|
on the drives will be incompatible (at least on imsm). |
|
|
|
Fix this by adding a new option for sysfs_read(): "GET_DEVS_ALL". This |
|
is an extension of the "GET_DEVS" option and causes all member devices |
|
to be returned, even if the associated block device has been removed. |
|
Use this option in manage_new() to include the faulty device on the |
|
active_array's devices list. Mdmon will then properly remove the faulty |
|
device from the array and update the metadata to reflect the degraded |
|
state. |
|
|
|
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com> |
|
Signed-off-by: Jes Sorensen <jsorensen@fb.com> |
|
--- |
|
managemon.c | 2 +- |
|
mdadm.h | 1 + |
|
super-intel.c | 2 +- |
|
sysfs.c | 23 ++++++++++++++--------- |
|
4 files changed, 17 insertions(+), 11 deletions(-) |
|
|
|
diff --git a/managemon.c b/managemon.c |
|
index 29b91ba..200cf83 100644 |
|
--- a/managemon.c |
|
+++ b/managemon.c |
|
@@ -678,7 +678,7 @@ static void manage_new(struct mdstat_ent *mdstat, |
|
mdi = sysfs_read(-1, mdstat->devnm, |
|
GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT| |
|
GET_SAFEMODE|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| |
|
- GET_LAYOUT); |
|
+ GET_LAYOUT|GET_DEVS_ALL); |
|
|
|
if (!mdi) |
|
return; |
|
diff --git a/mdadm.h b/mdadm.h |
|
index 705bd9b..427cc52 100644 |
|
--- a/mdadm.h |
|
+++ b/mdadm.h |
|
@@ -647,6 +647,7 @@ enum sysfs_read_flags { |
|
GET_ERROR = (1 << 24), |
|
GET_ARRAY_STATE = (1 << 25), |
|
GET_CONSISTENCY_POLICY = (1 << 26), |
|
+ GET_DEVS_ALL = (1 << 27), |
|
}; |
|
|
|
/* If fd >= 0, get the array it is open on, |
|
diff --git a/super-intel.c b/super-intel.c |
|
index 2ba045a..4fd5e84 100644 |
|
--- a/super-intel.c |
|
+++ b/super-intel.c |
|
@@ -8560,7 +8560,7 @@ static void imsm_set_disk(struct active_array *a, int n, int state) |
|
disk = get_imsm_disk(super, ord_to_idx(ord)); |
|
|
|
/* check for new failures */ |
|
- if (state & DS_FAULTY) { |
|
+ if (disk && (state & DS_FAULTY)) { |
|
if (mark_failure(super, dev, disk, ord_to_idx(ord))) |
|
super->updates_pending++; |
|
} |
|
diff --git a/sysfs.c b/sysfs.c |
|
index df6fdda..2dd9ab6 100644 |
|
--- a/sysfs.c |
|
+++ b/sysfs.c |
|
@@ -313,17 +313,22 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) |
|
/* assume this is a stale reference to a hot |
|
* removed device |
|
*/ |
|
- free(dev); |
|
- continue; |
|
+ if (!(options & GET_DEVS_ALL)) { |
|
+ free(dev); |
|
+ continue; |
|
+ } |
|
+ } else { |
|
+ sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); |
|
} |
|
- sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); |
|
|
|
- /* special case check for block devices that can go 'offline' */ |
|
- strcpy(dbase, "block/device/state"); |
|
- if (load_sys(fname, buf, sizeof(buf)) == 0 && |
|
- strncmp(buf, "offline", 7) == 0) { |
|
- free(dev); |
|
- continue; |
|
+ if (!(options & GET_DEVS_ALL)) { |
|
+ /* special case check for block devices that can go 'offline' */ |
|
+ strcpy(dbase, "block/device/state"); |
|
+ if (load_sys(fname, buf, sizeof(buf)) == 0 && |
|
+ strncmp(buf, "offline", 7) == 0) { |
|
+ free(dev); |
|
+ continue; |
|
+ } |
|
} |
|
|
|
/* finally add this disk to the array */ |
|
-- |
|
2.7.5 |
|
|
|
|