You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
360 lines
12 KiB
360 lines
12 KiB
From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001 |
|
From: Ashish Pandey <aspandey@redhat.com> |
|
Date: Wed, 28 Nov 2018 11:22:52 +0530 |
|
Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already |
|
healing |
|
|
|
Problem: |
|
1 - heal-wait-qlength is 128 by default. If SHD (the self-heal daemon) |
|
is disabled and files need healing, client-side heal is required. |
|
Accessing these files triggers the heal. |
|
However, it has been observed that the same file gets enqueued |
|
multiple times in the heal wait queue, which in turn fills up |
|
the queue and prevents other files from being enqueued. |
|
|
|
2 - When a write fop from the mount arrives for a file that is |
|
being healed, the write is sent to all the bricks, including the |
|
healing one. At the end, version and size are updated on all the |
|
bricks. However, the dirty flag is not unset on all the bricks, |
|
even if the write fop was successful on all of them. |
|
After healing completes, this dirty flag remains set and never |
|
gets cleaned up if SHD is disabled. |
|
|
|
Solution: |
|
1 - If an entry is already in the queue or going through the heal process, |
|
don't enqueue the next client-side request to heal the same file. |
|
|
|
2 - Unset the dirty flag on all the bricks at the end if the fop has succeeded on |
|
all the bricks, even if some of the bricks are going through heal. |
|
|
|
backport of : https://review.gluster.org/#/c/glusterfs/+/21744/ |
|
|
|
Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727 |
|
BUG: 1600918 |
|
Signed-off-by: Ashish Pandey <aspandey@redhat.com> |
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/166296 |
|
Tested-by: RHGS Build Bot <nigelb@redhat.com> |
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> |
|
--- |
|
tests/bugs/ec/bug-1236065.t | 1 - |
|
xlators/cluster/ec/src/ec-common.c | 43 +++++++++------ |
|
xlators/cluster/ec/src/ec-common.h | 8 +++ |
|
xlators/cluster/ec/src/ec-heal.c | 104 +++++++++++++++++++++++++++++++----- |
|
xlators/cluster/ec/src/ec-helpers.c | 1 + |
|
xlators/cluster/ec/src/ec-types.h | 1 + |
|
6 files changed, 127 insertions(+), 31 deletions(-) |
|
|
|
diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t |
|
index 76d25d7..9181e73 100644 |
|
--- a/tests/bugs/ec/bug-1236065.t |
|
+++ b/tests/bugs/ec/bug-1236065.t |
|
@@ -85,7 +85,6 @@ TEST pidof glusterd |
|
EXPECT "$V0" volinfo_field $V0 'Volume Name' |
|
EXPECT 'Started' volinfo_field $V0 'Status' |
|
EXPECT '7' online_brick_count |
|
- |
|
## cleanup |
|
cd |
|
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 |
|
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c |
|
index 8d65670..5183680 100644 |
|
--- a/xlators/cluster/ec/src/ec-common.c |
|
+++ b/xlators/cluster/ec/src/ec-common.c |
|
@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop) |
|
|
|
gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, |
|
"Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " |
|
- "remaining=%s, good=%s, bad=%s)", |
|
+ "remaining=%s, good=%s, bad=%s, %s)", |
|
gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, |
|
ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), |
|
ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), |
|
ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), |
|
ec_bin(str4, sizeof(str4), fop->good, ec->nodes), |
|
ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), |
|
- ec->nodes)); |
|
+ ec->nodes), |
|
+ ec_msg_str(fop)); |
|
if (fop->use_fd) { |
|
if (fop->fd != NULL) { |
|
ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, |
|
@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link) |
|
uint64_t dirty[2] = {0, 0}; |
|
uint64_t size; |
|
ec_t *ec = NULL; |
|
+ uintptr_t mask; |
|
|
|
lock = link->lock; |
|
ctx = lock->ctx; |
|
ec = link->fop->xl->private; |
|
|
|
/* pre_version[*] will be 0 if have_version is false */ |
|
- version[0] = ctx->post_version[0] - ctx->pre_version[0]; |
|
- version[1] = ctx->post_version[1] - ctx->pre_version[1]; |
|
+ version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] - |
|
+ ctx->pre_version[EC_DATA_TXN]; |
|
+ version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] - |
|
+ ctx->pre_version[EC_METADATA_TXN]; |
|
|
|
size = ctx->post_size - ctx->pre_size; |
|
/* If we set the dirty flag for update fop, we have to unset it. |
|
* If fop has failed on some bricks, leave the dirty as marked. */ |
|
+ |
|
if (lock->unlock_now) { |
|
+ if (version[EC_DATA_TXN]) { |
|
+ /*A data fop will have difference in post and pre version |
|
+ *and for data fop we send writes on healing bricks also */ |
|
+ mask = lock->good_mask | lock->healing; |
|
+ } else { |
|
+ mask = lock->good_mask; |
|
+ } |
|
/* Ensure that nodes are up while doing final |
|
* metadata update.*/ |
|
- if (!(ec->node_mask & ~lock->good_mask) && |
|
- !(ec->node_mask & ~ec->xl_up)) { |
|
- if (ctx->dirty[0] != 0) { |
|
- dirty[0] = -1; |
|
+ if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) { |
|
+ if (ctx->dirty[EC_DATA_TXN] != 0) { |
|
+ dirty[EC_DATA_TXN] = -1; |
|
} |
|
- if (ctx->dirty[1] != 0) { |
|
- dirty[1] = -1; |
|
+ if (ctx->dirty[EC_METADATA_TXN] != 0) { |
|
+ dirty[EC_METADATA_TXN] = -1; |
|
} |
|
/*If everything is fine and we already |
|
*have version xattr set on entry, there |
|
*is no need to update version again*/ |
|
- if (ctx->pre_version[0]) { |
|
- version[0] = 0; |
|
+ if (ctx->pre_version[EC_DATA_TXN]) { |
|
+ version[EC_DATA_TXN] = 0; |
|
} |
|
- if (ctx->pre_version[1]) { |
|
- version[1] = 0; |
|
+ if (ctx->pre_version[EC_METADATA_TXN]) { |
|
+ version[EC_METADATA_TXN] = 0; |
|
} |
|
} else { |
|
link->optimistic_changelog = _gf_false; |
|
@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link) |
|
memset(ctx->dirty, 0, sizeof(ctx->dirty)); |
|
} |
|
|
|
- if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) || |
|
- (dirty[1] != 0)) { |
|
+ if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) || |
|
+ (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) { |
|
ec_update_size_version(link, version, size, dirty); |
|
return _gf_true; |
|
} |
|
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h |
|
index 115e147..54aaa77 100644 |
|
--- a/xlators/cluster/ec/src/ec-common.h |
|
+++ b/xlators/cluster/ec/src/ec-common.h |
|
@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, |
|
void |
|
ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index, |
|
int32_t ret_status); |
|
+gf_boolean_t |
|
+ec_is_entry_healing(ec_fop_data_t *fop); |
|
+void |
|
+ec_set_entry_healing(ec_fop_data_t *fop); |
|
+void |
|
+ec_reset_entry_healing(ec_fop_data_t *fop); |
|
+char * |
|
+ec_msg_str(ec_fop_data_t *fop); |
|
#endif /* __EC_COMMON_H__ */ |
|
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c |
|
index eaf80e0..1ca12c1 100644 |
|
--- a/xlators/cluster/ec/src/ec-heal.c |
|
+++ b/xlators/cluster/ec/src/ec-heal.c |
|
@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata) |
|
} |
|
/* FOP: heal */ |
|
|
|
+void |
|
+ec_set_entry_healing(ec_fop_data_t *fop) |
|
+{ |
|
+ ec_inode_t *ctx = NULL; |
|
+ loc_t *loc = NULL; |
|
+ |
|
+ if (!fop) |
|
+ return; |
|
+ |
|
+ loc = &fop->loc[0]; |
|
+ LOCK(&loc->inode->lock); |
|
+ { |
|
+ ctx = __ec_inode_get(loc->inode, fop->xl); |
|
+ if (ctx) { |
|
+ ctx->heal_count += 1; |
|
+ } |
|
+ } |
|
+ UNLOCK(&loc->inode->lock); |
|
+} |
|
+ |
|
+void |
|
+ec_reset_entry_healing(ec_fop_data_t *fop) |
|
+{ |
|
+ ec_inode_t *ctx = NULL; |
|
+ loc_t *loc = NULL; |
|
+ int32_t heal_count = 0; |
|
+ if (!fop) |
|
+ return; |
|
+ |
|
+ loc = &fop->loc[0]; |
|
+ LOCK(&loc->inode->lock); |
|
+ { |
|
+ ctx = __ec_inode_get(loc->inode, fop->xl); |
|
+ if (ctx) { |
|
+ ctx->heal_count += -1; |
|
+ heal_count = ctx->heal_count; |
|
+ } |
|
+ } |
|
+ UNLOCK(&loc->inode->lock); |
|
+ GF_ASSERT(heal_count >= 0); |
|
+} |
|
+ |
|
uintptr_t |
|
ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood) |
|
{ |
|
@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) |
|
"Heal is not required for : %s ", uuid_utoa(loc->gfid)); |
|
goto out; |
|
} |
|
- |
|
- msources = alloca0(ec->nodes); |
|
- mhealed_sinks = alloca0(ec->nodes); |
|
- ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks); |
|
- if (ret == 0) { |
|
- mgood = ec_char_array_to_mask(msources, ec->nodes); |
|
- mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes); |
|
- } else { |
|
- op_ret = -1; |
|
- op_errno = -ret; |
|
- } |
|
sources = alloca0(ec->nodes); |
|
healed_sinks = alloca0(ec->nodes); |
|
if (IA_ISREG(loc->inode->ia_type)) { |
|
@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) |
|
op_ret = -1; |
|
op_errno = -ret; |
|
} |
|
+ msources = alloca0(ec->nodes); |
|
+ mhealed_sinks = alloca0(ec->nodes); |
|
+ ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks); |
|
+ if (ret == 0) { |
|
+ mgood = ec_char_array_to_mask(msources, ec->nodes); |
|
+ mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes); |
|
+ } else { |
|
+ op_ret = -1; |
|
+ op_errno = -ret; |
|
+ } |
|
|
|
out: |
|
+ ec_reset_entry_healing(fop); |
|
if (fop->cbks.heal) { |
|
fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno, |
|
ec_char_array_to_mask(participants, ec->nodes), |
|
@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop) |
|
ec_launch_heal(ec, heal_fop); |
|
} |
|
|
|
+gf_boolean_t |
|
+ec_is_entry_healing(ec_fop_data_t *fop) |
|
+{ |
|
+ ec_inode_t *ctx = NULL; |
|
+ int32_t heal_count = 0; |
|
+ loc_t *loc = NULL; |
|
+ |
|
+ loc = &fop->loc[0]; |
|
+ |
|
+ LOCK(&loc->inode->lock); |
|
+ { |
|
+ ctx = __ec_inode_get(loc->inode, fop->xl); |
|
+ if (ctx) { |
|
+ heal_count = ctx->heal_count; |
|
+ } |
|
+ } |
|
+ UNLOCK(&loc->inode->lock); |
|
+ GF_ASSERT(heal_count >= 0); |
|
+ return heal_count; |
|
+} |
|
+ |
|
void |
|
ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) |
|
{ |
|
gf_boolean_t can_heal = _gf_true; |
|
ec_t *ec = this->private; |
|
+ ec_fop_data_t *fop_rel = NULL; |
|
|
|
if (fop->req_frame == NULL) { |
|
LOCK(&ec->lock); |
|
@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) |
|
if ((ec->background_heals > 0) && |
|
(ec->heal_wait_qlen + ec->background_heals) > |
|
(ec->heal_waiters + ec->healers)) { |
|
- list_add_tail(&fop->healer, &ec->heal_waiting); |
|
- ec->heal_waiters++; |
|
+ if (!ec_is_entry_healing(fop)) { |
|
+ list_add_tail(&fop->healer, &ec->heal_waiting); |
|
+ ec->heal_waiters++; |
|
+ ec_set_entry_healing(fop); |
|
+ } else { |
|
+ fop_rel = fop; |
|
+ } |
|
fop = __ec_dequeue_heals(ec); |
|
} else { |
|
can_heal = _gf_false; |
|
@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) |
|
} |
|
|
|
if (can_heal) { |
|
- if (fop) |
|
+ if (fop) { |
|
+ if (fop->req_frame != NULL) { |
|
+ ec_set_entry_healing(fop); |
|
+ } |
|
ec_launch_heal(ec, fop); |
|
+ } |
|
} else { |
|
gf_msg_debug(this->name, 0, |
|
"Max number of heals are " |
|
@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) |
|
ec_fop_set_error(fop, EBUSY); |
|
ec_heal_fail(ec, fop); |
|
} |
|
+ if (fop_rel) { |
|
+ ec_heal_done(0, NULL, fop_rel); |
|
+ } |
|
} |
|
|
|
void |
|
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c |
|
index e6b0359..43f6e3b 100644 |
|
--- a/xlators/cluster/ec/src/ec-helpers.c |
|
+++ b/xlators/cluster/ec/src/ec-helpers.c |
|
@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl) |
|
memset(ctx, 0, sizeof(*ctx)); |
|
INIT_LIST_HEAD(&ctx->heal); |
|
INIT_LIST_HEAD(&ctx->stripe_cache.lru); |
|
+ ctx->heal_count = 0; |
|
value = (uint64_t)(uintptr_t)ctx; |
|
if (__inode_ctx_set(inode, xl, &value) != 0) { |
|
GF_FREE(ctx); |
|
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h |
|
index f3d63ca..6ae4a2b 100644 |
|
--- a/xlators/cluster/ec/src/ec-types.h |
|
+++ b/xlators/cluster/ec/src/ec-types.h |
|
@@ -171,6 +171,7 @@ struct _ec_inode { |
|
gf_boolean_t have_config; |
|
gf_boolean_t have_version; |
|
gf_boolean_t have_size; |
|
+ int32_t heal_count; |
|
ec_config_t config; |
|
uint64_t pre_version[2]; |
|
uint64_t post_version[2]; |
|
-- |
|
1.8.3.1 |
|
|
|
|