You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
204 lines
8.3 KiB
204 lines
8.3 KiB
From cca418b78ec976aa69eacd56b0e6127ea7e3dd26 Mon Sep 17 00:00:00 2001 |
|
From: Pranith Kumar K <pkarampu@redhat.com> |
|
Date: Thu, 4 Apr 2019 15:31:56 +0530 |
|
Subject: [PATCH 095/124] cluster/afr: Remove local from owners_list on failure |
|
of lock-acquisition |
|
|
|
Backport of https://review.gluster.org/c/glusterfs/+/22515 |
|
|
|
When eager-lock lock acquisition fails because of say network failures, the |
|
local is not being removed from owners_list, this leads to accumulation of |
|
waiting frames and the application will hang because the waiting frames are |
|
under the assumption that another transaction is in the process of acquiring |
|
lock because owner-list is not empty. Handled this case as well in this patch. |
|
Added asserts to make it easier to find these problems in future. |
|
|
|
Change-Id: I3101393265e9827755725b1f2d94a93d8709e923 |
|
fixes: bz#1688395 |
|
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> |
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/167859 |
|
Tested-by: RHGS Build Bot <nigelb@redhat.com> |
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> |
|
--- |
|
tests/bugs/replicate/bug-1696599-io-hang.t | 47 ++++++++++++++++++++++++++++++ |
|
xlators/cluster/afr/src/afr-common.c | 8 ++--- |
|
xlators/cluster/afr/src/afr-lk-common.c | 1 - |
|
xlators/cluster/afr/src/afr-transaction.c | 19 +++++------- |
|
xlators/cluster/afr/src/afr.h | 4 +-- |
|
5 files changed, 61 insertions(+), 18 deletions(-) |
|
create mode 100755 tests/bugs/replicate/bug-1696599-io-hang.t |
|
|
|
diff --git a/tests/bugs/replicate/bug-1696599-io-hang.t b/tests/bugs/replicate/bug-1696599-io-hang.t |
|
new file mode 100755 |
|
index 0000000..869cdb9 |
|
--- /dev/null |
|
+++ b/tests/bugs/replicate/bug-1696599-io-hang.t |
|
@@ -0,0 +1,47 @@ |
|
+#!/bin/bash |
|
+ |
|
+. $(dirname $0)/../../include.rc |
|
+. $(dirname $0)/../../volume.rc |
|
+. $(dirname $0)/../../fileio.rc |
|
+ |
|
+#Tests that local structures in afr are removed from granted/blocked list of |
|
+#locks when inodelk fails on all bricks |
|
+ |
|
+cleanup; |
|
+ |
|
+TEST glusterd |
|
+TEST pidof glusterd |
|
+ |
|
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3} |
|
+TEST $CLI volume set $V0 performance.quick-read off |
|
+TEST $CLI volume set $V0 performance.write-behind off |
|
+TEST $CLI volume set $V0 performance.io-cache off |
|
+TEST $CLI volume set $V0 performance.stat-prefetch off |
|
+TEST $CLI volume set $V0 performance.client-io-threads off |
|
+TEST $CLI volume set $V0 delay-gen locks |
|
+TEST $CLI volume set $V0 delay-gen.delay-duration 5000000 |
|
+TEST $CLI volume set $V0 delay-gen.delay-percentage 100 |
|
+TEST $CLI volume set $V0 delay-gen.enable finodelk |
|
+ |
|
+TEST $CLI volume start $V0 |
|
+EXPECT 'Started' volinfo_field $V0 'Status' |
|
+ |
|
+TEST $GFS -s $H0 --volfile-id $V0 $M0 |
|
+TEST touch $M0/file |
|
+#Trigger write and stop bricks so inodelks fail on all bricks leading to |
|
+#lock failure condition |
|
+echo abc >> $M0/file & |
|
+ |
|
+TEST $CLI volume stop $V0 |
|
+TEST $CLI volume reset $V0 delay-gen |
|
+wait |
|
+TEST $CLI volume start $V0 |
|
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 |
|
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 |
|
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2 |
|
+#Test that only one write succeeded, this tests that delay-gen worked as |
|
+#expected |
|
+echo abc >> $M0/file |
|
+EXPECT "abc" cat $M0/file |
|
+ |
|
+cleanup; |
|
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c |
|
index 45b96e3..47a5d3a 100644 |
|
--- a/xlators/cluster/afr/src/afr-common.c |
|
+++ b/xlators/cluster/afr/src/afr-common.c |
|
@@ -5763,6 +5763,10 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) |
|
afr_private_t *priv = NULL; |
|
|
|
priv = this->private; |
|
+ INIT_LIST_HEAD(&local->transaction.wait_list); |
|
+ INIT_LIST_HEAD(&local->transaction.owner_list); |
|
+ INIT_LIST_HEAD(&local->ta_waitq); |
|
+ INIT_LIST_HEAD(&local->ta_onwireq); |
|
ret = afr_internal_lock_init(&local->internal_lock, priv->child_count); |
|
if (ret < 0) |
|
goto out; |
|
@@ -5800,10 +5804,6 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) |
|
goto out; |
|
|
|
ret = 0; |
|
- INIT_LIST_HEAD(&local->transaction.wait_list); |
|
- INIT_LIST_HEAD(&local->transaction.owner_list); |
|
- INIT_LIST_HEAD(&local->ta_waitq); |
|
- INIT_LIST_HEAD(&local->ta_onwireq); |
|
out: |
|
return ret; |
|
} |
|
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c |
|
index 4091671..bc8eabe 100644 |
|
--- a/xlators/cluster/afr/src/afr-lk-common.c |
|
+++ b/xlators/cluster/afr/src/afr-lk-common.c |
|
@@ -397,7 +397,6 @@ afr_unlock_now(call_frame_t *frame, xlator_t *this) |
|
int_lock->lk_call_count = call_count; |
|
|
|
if (!call_count) { |
|
- GF_ASSERT(!local->transaction.do_eager_unlock); |
|
gf_msg_trace(this->name, 0, "No internal locks unlocked"); |
|
int_lock->lock_cbk(frame, this); |
|
goto out; |
|
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c |
|
index 229820b..15f3a7e 100644 |
|
--- a/xlators/cluster/afr/src/afr-transaction.c |
|
+++ b/xlators/cluster/afr/src/afr-transaction.c |
|
@@ -372,6 +372,8 @@ afr_transaction_done(call_frame_t *frame, xlator_t *this) |
|
} |
|
local->transaction.unwind(frame, this); |
|
|
|
+ GF_ASSERT(list_empty(&local->transaction.owner_list)); |
|
+ GF_ASSERT(list_empty(&local->transaction.wait_list)); |
|
AFR_STACK_DESTROY(frame); |
|
|
|
return 0; |
|
@@ -393,7 +395,7 @@ afr_lock_fail_shared(afr_local_t *local, struct list_head *list) |
|
} |
|
|
|
static void |
|
-afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked) |
|
+afr_handle_lock_acquire_failure(afr_local_t *local) |
|
{ |
|
struct list_head shared; |
|
afr_lock_t *lock = NULL; |
|
@@ -414,13 +416,8 @@ afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked) |
|
afr_lock_fail_shared(local, &shared); |
|
local->transaction.do_eager_unlock = _gf_true; |
|
out: |
|
- if (locked) { |
|
- local->internal_lock.lock_cbk = afr_transaction_done; |
|
- afr_unlock(local->transaction.frame, local->transaction.frame->this); |
|
- } else { |
|
- afr_transaction_done(local->transaction.frame, |
|
- local->transaction.frame->this); |
|
- } |
|
+ local->internal_lock.lock_cbk = afr_transaction_done; |
|
+ afr_unlock(local->transaction.frame, local->transaction.frame->this); |
|
} |
|
|
|
call_frame_t * |
|
@@ -619,7 +616,7 @@ afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this) |
|
failure_count = AFR_COUNT(local->transaction.failed_subvols, |
|
priv->child_count); |
|
if (failure_count == priv->child_count) { |
|
- afr_handle_lock_acquire_failure(local, _gf_true); |
|
+ afr_handle_lock_acquire_failure(local); |
|
return 0; |
|
} else { |
|
lock = &local->inode_ctx->lock[local->transaction.type]; |
|
@@ -2092,7 +2089,7 @@ err: |
|
local->op_ret = -1; |
|
local->op_errno = op_errno; |
|
|
|
- afr_handle_lock_acquire_failure(local, _gf_true); |
|
+ afr_handle_lock_acquire_failure(local); |
|
|
|
if (xdata_req) |
|
dict_unref(xdata_req); |
|
@@ -2361,7 +2358,7 @@ afr_internal_lock_finish(call_frame_t *frame, xlator_t *this) |
|
} else { |
|
lock = &local->inode_ctx->lock[local->transaction.type]; |
|
if (local->internal_lock.lock_op_ret < 0) { |
|
- afr_handle_lock_acquire_failure(local, _gf_false); |
|
+ afr_handle_lock_acquire_failure(local); |
|
} else { |
|
lock->event_generation = local->event_generation; |
|
afr_changelog_pre_op(frame, this); |
|
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h |
|
index 2cc3797..e731cfa 100644 |
|
--- a/xlators/cluster/afr/src/afr.h |
|
+++ b/xlators/cluster/afr/src/afr.h |
|
@@ -1091,8 +1091,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); |
|
#define AFR_FRAME_INIT(frame, op_errno) \ |
|
({ \ |
|
frame->local = mem_get0(THIS->local_pool); \ |
|
- if (afr_local_init(frame->local, THIS->private, &op_errno)) { \ |
|
- afr_local_cleanup(frame->local, THIS); \ |
|
+ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \ |
|
+ afr_local_cleanup(frame->local, frame->this); \ |
|
mem_put(frame->local); \ |
|
frame->local = NULL; \ |
|
}; \ |
|
-- |
|
1.8.3.1 |
|
|
|
|