You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
290 lines
9.3 KiB
290 lines
9.3 KiB
3 years ago
|
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
|
||
|
From: Krutika Dhananjay <kdhananj@redhat.com>
|
||
|
Date: Fri, 5 Apr 2019 10:30:23 +0530
|
||
|
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
|
||
|
deletion in a specific case
|
||
|
|
||
|
Consider the following case -
|
||
|
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
|
||
|
shards are created.
|
||
|
2. And then it is deleted after that.
|
||
|
|
||
|
The unique thing about FALLOCATE is that unlike WRITE, all of the
|
||
|
participant shards are resolved and created and fallocated in a single
|
||
|
batch. This means, in this case, after the first "shard-lru-limit"
|
||
|
number of shards are resolved and added to lru list, as part of
|
||
|
resolution of the remaining shards, some of the existing shards in lru
|
||
|
list will need to be evicted. So these evicted shards will be
|
||
|
inode_unlink()d as part of eviction. Now once the fop gets to the actual
|
||
|
FALLOCATE stage, the lru'd-out shards get added to fsync list.
|
||
|
|
||
|
2 things to note at this point:
|
||
|
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
|
||
|
on base shard
|
||
|
ii. and the more recently used shards are part of both fsync and lru list.
|
||
|
So each of these shards holds 2 refs on base inode - one for being
|
||
|
part of fsync list, and the other for being part of lru list.
|
||
|
|
||
|
FALLOCATE completes successfully and then this very file is deleted, and
|
||
|
background shard deletion launched. Here's where the ref counts get mismatched.
|
||
|
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
|
||
|
return NULL, because they are inode_unlink()'d by now. So these inodes need to
|
||
|
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
|
||
|
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
|
||
|
And its inode ctx is still valid and ctx->base_inode valid from the last
|
||
|
time it was added to list.
|
||
|
|
||
|
But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
|
||
|
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
|
||
|
inode back to lru list, base inode is not ref'd since its NULL.
|
||
|
|
||
|
Whereas post unlinking this shard, during shard_unlink_block_inode(),
|
||
|
ctx->base_inode is accessible and is unref'd because the shard was found to be part
|
||
|
of LRU list, although the matching ref didn't occur. This at some point leads to
|
||
|
base_inode refcount becoming 0 and it getting destroyed and released back while some
|
||
|
of its associated shards are continuing to be unlinked in parallel and the client crashes
|
||
|
whenever it is accessed next.
|
||
|
|
||
|
Fix is to pass base shard correctly, if available, in shard_link_block_inode().
|
||
|
|
||
|
Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c
|
||
|
|
||
|
>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
|
||
|
>Updates: bz#1696136
|
||
|
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
|
||
|
|
||
|
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/
|
||
|
|
||
|
BUG: 1694595
|
||
|
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
|
||
|
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
|
||
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
|
||
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||
|
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
||
|
---
|
||
|
tests/bugs/shard/bug-1696136.c | 121 +++++++++++++++++++++++++++++++++++++
|
||
|
tests/bugs/shard/bug-1696136.t | 33 ++++++++++
|
||
|
tests/bugs/shard/shard-fallocate.c | 2 +-
|
||
|
xlators/features/shard/src/shard.c | 12 +++-
|
||
|
4 files changed, 164 insertions(+), 4 deletions(-)
|
||
|
create mode 100644 tests/bugs/shard/bug-1696136.c
|
||
|
create mode 100644 tests/bugs/shard/bug-1696136.t
|
||
|
|
||
|
diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
|
||
|
new file mode 100644
|
||
|
index 0000000..b9e8d13
|
||
|
--- /dev/null
|
||
|
+++ b/tests/bugs/shard/bug-1696136.c
|
||
|
@@ -0,0 +1,121 @@
|
||
|
+#define _GNU_SOURCE
|
||
|
+#include <fcntl.h>
|
||
|
+#include <stdio.h>
|
||
|
+#include <stdlib.h>
|
||
|
+#include <glusterfs/api/glfs.h>
|
||
|
+#include <glusterfs/api/glfs-handles.h>
|
||
|
+
|
||
|
+enum fallocate_flag {
|
||
|
+ TEST_FALLOCATE_NONE,
|
||
|
+ TEST_FALLOCATE_KEEP_SIZE,
|
||
|
+ TEST_FALLOCATE_ZERO_RANGE,
|
||
|
+ TEST_FALLOCATE_PUNCH_HOLE,
|
||
|
+ TEST_FALLOCATE_MAX,
|
||
|
+};
|
||
|
+
|
||
|
+int
|
||
|
+get_fallocate_flag(int opcode)
|
||
|
+{
|
||
|
+ int ret = 0;
|
||
|
+
|
||
|
+ switch (opcode) {
|
||
|
+ case TEST_FALLOCATE_NONE:
|
||
|
+ ret = 0;
|
||
|
+ break;
|
||
|
+ case TEST_FALLOCATE_KEEP_SIZE:
|
||
|
+ ret = FALLOC_FL_KEEP_SIZE;
|
||
|
+ break;
|
||
|
+ case TEST_FALLOCATE_ZERO_RANGE:
|
||
|
+ ret = FALLOC_FL_ZERO_RANGE;
|
||
|
+ break;
|
||
|
+ case TEST_FALLOCATE_PUNCH_HOLE:
|
||
|
+ ret = FALLOC_FL_PUNCH_HOLE;
|
||
|
+ break;
|
||
|
+ default:
|
||
|
+ ret = -1;
|
||
|
+ break;
|
||
|
+ }
|
||
|
+ return ret;
|
||
|
+}
|
||
|
+
|
||
|
+int
|
||
|
+main(int argc, char *argv[])
|
||
|
+{
|
||
|
+ int ret = 1;
|
||
|
+ int opcode = -1;
|
||
|
+ off_t offset = 0;
|
||
|
+ size_t len = 0;
|
||
|
+ glfs_t *fs = NULL;
|
||
|
+ glfs_fd_t *fd = NULL;
|
||
|
+
|
||
|
+ if (argc != 8) {
|
||
|
+ fprintf(stderr,
|
||
|
+ "Syntax: %s <host> <volname> <opcode> <offset> <len> "
|
||
|
+ "<file-path> <log-file>\n",
|
||
|
+ argv[0]);
|
||
|
+ return 1;
|
||
|
+ }
|
||
|
+
|
||
|
+ fs = glfs_new(argv[2]);
|
||
|
+ if (!fs) {
|
||
|
+ fprintf(stderr, "glfs_new: returned NULL\n");
|
||
|
+ return 1;
|
||
|
+ }
|
||
|
+
|
||
|
+ ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
|
||
|
+ if (ret != 0) {
|
||
|
+ fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ ret = glfs_set_logging(fs, argv[7], 7);
|
||
|
+ if (ret != 0) {
|
||
|
+ fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ ret = glfs_init(fs);
|
||
|
+ if (ret != 0) {
|
||
|
+ fprintf(stderr, "glfs_init: returned %d\n", ret);
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ opcode = atoi(argv[3]);
|
||
|
+ opcode = get_fallocate_flag(opcode);
|
||
|
+ if (opcode < 0) {
|
||
|
+ fprintf(stderr, "get_fallocate_flag: invalid flag \n");
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ offset = atoi(argv[4]);
|
||
|
+ len = atoi(argv[5]);
|
||
|
+
|
||
|
+ fd = glfs_open(fs, argv[6], O_RDWR);
|
||
|
+ if (fd == NULL) {
|
||
|
+ fprintf(stderr, "glfs_open: returned NULL\n");
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ ret = glfs_fallocate(fd, opcode, offset, len);
|
||
|
+ if (ret < 0) {
|
||
|
+ fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ ret = glfs_unlink(fs, argv[6]);
|
||
|
+ if (ret < 0) {
|
||
|
+ fprintf(stderr, "glfs_unlink: returned %d\n", ret);
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+ /* Sleep for 3s to give enough time for background deletion to complete
|
||
|
+ * during which if the bug exists, the process will crash.
|
||
|
+ */
|
||
|
+ sleep(3);
|
||
|
+ ret = 0;
|
||
|
+
|
||
|
+out:
|
||
|
+ if (fd)
|
||
|
+ glfs_close(fd);
|
||
|
+ glfs_fini(fs);
|
||
|
+ return ret;
|
||
|
+}
|
||
|
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
|
||
|
new file mode 100644
|
||
|
index 0000000..b6dc858
|
||
|
--- /dev/null
|
||
|
+++ b/tests/bugs/shard/bug-1696136.t
|
||
|
@@ -0,0 +1,33 @@
|
||
|
+#!/bin/bash
|
||
|
+
|
||
|
+. $(dirname $0)/../../include.rc
|
||
|
+. $(dirname $0)/../../volume.rc
|
||
|
+. $(dirname $0)/../../fallocate.rc
|
||
|
+
|
||
|
+cleanup
|
||
|
+
|
||
|
+TEST glusterd
|
||
|
+TEST pidof glusterd
|
||
|
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
|
||
|
+TEST $CLI volume set $V0 features.shard on
|
||
|
+TEST $CLI volume set $V0 features.shard-block-size 4MB
|
||
|
+TEST $CLI volume set $V0 features.shard-lru-limit 120
|
||
|
+TEST $CLI volume set $V0 performance.write-behind off
|
||
|
+TEST $CLI volume start $V0
|
||
|
+
|
||
|
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
|
||
|
+
|
||
|
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
|
||
|
+
|
||
|
+# Create a file
|
||
|
+TEST touch $M0/file1
|
||
|
+
|
||
|
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
|
||
|
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
|
||
|
+
|
||
|
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
||
|
+TEST $CLI volume stop $V0
|
||
|
+TEST $CLI volume delete $V0
|
||
|
+rm -f $(dirname $0)/bug-1696136
|
||
|
+
|
||
|
+cleanup
|
||
|
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
|
||
|
index 3a784d3..45b9ce0 100644
|
||
|
--- a/tests/bugs/shard/shard-fallocate.c
|
||
|
+++ b/tests/bugs/shard/shard-fallocate.c
|
||
|
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
|
||
|
}
|
||
|
|
||
|
ret = glfs_fallocate(fd, opcode, offset, len);
|
||
|
- if (ret <= 0) {
|
||
|
+ if (ret < 0) {
|
||
|
fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
|
||
|
goto out;
|
||
|
}
|
||
|
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
|
||
|
index fa3564a..3c4bcdc 100644
|
||
|
--- a/xlators/features/shard/src/shard.c
|
||
|
+++ b/xlators/features/shard/src/shard.c
|
||
|
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
|
||
|
xlator_t *this = NULL;
|
||
|
inode_t *fsync_inode = NULL;
|
||
|
shard_priv_t *priv = NULL;
|
||
|
+ inode_t *base_inode = NULL;
|
||
|
|
||
|
this = THIS;
|
||
|
priv = this->private;
|
||
|
- if (local->loc.inode)
|
||
|
+ if (local->loc.inode) {
|
||
|
gf_uuid_copy(gfid, local->loc.inode->gfid);
|
||
|
- else
|
||
|
+ base_inode = local->loc.inode;
|
||
|
+ } else if (local->resolver_base_inode) {
|
||
|
+ gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
|
||
|
+ base_inode = local->resolver_base_inode;
|
||
|
+ } else {
|
||
|
gf_uuid_copy(gfid, local->base_gfid);
|
||
|
+ }
|
||
|
|
||
|
shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
|
||
|
|
||
|
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
|
||
|
LOCK(&priv->lock);
|
||
|
{
|
||
|
fsync_inode = __shard_update_shards_inode_list(
|
||
|
- linked_inode, this, local->loc.inode, block_num, gfid);
|
||
|
+ linked_inode, this, base_inode, block_num, gfid);
|
||
|
}
|
||
|
UNLOCK(&priv->lock);
|
||
|
if (fsync_inode)
|
||
|
--
|
||
|
1.8.3.1
|
||
|
|