You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
384 lines
12 KiB
384 lines
12 KiB
From 584ee2dbb8158ee3d3c3f055f1b06ff3d9177192 Mon Sep 17 00:00:00 2001 |
|
From: Kotresh HR <khiremat@redhat.com> |
|
Date: Thu, 13 Jun 2019 16:23:21 +0530 |
|
Subject: [PATCH 196/221] posix/ctime: Fix ctime upgrade issue |
|
|
|
Problem: |
|
On an EC volume, during upgrade from the older version where |
|
ctime feature is not enabled(or not present) to the newer |
|
version where the ctime feature is available (enabled default), |
|
the self heal hangs and doesn't complete. |
|
|
|
Cause: |
|
The ctime feature has both client side code (utime) and |
|
server side code (posix). The feature is driven from client. |
|
Only if the client side sets the time in the frame, should |
|
the server side set the time attributes in xattr. But posix |
|
setattr/fsetattr was not doing that. When one of the server |
|
nodes is updated, since ctime is enabled by default, it |
|
starts setting xattr on setattr/fsetattr on the updated node/brick. |
|
|
|
On an EC volume the first two updated nodes(bricks) are not a |
|
problem because there are 4 other bricks with consistent data. |
|
However once the third brick is updated, the new attribute(mdata xattr) |
|
will cause an inconsistency on metadata on 3 bricks, which |
|
prevents the file from being repaired. |
|
|
|
Fix: |
|
Don't create mdata xattr with utimes/utimensat system call. |
|
Only update if already present. |
|
|
|
Backport of: |
|
> Patch: https://review.gluster.org/22858 |
|
> Change-Id: Ieacedecb8a738bb437283ef3e0f042fd49dc4c8c |
|
> fixes: bz#1720201 |
|
> Signed-off-by: Kotresh HR <khiremat@redhat.com> |
|
|
|
Change-Id: Ieacedecb8a738bb437283ef3e0f042fd49dc4c8c |
|
BUG: 1713664 |
|
Signed-off-by: Kotresh HR <khiremat@redhat.com> |
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/174238 |
|
Tested-by: RHGS Build Bot <nigelb@redhat.com> |
|
Reviewed-by: Atin Mukherjee <amukherj@redhat.com> |
|
--- |
|
tests/basic/afr/split-brain-healing.t | 36 ++++--- |
|
tests/utils/get-mdata-xattr.c | 152 +++++++++++++++++++++++++++++ |
|
tests/volume.rc | 30 ++++++ |
|
xlators/storage/posix/src/posix-metadata.c | 21 ++++ |
|
4 files changed, 223 insertions(+), 16 deletions(-) |
|
create mode 100644 tests/utils/get-mdata-xattr.c |
|
|
|
diff --git a/tests/basic/afr/split-brain-healing.t b/tests/basic/afr/split-brain-healing.t |
|
index c80f900..78553e6 100644 |
|
--- a/tests/basic/afr/split-brain-healing.t |
|
+++ b/tests/basic/afr/split-brain-healing.t |
|
@@ -20,11 +20,14 @@ function get_replicate_subvol_number { |
|
cleanup; |
|
|
|
AREQUAL_PATH=$(dirname $0)/../../utils |
|
+GET_MDATA_PATH=$(dirname $0)/../../utils |
|
CFLAGS="" |
|
test "`uname -s`" != "Linux" && { |
|
CFLAGS="$CFLAGS -lintl"; |
|
} |
|
build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS |
|
+build_tester $GET_MDATA_PATH/get-mdata-xattr.c |
|
+ |
|
TEST glusterd |
|
TEST pidof glusterd |
|
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4} |
|
@@ -152,13 +155,13 @@ EXPECT $SMALLER_FILE_SIZE stat -c %s file4 |
|
subvolume=$(get_replicate_subvol_number file5) |
|
if [ $subvolume == 0 ] |
|
then |
|
- mtime1=$(stat -c %Y $B0/${V0}1/file5) |
|
- mtime2=$(stat -c %Y $B0/${V0}2/file5) |
|
+ mtime1=$(get_mtime $B0/${V0}1/file5) |
|
+ mtime2=$(get_mtime $B0/${V0}2/file5) |
|
LATEST_MTIME=$(($mtime1 > $mtime2 ? $mtime1:$mtime2)) |
|
elif [ $subvolume == 1 ] |
|
then |
|
- mtime1=$(stat -c %Y $B0/${V0}3/file5) |
|
- mtime2=$(stat -c %Y $B0/${V0}4/file5) |
|
+ mtime1=$(get_mtime $B0/${V0}3/file5) |
|
+ mtime2=$(get_mtime $B0/${V0}4/file5) |
|
LATEST_MTIME=$(($mtime1 > $mtime2 ? $mtime1:$mtime2)) |
|
fi |
|
$CLI volume heal $V0 split-brain latest-mtime /file5 |
|
@@ -166,12 +169,12 @@ EXPECT "0" echo $? |
|
|
|
if [ $subvolume == 0 ] |
|
then |
|
- mtime1_after_heal=$(stat -c %Y $B0/${V0}1/file5) |
|
- mtime2_after_heal=$(stat -c %Y $B0/${V0}2/file5) |
|
+ mtime1_after_heal=$(get_mtime $B0/${V0}1/file5) |
|
+ mtime2_after_heal=$(get_mtime $B0/${V0}2/file5) |
|
elif [ $subvolume == 1 ] |
|
then |
|
- mtime1_after_heal=$(stat -c %Y $B0/${V0}3/file5) |
|
- mtime2_after_heal=$(stat -c %Y $B0/${V0}4/file5) |
|
+ mtime1_after_heal=$(get_mtime $B0/${V0}3/file5) |
|
+ mtime2_after_heal=$(get_mtime $B0/${V0}4/file5) |
|
fi |
|
|
|
#TODO: To below comparisons on full sub-second resolution |
|
@@ -188,14 +191,14 @@ subvolume=$(get_replicate_subvol_number file6) |
|
if [ $subvolume == 0 ] |
|
then |
|
GFID=$(gf_get_gfid_xattr $B0/${V0}1/file6) |
|
- mtime1=$(stat -c %Y $B0/${V0}1/file6) |
|
- mtime2=$(stat -c %Y $B0/${V0}2/file6) |
|
+ mtime1=$(get_mtime $B0/${V0}1/file6) |
|
+ mtime2=$(get_mtime $B0/${V0}2/file6) |
|
LATEST_MTIME=$(($mtime1 > $mtime2 ? $mtime1:$mtime2)) |
|
elif [ $subvolume == 1 ] |
|
then |
|
GFID=$(gf_get_gfid_xattr $B0/${V0}3/file6) |
|
- mtime1=$(stat -c %Y $B0/${V0}3/file6) |
|
- mtime2=$(stat -c %Y $B0/${V0}4/file6) |
|
+ mtime1=$(get_mtime $B0/${V0}3/file6) |
|
+ mtime2=$(get_mtime $B0/${V0}4/file6) |
|
LATEST_MTIME=$(($mtime1 > $mtime2 ? $mtime1:$mtime2)) |
|
fi |
|
GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)" |
|
@@ -204,12 +207,12 @@ EXPECT "0" echo $? |
|
|
|
if [ $subvolume == 0 ] |
|
then |
|
- mtime1_after_heal=$(stat -c %Y $B0/${V0}1/file6) |
|
- mtime2_after_heal=$(stat -c %Y $B0/${V0}2/file6) |
|
+ mtime1_after_heal=$(get_mtime $B0/${V0}1/file6) |
|
+ mtime2_after_heal=$(get_mtime $B0/${V0}2/file6) |
|
elif [ $subvolume == 1 ] |
|
then |
|
- mtime1_after_heal=$(stat -c %Y $B0/${V0}3/file6) |
|
- mtime2_after_heal=$(stat -c %Y $B0/${V0}4/file6) |
|
+ mtime1_after_heal=$(get_mtime $B0/${V0}3/file6) |
|
+ mtime2_after_heal=$(get_mtime $B0/${V0}4/file6) |
|
fi |
|
|
|
#TODO: To below comparisons on full sub-second resolution |
|
@@ -253,4 +256,5 @@ EXPECT "1" echo $? |
|
|
|
cd - |
|
TEST rm $AREQUAL_PATH/arequal-checksum |
|
+TEST rm $GET_MDATA_PATH/get-mdata-xattr |
|
cleanup |
|
diff --git a/tests/utils/get-mdata-xattr.c b/tests/utils/get-mdata-xattr.c |
|
new file mode 100644 |
|
index 0000000..e9f5471 |
|
--- /dev/null |
|
+++ b/tests/utils/get-mdata-xattr.c |
|
@@ -0,0 +1,152 @@ |
|
+/* |
|
+ Copyright (c) 2019 Red Hat, Inc. <http://www.redhat.com> |
|
+ This file is part of GlusterFS. |
|
+ |
|
+ This file is licensed to you under your choice of the GNU Lesser |
|
+ General Public License, version 3 or any later version (LGPLv3 or |
|
+ later), or the GNU General Public License, version 2 (GPLv2), in all |
|
+ cases as published by the Free Software Foundation. |
|
+*/ |
|
+ |
|
+#include <stdlib.h> |
|
+#include <endian.h> |
|
+#include <stdio.h> |
|
+#include <time.h> |
|
+#include <string.h> |
|
+#include <inttypes.h> |
|
+#include <sys/types.h> |
|
+#include <sys/xattr.h> |
|
+#include <errno.h> |
|
+ |
|
+typedef struct gf_timespec_disk { |
|
+ uint64_t tv_sec; |
|
+ uint64_t tv_nsec; |
|
+} gf_timespec_disk_t; |
|
+ |
|
+/* posix_mdata_t on disk structure */ |
|
+typedef struct __attribute__((__packed__)) posix_mdata_disk { |
|
+ /* version of structure, bumped up if any new member is added */ |
|
+ uint8_t version; |
|
+ /* flags indicates valid fields in the structure */ |
|
+ uint64_t flags; |
|
+ gf_timespec_disk_t ctime; |
|
+ gf_timespec_disk_t mtime; |
|
+ gf_timespec_disk_t atime; |
|
+} posix_mdata_disk_t; |
|
+ |
|
+/* In memory representation posix metadata xattr */ |
|
+typedef struct { |
|
+ /* version of structure, bumped up if any new member is added */ |
|
+ uint8_t version; |
|
+ /* flags indicates valid fields in the structure */ |
|
+ uint64_t flags; |
|
+ struct timespec ctime; |
|
+ struct timespec mtime; |
|
+ struct timespec atime; |
|
+} posix_mdata_t; |
|
+ |
|
+#define GF_XATTR_MDATA_KEY "trusted.glusterfs.mdata" |
|
+ |
|
+/* posix_mdata_from_disk converts posix_mdata_disk_t into host byte order |
|
+ */ |
|
+static inline void |
|
+posix_mdata_from_disk(posix_mdata_t *out, posix_mdata_disk_t *in) |
|
+{ |
|
+ out->version = in->version; |
|
+ out->flags = be64toh(in->flags); |
|
+ |
|
+ out->ctime.tv_sec = be64toh(in->ctime.tv_sec); |
|
+ out->ctime.tv_nsec = be64toh(in->ctime.tv_nsec); |
|
+ |
|
+ out->mtime.tv_sec = be64toh(in->mtime.tv_sec); |
|
+ out->mtime.tv_nsec = be64toh(in->mtime.tv_nsec); |
|
+ |
|
+ out->atime.tv_sec = be64toh(in->atime.tv_sec); |
|
+ out->atime.tv_nsec = be64toh(in->atime.tv_nsec); |
|
+} |
|
+ |
|
+/* posix_fetch_mdata_xattr reads the mdata xattr of real_path into *metadata; returns 0 on success, -1 on any failure */ |
|
+static int |
|
+posix_fetch_mdata_xattr(const char *real_path, posix_mdata_t *metadata) |
|
+{ |
|
+ ssize_t size = -1; /* lgetxattr returns ssize_t; keep it signed so errors are detectable */ |
|
+ char *value = NULL; |
|
+ char gfid_str[64] = {0}; |
|
+ |
|
+ char *key = GF_XATTR_MDATA_KEY; |
|
+ |
|
+ if (!metadata || !real_path) { |
|
+ goto err; |
|
+ } |
|
+ |
|
+ /* Get size */ |
|
+ size = lgetxattr(real_path, key, NULL, 0); |
|
+ if (size < 0) { |
|
+ goto err; |
|
+ } |
|
+ |
|
+ value = calloc(size + 1, sizeof(char)); |
|
+ if (!value) { |
|
+ goto err; |
|
+ } |
|
+ |
|
+ /* Get xattr value; a failed or short read is rejected so the cast below never reads past the buffer */ |
|
+ size = lgetxattr(real_path, key, value, size); |
|
+ if (size < (ssize_t)sizeof(posix_mdata_disk_t)) { |
|
+ goto err; |
|
+ } |
|
+ posix_mdata_from_disk(metadata, (posix_mdata_disk_t *)value); |
|
+ |
|
+out: |
|
+ if (value) |
|
+ free(value); |
|
+ return 0; |
|
+err: |
|
+ if (value) |
|
+ free(value); |
|
+ return -1; |
|
+} |
|
+ |
|
+int |
|
+main(int argc, char *argv[]) |
|
+{ |
|
+ posix_mdata_t metadata; |
|
+ uint64_t result; |
|
+ |
|
+ if (argc != 3) { |
|
+ /* |
|
+ Usage: get_mdata_xattr -c|-m|-a <file-name> |
|
+ where -c --> ctime |
|
+ -m --> mtime |
|
+ -a --> atime |
|
+ */ |
|
+ printf("-1"); |
|
+ goto err; |
|
+ } |
|
+ |
|
+ if (posix_fetch_mdata_xattr(argv[2], &metadata)) { |
|
+ printf("-1"); |
|
+ goto err; |
|
+ } |
|
+ |
|
+ switch (argv[1][1]) { |
|
+ case 'c': |
|
+ result = metadata.ctime.tv_sec; |
|
+ break; |
|
+ case 'm': |
|
+ result = metadata.mtime.tv_sec; |
|
+ break; |
|
+ case 'a': |
|
+ result = metadata.atime.tv_sec; |
|
+ break; |
|
+ default: |
|
+ printf("-1"); |
|
+ goto err; |
|
+ } |
|
+ printf("%" PRIu64, result); |
|
+ fflush(stdout); |
|
+ return 0; |
|
+err: |
|
+ fflush(stdout); |
|
+ return -1; |
|
+} |
|
diff --git a/tests/volume.rc b/tests/volume.rc |
|
index bb400cc..6a78c37 100644 |
|
--- a/tests/volume.rc |
|
+++ b/tests/volume.rc |
|
@@ -927,3 +927,33 @@ function number_healer_threads_shd { |
|
local pid=$(get_shd_mux_pid $1) |
|
pstack $pid | grep $2 | wc -l |
|
} |
|
+ |
|
+function get_mtime { |
|
+ local time=$(get-mdata-xattr -m $1) |
|
+ if [ $time == "-1" ]; |
|
+ then |
|
+ echo $(stat -c %Y $1) |
|
+ else |
|
+ echo $time |
|
+ fi |
|
+} |
|
+ |
|
+function get_ctime { # print the ctime (seconds) of file $1, preferring the mdata xattr |
|
+ local time=$(get-mdata-xattr -c $1) |
|
+ if [ $time == "-1" ]; # get-mdata-xattr prints -1 when it cannot read the xattr |
|
+ then |
|
+ echo $(stat -c %Z $1) # fall back to the inode ctime of the same file |
|
+ else |
|
+ echo $time |
|
+ fi |
|
+} |
|
+ |
|
+function get_atime { |
|
+ local time=$(get-mdata-xattr -a $1) |
|
+ if [ $time == "-1" ]; |
|
+ then |
|
+ echo $(stat -c %X $1) |
|
+ else |
|
+ echo $time |
|
+ fi |
|
+} |
|
diff --git a/xlators/storage/posix/src/posix-metadata.c b/xlators/storage/posix/src/posix-metadata.c |
|
index e96f222..5a5e6cd 100644 |
|
--- a/xlators/storage/posix/src/posix-metadata.c |
|
+++ b/xlators/storage/posix/src/posix-metadata.c |
|
@@ -416,6 +416,22 @@ posix_set_mdata_xattr(xlator_t *this, const char *real_path, int fd, |
|
* still fine as the times would get eventually |
|
* accurate. |
|
*/ |
|
+ |
|
+ /* Don't create xattr with utimes/utimensat, only update if |
|
+ * present. This otherwise causes issues during inservice |
|
+ * upgrade. It causes inconsistent xattr values within a replica |
|
+ * set. The scenario happens during upgrade where clients are |
|
+ * older versions (without the ctime feature) and the server is |
|
+ * upgraded to the new version (with the ctime feature which |
|
+ * is enabled by default). |
|
+ */ |
|
+ |
|
+ if (update_utime) { |
|
+ UNLOCK(&inode->lock); |
|
+ GF_FREE(mdata); |
|
+ return 0; |
|
+ } |
|
+ |
|
mdata->version = 1; |
|
mdata->flags = 0; |
|
mdata->ctime.tv_sec = time->tv_sec; |
|
@@ -527,6 +543,11 @@ posix_update_utime_in_mdata(xlator_t *this, const char *real_path, int fd, |
|
|
|
priv = this->private; |
|
|
|
+ /* NOTE: |
|
+ * This routine (utimes) is intentionally allowed for all internal and |
|
+ * external clients even if ctime is not set. This is because AFR and |
|
+ * WORM use time attributes for their internal operations |
|
+ */ |
|
if (inode && priv->ctime) { |
|
if ((valid & GF_SET_ATTR_ATIME) == GF_SET_ATTR_ATIME) { |
|
tv.tv_sec = stbuf->ia_atime; |
|
-- |
|
1.8.3.1 |
|
|
|
|