You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
325 lines
12 KiB
325 lines
12 KiB
3 years ago
|
From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
|
||
|
From: Barak Sason Rofman <bsasonro@redhat.com>
|
||
|
Date: Wed, 6 May 2020 13:28:40 +0300
|
||
|
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
|
||
|
|
||
|
Currently data migration in rebalance reads sparse files sequentially,
|
||
|
disregarding which segments are holes and which are data. This can lead
|
||
|
to extremely long migration times for large sparse files.
|
||
|
Data migration mechanism needs to be enhanced so only data segments are
|
||
|
read and migrated. This can be achieved using lseek to seek for holes
|
||
|
and data in the file.
|
||
|
This enhancement is a consequence of
|
||
|
https://bugzilla.redhat.com/show_bug.cgi?id=1823703
|
||
|
|
||
|
> fixes: #1222
|
||
|
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
|
||
|
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
|
||
|
> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
|
||
|
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
|
||
|
|
||
|
BUG: 1836099
|
||
|
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
|
||
|
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
|
||
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
|
||
|
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
|
||
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||
|
---
|
||
|
tests/basic/distribute/spare_file_rebalance.t | 51 ++++++++
|
||
|
xlators/cluster/dht/src/dht-rebalance.c | 172 ++++++++++++--------------
|
||
|
2 files changed, 130 insertions(+), 93 deletions(-)
|
||
|
create mode 100644 tests/basic/distribute/spare_file_rebalance.t
|
||
|
|
||
|
diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
|
||
|
new file mode 100644
|
||
|
index 0000000..061c02f
|
||
|
--- /dev/null
|
||
|
+++ b/tests/basic/distribute/spare_file_rebalance.t
|
||
|
@@ -0,0 +1,51 @@
|
||
|
+#!/bin/bash
|
||
|
+
|
||
|
+. $(dirname $0)/../../include.rc
|
||
|
+. $(dirname $0)/../../volume.rc
|
||
|
+. $(dirname $0)/../../dht.rc
|
||
|
+
|
||
|
+# Initialize
|
||
|
+#------------------------------------------------------------
|
||
|
+cleanup;
|
||
|
+
|
||
|
+# Start glusterd
|
||
|
+TEST glusterd;
|
||
|
+TEST pidof glusterd;
|
||
|
+TEST $CLI volume info;
|
||
|
+
|
||
|
+# Create a volume
|
||
|
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
|
||
|
+
|
||
|
+# Verify volume creation
|
||
|
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
|
||
|
+EXPECT 'Created' volinfo_field $V0 'Status';
|
||
|
+
|
||
|
+# Start volume and verify successful start
|
||
|
+TEST $CLI volume start $V0;
|
||
|
+EXPECT 'Started' volinfo_field $V0 'Status';
|
||
|
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
|
||
|
+
|
||
|
+#------------------------------------------------------------
|
||
|
+
|
||
|
+# Test case - Create sparse files on MP and verify
|
||
|
+# file info after rebalance
|
||
|
+#------------------------------------------------------------
|
||
|
+
|
||
|
+# Create some sparse files and get their size
|
||
|
+TEST cd $M0;
|
||
|
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
|
||
|
+cp --sparse=always sparse_file sparse_file_3;
|
||
|
+
|
||
|
+# Add a 3rd brick
|
||
|
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
|
||
|
+
|
||
|
+# Trigger rebalance
|
||
|
+TEST $CLI volume rebalance $V0 start force;
|
||
|
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
|
||
|
+
|
||
|
+# Compare original and rebalanced files
|
||
|
+TEST cd $B0/${V0}2
|
||
|
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3
|
||
|
+EXPECT_WITHIN 30 "";
|
||
|
+
|
||
|
+cleanup;
|
||
|
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
|
||
|
index 88b6b54..d0c21b4 100644
|
||
|
--- a/xlators/cluster/dht/src/dht-rebalance.c
|
||
|
+++ b/xlators/cluster/dht/src/dht-rebalance.c
|
||
|
@@ -18,8 +18,8 @@
|
||
|
#include <glusterfs/events.h>
|
||
|
|
||
|
#define GF_DISK_SECTOR_SIZE 512
|
||
|
-#define DHT_REBALANCE_PID 4242 /* Change it if required */
|
||
|
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
|
||
|
+#define DHT_REBALANCE_PID 4242 /* Change it if required */
|
||
|
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
|
||
|
#define MAX_MIGRATE_QUEUE_COUNT 500
|
||
|
#define MIN_MIGRATE_QUEUE_COUNT 200
|
||
|
#define MAX_REBAL_TYPE_SIZE 16
|
||
|
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
-static int
|
||
|
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
|
||
|
- int32_t size, off_t offset, struct iobref *iobref,
|
||
|
- int *fop_errno)
|
||
|
-{
|
||
|
- int i = 0;
|
||
|
- int ret = -1;
|
||
|
- int start_idx = 0;
|
||
|
- int tmp_offset = 0;
|
||
|
- int write_needed = 0;
|
||
|
- int buf_len = 0;
|
||
|
- int size_pending = 0;
|
||
|
- char *buf = NULL;
|
||
|
-
|
||
|
- /* loop through each vector */
|
||
|
- for (i = 0; i < count; i++) {
|
||
|
- buf = vec[i].iov_base;
|
||
|
- buf_len = vec[i].iov_len;
|
||
|
-
|
||
|
- for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
|
||
|
- start_idx += GF_DISK_SECTOR_SIZE) {
|
||
|
- if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
|
||
|
- write_needed = 1;
|
||
|
- continue;
|
||
|
- }
|
||
|
-
|
||
|
- if (write_needed) {
|
||
|
- ret = syncop_write(
|
||
|
- to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
|
||
|
- (offset + tmp_offset), iobref, 0, NULL, NULL);
|
||
|
- /* 'path' will be logged in calling function */
|
||
|
- if (ret < 0) {
|
||
|
- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
|
||
|
- strerror(-ret));
|
||
|
- *fop_errno = -ret;
|
||
|
- ret = -1;
|
||
|
- goto out;
|
||
|
- }
|
||
|
-
|
||
|
- write_needed = 0;
|
||
|
- }
|
||
|
- tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
|
||
|
- }
|
||
|
-
|
||
|
- if ((start_idx < buf_len) || write_needed) {
|
||
|
- /* This means, last chunk is not yet written.. write it */
|
||
|
- ret = syncop_write(to, fd, (buf + tmp_offset),
|
||
|
- (buf_len - tmp_offset), (offset + tmp_offset),
|
||
|
- iobref, 0, NULL, NULL);
|
||
|
- if (ret < 0) {
|
||
|
- /* 'path' will be logged in calling function */
|
||
|
- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
|
||
|
- strerror(-ret));
|
||
|
- *fop_errno = -ret;
|
||
|
- ret = -1;
|
||
|
- goto out;
|
||
|
- }
|
||
|
- }
|
||
|
-
|
||
|
- size_pending = (size - buf_len);
|
||
|
- if (!size_pending)
|
||
|
- break;
|
||
|
- }
|
||
|
-
|
||
|
- ret = size;
|
||
|
-out:
|
||
|
- return ret;
|
||
|
-}
|
||
|
-
|
||
|
/*
|
||
|
return values:
|
||
|
-1 : failure
|
||
|
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
||
|
int ret = 0;
|
||
|
int count = 0;
|
||
|
off_t offset = 0;
|
||
|
+ off_t data_offset = 0;
|
||
|
+ off_t hole_offset = 0;
|
||
|
struct iovec *vector = NULL;
|
||
|
struct iobref *iobref = NULL;
|
||
|
uint64_t total = 0;
|
||
|
size_t read_size = 0;
|
||
|
+ size_t data_block_size = 0;
|
||
|
dict_t *xdata = NULL;
|
||
|
dht_conf_t *conf = NULL;
|
||
|
|
||
|
conf = this->private;
|
||
|
+
|
||
|
/* if file size is '0', no need to enter this loop */
|
||
|
while (total < ia_size) {
|
||
|
- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
|
||
|
- ? DHT_REBALANCE_BLKSIZE
|
||
|
- : (ia_size - total));
|
||
|
+ /* This is a regular file - read it sequentially */
|
||
|
+ if (!hole_exists) {
|
||
|
+ read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
|
||
|
+ ? DHT_REBALANCE_BLKSIZE
|
||
|
+ : (ia_size - total));
|
||
|
+ } else {
|
||
|
+ /* This is a sparse file - read only the data segments in the file
|
||
|
+ */
|
||
|
+
|
||
|
+ /* If the previous data block is fully copied, find the next data
|
||
|
+ * segment
|
||
|
+ * starting at the offset of the last read and written byte, */
|
||
|
+ if (data_block_size <= 0) {
|
||
|
+ ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
|
||
|
+ &data_offset);
|
||
|
+ if (ret) {
|
||
|
+ if (ret == -ENXIO)
|
||
|
+ ret = 0; /* No more data segments */
|
||
|
+ else
|
||
|
+ *fop_errno = -ret; /* Error occurred */
|
||
|
+
|
||
|
+ break;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* If the position of the current data segment is greater than
|
||
|
+ * the position of the next hole, find the next hole in order to
|
||
|
+ * calculate the length of the new data segment */
|
||
|
+ if (data_offset > hole_offset) {
|
||
|
+ /* Starting at the offset of the last data segment, find the
|
||
|
+ * next hole */
|
||
|
+ ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
|
||
|
+ NULL, &hole_offset);
|
||
|
+ if (ret) {
|
||
|
+ /* If an error occurred here it's a real error because
|
||
|
+ * if the seek for a data segment was successful then
|
||
|
+ * necessarily another hole must exist (EOF is a hole)
|
||
|
+ */
|
||
|
+ *fop_errno = -ret;
|
||
|
+ break;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* Calculate the total size of the current data block */
|
||
|
+ data_block_size = hole_offset - data_offset;
|
||
|
+ }
|
||
|
+ } else {
|
||
|
+ /* There is still data in the current segment, move the
|
||
|
+ * data_offset to the position of the last written byte */
|
||
|
+ data_offset = offset;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* Calculate how much data needs to be read and written. If the data
|
||
|
+ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
|
||
|
+ * write DHT_REBALANCE_BLKSIZE data length and the rest in the
|
||
|
+ * next iteration(s) */
|
||
|
+ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
|
||
|
+ ? DHT_REBALANCE_BLKSIZE
|
||
|
+ : data_block_size);
|
||
|
+
|
||
|
+ /* Calculate the remaining size of the data block - maybe there's no
|
||
|
+ * need to seek for data in the next iteration */
|
||
|
+ data_block_size -= read_size;
|
||
|
+
|
||
|
+ /* Set offset to the offset of the data segment so read and write
|
||
|
+ * will have the correct position */
|
||
|
+ offset = data_offset;
|
||
|
+ }
|
||
|
|
||
|
ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
|
||
|
&iobref, NULL, NULL, NULL);
|
||
|
+
|
||
|
if (!ret || (ret < 0)) {
|
||
|
*fop_errno = -ret;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
- if (hole_exists) {
|
||
|
- ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
|
||
|
- iobref, fop_errno);
|
||
|
- } else {
|
||
|
- if (!conf->force_migration && !dht_is_tier_xlator(this)) {
|
||
|
+ if (!conf->force_migration && !dht_is_tier_xlator(this)) {
|
||
|
+ if (!xdata) {
|
||
|
xdata = dict_new();
|
||
|
if (!xdata) {
|
||
|
gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
|
||
|
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
||
|
* https://github.com/gluster/glusterfs/issues/308
|
||
|
* for more details.
|
||
|
*/
|
||
|
- ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
|
||
|
+ ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
|
||
|
if (ret) {
|
||
|
gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
|
||
|
"failed to set dict");
|
||
|
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
-
|
||
|
- ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
|
||
|
- NULL, xdata, NULL);
|
||
|
- if (ret < 0) {
|
||
|
- *fop_errno = -ret;
|
||
|
- }
|
||
|
- }
|
||
|
-
|
||
|
- if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
|
||
|
- (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
|
||
|
- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
|
||
|
- "Migrate file paused");
|
||
|
- ret = -1;
|
||
|
}
|
||
|
|
||
|
+ ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
|
||
|
+ NULL, xdata, NULL);
|
||
|
if (ret < 0) {
|
||
|
+ *fop_errno = -ret;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
--
|
||
|
1.8.3.1
|
||
|
|