You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
324 lines
12 KiB
324 lines
12 KiB
From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001 |
|
From: Barak Sason Rofman <bsasonro@redhat.com> |
|
Date: Wed, 6 May 2020 13:28:40 +0300 |
|
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements |
|
|
|
Currently, data migration in rebalance reads sparse files sequentially, |
|
disregarding which segments are holes and which are data. This can lead |
|
to extremely long migration times for large sparse files. |
|
The data migration mechanism needs to be enhanced so that only data segments |
|
are read and migrated. This can be achieved by using lseek to seek for holes |
|
and data in the file. |
|
This enhancement is a consequence of |
|
https://bugzilla.redhat.com/show_bug.cgi?id=1823703 |
|
|
|
> fixes: #1222 |
|
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3 |
|
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com> |
|
> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40) |
|
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/) |
|
|
|
BUG: 1836099 |
|
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3 |
|
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com> |
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647 |
|
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com> |
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> |
|
--- |
|
tests/basic/distribute/spare_file_rebalance.t | 51 ++++++++ |
|
xlators/cluster/dht/src/dht-rebalance.c | 172 ++++++++++++-------------- |
|
2 files changed, 130 insertions(+), 93 deletions(-) |
|
create mode 100644 tests/basic/distribute/spare_file_rebalance.t |
|
|
|
diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t |
|
new file mode 100644 |
|
index 0000000..061c02f |
|
--- /dev/null |
|
+++ b/tests/basic/distribute/spare_file_rebalance.t |
|
@@ -0,0 +1,51 @@ |
|
+#!/bin/bash |
|
+ |
|
+. $(dirname $0)/../../include.rc |
|
+. $(dirname $0)/../../volume.rc |
|
+. $(dirname $0)/../../dht.rc |
|
+ |
|
+# Initialize |
|
+#------------------------------------------------------------ |
|
+cleanup; |
|
+ |
|
+# Start glusterd |
|
+TEST glusterd; |
|
+TEST pidof glusterd; |
|
+TEST $CLI volume info; |
|
+ |
|
+# Create a volume |
|
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}; |
|
+ |
|
+# Verify volume creation |
|
+EXPECT "$V0" volinfo_field $V0 'Volume Name'; |
|
+EXPECT 'Created' volinfo_field $V0 'Status'; |
|
+ |
|
+# Start volume and verify successful start |
|
+TEST $CLI volume start $V0; |
|
+EXPECT 'Started' volinfo_field $V0 'Status'; |
|
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; |
|
+ |
|
+#------------------------------------------------------------ |
|
+ |
|
+# Test case - Create sparse files on MP and verify |
|
+# file info after rebalance |
|
+#------------------------------------------------------------ |
|
+ |
|
+# Create some sparse files and get their size |
|
+TEST cd $M0; |
|
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M |
|
+cp --sparse=always sparse_file sparse_file_3; |
|
+ |
|
+# Add a 3rd brick |
|
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3; |
|
+ |
|
+# Trigger rebalance |
|
+TEST $CLI volume rebalance $V0 start force; |
|
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed; |
|
+ |
|
+# Compare original and rebalanced files |
|
+TEST cd $B0/${V0}2 |
|
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3 |
|
+EXPECT_WITHIN 30 ""; |
|
+ |
|
+cleanup; |
|
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c |
|
index 88b6b54..d0c21b4 100644 |
|
--- a/xlators/cluster/dht/src/dht-rebalance.c |
|
+++ b/xlators/cluster/dht/src/dht-rebalance.c |
|
@@ -18,8 +18,8 @@ |
|
#include <glusterfs/events.h> |
|
|
|
#define GF_DISK_SECTOR_SIZE 512 |
|
-#define DHT_REBALANCE_PID 4242 /* Change it if required */ |
|
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */ |
|
+#define DHT_REBALANCE_PID 4242 /* Change it if required */ |
|
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */ |
|
#define MAX_MIGRATE_QUEUE_COUNT 500 |
|
#define MIN_MIGRATE_QUEUE_COUNT 200 |
|
#define MAX_REBAL_TYPE_SIZE 16 |
|
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict) |
|
} |
|
} |
|
|
|
-static int |
|
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count, |
|
- int32_t size, off_t offset, struct iobref *iobref, |
|
- int *fop_errno) |
|
-{ |
|
- int i = 0; |
|
- int ret = -1; |
|
- int start_idx = 0; |
|
- int tmp_offset = 0; |
|
- int write_needed = 0; |
|
- int buf_len = 0; |
|
- int size_pending = 0; |
|
- char *buf = NULL; |
|
- |
|
- /* loop through each vector */ |
|
- for (i = 0; i < count; i++) { |
|
- buf = vec[i].iov_base; |
|
- buf_len = vec[i].iov_len; |
|
- |
|
- for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len; |
|
- start_idx += GF_DISK_SECTOR_SIZE) { |
|
- if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) { |
|
- write_needed = 1; |
|
- continue; |
|
- } |
|
- |
|
- if (write_needed) { |
|
- ret = syncop_write( |
|
- to, fd, (buf + tmp_offset), (start_idx - tmp_offset), |
|
- (offset + tmp_offset), iobref, 0, NULL, NULL); |
|
- /* 'path' will be logged in calling function */ |
|
- if (ret < 0) { |
|
- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)", |
|
- strerror(-ret)); |
|
- *fop_errno = -ret; |
|
- ret = -1; |
|
- goto out; |
|
- } |
|
- |
|
- write_needed = 0; |
|
- } |
|
- tmp_offset = start_idx + GF_DISK_SECTOR_SIZE; |
|
- } |
|
- |
|
- if ((start_idx < buf_len) || write_needed) { |
|
- /* This means, last chunk is not yet written.. write it */ |
|
- ret = syncop_write(to, fd, (buf + tmp_offset), |
|
- (buf_len - tmp_offset), (offset + tmp_offset), |
|
- iobref, 0, NULL, NULL); |
|
- if (ret < 0) { |
|
- /* 'path' will be logged in calling function */ |
|
- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)", |
|
- strerror(-ret)); |
|
- *fop_errno = -ret; |
|
- ret = -1; |
|
- goto out; |
|
- } |
|
- } |
|
- |
|
- size_pending = (size - buf_len); |
|
- if (!size_pending) |
|
- break; |
|
- } |
|
- |
|
- ret = size; |
|
-out: |
|
- return ret; |
|
-} |
|
- |
|
/* |
|
return values: |
|
-1 : failure |
|
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, |
|
int ret = 0; |
|
int count = 0; |
|
off_t offset = 0; |
|
+ off_t data_offset = 0; |
|
+ off_t hole_offset = 0; |
|
struct iovec *vector = NULL; |
|
struct iobref *iobref = NULL; |
|
uint64_t total = 0; |
|
size_t read_size = 0; |
|
+ size_t data_block_size = 0; |
|
dict_t *xdata = NULL; |
|
dht_conf_t *conf = NULL; |
|
|
|
conf = this->private; |
|
+ |
|
/* if file size is '0', no need to enter this loop */ |
|
while (total < ia_size) { |
|
- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) |
|
- ? DHT_REBALANCE_BLKSIZE |
|
- : (ia_size - total)); |
|
+ /* This is a regular file - read it sequentially */ |
|
+ if (!hole_exists) { |
|
+ read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) |
|
+ ? DHT_REBALANCE_BLKSIZE |
|
+ : (ia_size - total)); |
|
+ } else { |
|
+ /* This is a sparse file - read only the data segments in the file |
|
+ */ |
|
+ |
|
+ /* If the previous data block is fully copied, find the next data |
|
+ * segment |
|
+ * starting at the offset of the last read and written byte, */ |
|
+ if (data_block_size <= 0) { |
|
+ ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL, |
|
+ &data_offset); |
|
+ if (ret) { |
|
+ if (ret == -ENXIO) |
|
+ ret = 0; /* No more data segments */ |
|
+ else |
|
+ *fop_errno = -ret; /* Error occurred */ |
|
+ |
|
+ break; |
|
+ } |
|
+ |
|
+ /* If the position of the current data segment is greater than |
|
+ * the position of the next hole, find the next hole in order to |
|
+ * calculate the length of the new data segment */ |
|
+ if (data_offset > hole_offset) { |
|
+ /* Starting at the offset of the last data segment, find the |
|
+ * next hole */ |
|
+ ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE, |
|
+ NULL, &hole_offset); |
|
+ if (ret) { |
|
+ /* If an error occurred here it's a real error because |
|
+ * if the seek for a data segment was successful then |
|
+ * necessarily another hole must exist (EOF is a hole) |
|
+ */ |
|
+ *fop_errno = -ret; |
|
+ break; |
|
+ } |
|
+ |
|
+ /* Calculate the total size of the current data block */ |
|
+ data_block_size = hole_offset - data_offset; |
|
+ } |
|
+ } else { |
|
+ /* There is still data in the current segment, move the |
|
+ * data_offset to the position of the last written byte */ |
|
+ data_offset = offset; |
|
+ } |
|
+ |
|
+ /* Calculate how much data needs to be read and written. If the data |
|
+ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and |
|
+ * write DHT_REBALANCE_BLKSIZE data length and the rest in the |
|
+ * next iteration(s) */ |
|
+ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) |
|
+ ? DHT_REBALANCE_BLKSIZE |
|
+ : data_block_size); |
|
+ |
|
+ /* Calculate the remaining size of the data block - maybe there's no |
|
+ * need to seek for data in the next iteration */ |
|
+ data_block_size -= read_size; |
|
+ |
|
+ /* Set offset to the offset of the data segment so read and write |
|
+ * will have the correct position */ |
|
+ offset = data_offset; |
|
+ } |
|
|
|
ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count, |
|
&iobref, NULL, NULL, NULL); |
|
+ |
|
if (!ret || (ret < 0)) { |
|
*fop_errno = -ret; |
|
break; |
|
} |
|
|
|
- if (hole_exists) { |
|
- ret = dht_write_with_holes(to, dst, vector, count, ret, offset, |
|
- iobref, fop_errno); |
|
- } else { |
|
- if (!conf->force_migration && !dht_is_tier_xlator(this)) { |
|
+ if (!conf->force_migration && !dht_is_tier_xlator(this)) { |
|
+ if (!xdata) { |
|
xdata = dict_new(); |
|
if (!xdata) { |
|
gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, |
|
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, |
|
* https://github.com/gluster/glusterfs/issues/308 |
|
* for more details. |
|
*/ |
|
- ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1); |
|
+ ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1); |
|
if (ret) { |
|
gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM, |
|
"failed to set dict"); |
|
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, |
|
break; |
|
} |
|
} |
|
- |
|
- ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL, |
|
- NULL, xdata, NULL); |
|
- if (ret < 0) { |
|
- *fop_errno = -ret; |
|
- } |
|
- } |
|
- |
|
- if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) && |
|
- (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) { |
|
- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED, |
|
- "Migrate file paused"); |
|
- ret = -1; |
|
} |
|
|
|
+ ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL, |
|
+ NULL, xdata, NULL); |
|
if (ret < 0) { |
|
+ *fop_errno = -ret; |
|
break; |
|
} |
|
|
|
-- |
|
1.8.3.1 |
|
|
|
|