summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2015-10-21 21:05:46 +0530
committerJeff Darcy <jdarcy@redhat.com>2015-10-28 06:39:42 -0700
commit641b3a9164227db52df1aab05795c90d06b315f2 (patch)
treee2a55cb4aed7e6376f6a6da55c6493247d99c366
parent8e5a7632edd040031e4942134331172805bc8eff (diff)
afr: write zeros to sink for non-sparse files
Problem: If a file is created with zeroes ('dd', 'fallocate' etc.) when a brick is down, the self-heal does not write the zeroes to the sink after it comes up. Consequenty, there is a mismatch in disk-usage amongst the bricks of the replica. Fix: If we definitely know that the file is not sparse, then write the zeroes to the sink even if the checksums match. Change-Id: Ic739b3da5dbf47d99801c0e1743bb13aeb3af864 BUG: 1272460 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Reviewed-on: http://review.gluster.org/12371 Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com>
-rw-r--r--tests/basic/afr/sparse-file-self-heal.t21
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c57
-rw-r--r--xlators/cluster/afr/src/afr.h2
-rw-r--r--xlators/storage/posix/src/posix.c43
4 files changed, 99 insertions, 24 deletions
diff --git a/tests/basic/afr/sparse-file-self-heal.t b/tests/basic/afr/sparse-file-self-heal.t
index 026ada7fe5e..d540ef786d1 100644
--- a/tests/basic/afr/sparse-file-self-heal.t
+++ b/tests/basic/afr/sparse-file-self-heal.t
@@ -2,6 +2,8 @@
#This file checks if self-heal of files with holes is working properly or not
#bigger is 2M, big is 1M, small is anything less
+#Also tests if non-sparse files with zeroes in it are healed correctly w.r.t
+#disk usage.
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
@@ -43,6 +45,9 @@ big2bigger_md5sum=$(md5sum $M0/big2bigger | awk '{print $1}')
TEST dd if=/dev/urandom of=$M0/FILE count=1 bs=131072
TEST truncate -s 1G $M0/FILE
+#Create a non-sparse file containing zeroes.
+TEST dd if=/dev/zero of=$M0/zeroedfile bs=1024 count=1024
+
$CLI volume start $V0 force
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
@@ -79,6 +84,13 @@ EXPECT "1" has_holes $B0/${V0}0/big2bigger
#Check that self-heal has not written 0s to sink and made it non-sparse.
USED_KB=`du -s $B0/${V0}0/FILE|cut -f1`
TEST [ $USED_KB -lt 1000000 ]
+
+#Check that the non-sparse file consumes the same disk space in both bricks post
+#self-heal
+USED_KB1=`du -s $B0/${V0}0/zeroedfile|cut -f1`
+USED_KB2=`du -s $B0/${V0}1/zeroedfile|cut -f1`
+TEST [ $USED_KB1 -eq $USED_KB2 ]
+
TEST rm -f $M0/*
#check the same tests with diff self-heal
@@ -113,6 +125,9 @@ big2bigger_md5sum=$(md5sum $M0/big2bigger | awk '{print $1}')
TEST dd if=/dev/urandom of=$M0/FILE count=1 bs=131072
TEST truncate -s 1G $M0/FILE
+#Create a non-sparse file containing zeroes.
+TEST dd if=/dev/zero of=$M0/zeroedfile bs=1024 count=1024
+
$CLI volume start $V0 force
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
@@ -145,4 +160,10 @@ EXPECT "0" has_holes $B0/${V0}0/small
USED_KB=`du -s $B0/${V0}0/FILE|cut -f1`
TEST [ $USED_KB -lt 1000000 ]
+#Check that the non-sparse file consumes the same disk space in both bricks post
+#self-heal.
+USED_KB1=`du -s $B0/${V0}0/zeroedfile|cut -f1`
+USED_KB2=`du -s $B0/${V0}1/zeroedfile|cut -f1`
+TEST [ $USED_KB1 -eq $USED_KB2 ]
+
cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 36d658fa454..4529462c3a9 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -28,13 +28,18 @@ __checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dict_t *xdata)
{
afr_local_t *local = NULL;
+ struct afr_reply *replies = NULL;
int i = (long) cookie;
local = frame->local;
-
- local->replies[i].valid = 1;
- local->replies[i].op_ret = op_ret;
- local->replies[i].op_errno = op_errno;
+ replies = local->replies;
+
+ replies[i].valid = 1;
+ replies[i].op_ret = op_ret;
+ replies[i].op_errno = op_errno;
+ if (xdata)
+ replies[i].buf_has_zeroes = dict_get_str_boolean (xdata,
+ "buf-has-zeroes", _gf_false);
if (strong)
memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
@@ -70,19 +75,23 @@ attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
static gf_boolean_t
-__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int source,
- unsigned char *healed_sinks,
- off_t offset, size_t size)
+__afr_can_skip_data_block_heal (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size,
+ struct iatt *poststat)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
unsigned char *wind_subvols = NULL;
+ gf_boolean_t checksum_match = _gf_true;
+ dict_t *xdata = NULL;
int i = 0;
priv = this->private;
local = frame->local;
-
+ xdata = dict_new();
+ if (xdata)
+ i = dict_set_int32 (xdata, "check-zero-filled", 1);
wind_subvols = alloca0 (priv->child_count);
for (i = 0; i < priv->child_count; i++) {
if (i == source || healed_sinks[i])
@@ -90,7 +99,9 @@ __afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this,
}
AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd,
- offset, size, NULL);
+ offset, size, xdata);
+ if (xdata)
+ dict_unref (xdata);
if (!local->replies[source].valid || local->replies[source].op_ret != 0)
return _gf_false;
@@ -101,12 +112,26 @@ __afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this,
if (local->replies[i].valid) {
if (memcmp (local->replies[source].checksum,
local->replies[i].checksum,
- MD5_DIGEST_LENGTH))
- return _gf_false;
+ MD5_DIGEST_LENGTH)) {
+ checksum_match = _gf_false;
+ break;
+ }
}
}
- return _gf_true;
+ if (checksum_match) {
+ if (HAS_HOLES (poststat))
+ return _gf_true;
+
+ /* For non-sparse files, we might be better off writing the
+ * zeroes to sinks to avoid mismatch of disk-usage in bricks. */
+ if (local->replies[source].buf_has_zeroes)
+ return _gf_false;
+ else
+ return _gf_true;
+ }
+
+ return _gf_false;
}
@@ -220,7 +245,6 @@ __afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd,
return ret;
}
-
static int
afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd,
int source, unsigned char *healed_sinks, off_t offset,
@@ -244,8 +268,9 @@ afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
if (type == AFR_SELFHEAL_DATA_DIFF &&
- __afr_selfheal_data_checksums_match (frame, this, fd, source,
- healed_sinks, offset, size)) {
+ __afr_can_skip_data_block_heal (frame, this, fd, source,
+ healed_sinks, offset, size,
+ &replies[source].poststat)) {
ret = 0;
goto unlock;
}
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index bd4ff655546..28147e00579 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -265,7 +265,9 @@ struct afr_reply {
struct iatt preparent;
struct iatt preparent2;
struct iatt postparent2;
+ /* For rchecksum */
uint8_t checksum[MD5_DIGEST_LENGTH];
+ gf_boolean_t buf_has_zeroes;
};
typedef enum {
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index bc4e4904b74..b544e3739c6 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -5895,9 +5895,13 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this,
int op_ret = -1;
int op_errno = 0;
int ret = 0;
+ ssize_t bytes_read = 0;
int32_t weak_checksum = 0;
+ int32_t zerofillcheck = 0;
unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0};
struct posix_private *priv = NULL;
+ dict_t *rsp_xdata = NULL;
+ gf_boolean_t buf_has_zeroes = _gf_false;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -5912,6 +5916,12 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this,
goto out;
}
+ rsp_xdata = dict_new();
+ if (!rsp_xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
ret = posix_fd_ctx_get (fd, this, &pfd);
if (ret < 0) {
gf_msg (this->name, GF_LOG_WARNING, -ret, P_MSG_PFD_NULL,
@@ -5927,12 +5937,12 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this,
if (priv->aio_capable && priv->aio_init_done)
__posix_fd_set_odirect (fd, pfd, 0, offset, len);
- ret = pread (_fd, buf, len, offset);
- if (ret < 0) {
+ bytes_read = pread (_fd, buf, len, offset);
+ if (bytes_read < 0) {
gf_msg (this->name, GF_LOG_WARNING, errno,
P_MSG_PREAD_FAILED,
- "pread of %d bytes returned %d ",
- len, ret);
+ "pread of %d bytes returned %ld ",
+ len, bytes_read);
op_errno = errno;
}
@@ -5940,17 +5950,34 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this,
}
UNLOCK (&fd->lock);
- if (ret < 0)
+ if (bytes_read < 0)
goto out;
+ if (xdata && dict_get_int32 (xdata, "check-zero-filled",
+ &zerofillcheck) == 0) {
+ buf_has_zeroes = (mem_0filled (buf, bytes_read)) ? _gf_false :
+ _gf_true;
+ ret = dict_set_uint32 (rsp_xdata, "buf-has-zeroes",
+ buf_has_zeroes);
+ if (ret) {
+ gf_msg (this->name, GF_LOG_WARNING, -ret,
+ P_MSG_DICT_SET_FAILED, "%s: Failed to set "
+ "dictionary value for key: %s",
+ uuid_utoa (fd->inode->gfid), "buf-has-zeroes");
+ op_errno = -ret;
+ goto out;
+ }
+ }
weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) ret);
- gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) ret, (unsigned char *) strong_checksum);
+ gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) bytes_read,
+ (unsigned char *) strong_checksum);
op_ret = 0;
out:
STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno,
- weak_checksum, strong_checksum, NULL);
-
+ weak_checksum, strong_checksum, rsp_xdata);
+ if (rsp_xdata)
+ dict_unref (rsp_xdata);
GF_FREE (alloc_buf);
return 0;