diff options
-rw-r--r-- | tests/basic/afr/sparse-file-self-heal.t | 21 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 57 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 2 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix.c | 43 |
4 files changed, 99 insertions, 24 deletions
diff --git a/tests/basic/afr/sparse-file-self-heal.t b/tests/basic/afr/sparse-file-self-heal.t index 4101e6d1db7..a8fc095ad26 100644 --- a/tests/basic/afr/sparse-file-self-heal.t +++ b/tests/basic/afr/sparse-file-self-heal.t @@ -2,6 +2,8 @@ #This file checks if self-heal of files with holes is working properly or not #bigger is 2M, big is 1M, small is anything less +#Also tests if non-sparse files with zeroes in it are healed correctly w.r.t +#disk usage. . $(dirname $0)/../../include.rc . $(dirname $0)/../../volume.rc @@ -43,6 +45,9 @@ big2bigger_md5sum=$(md5sum $M0/big2bigger | awk '{print $1}') TEST dd if=/dev/urandom of=$M0/FILE count=1 bs=131072 TEST truncate -s 1G $M0/FILE +#Create a non-sparse file containing zeroes. +TEST dd if=/dev/zero of=$M0/zeroedfile bs=1024 count=1024 + $CLI volume start $V0 force EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status @@ -74,6 +79,13 @@ EXPECT "1" has_holes $B0/${V0}0/big2bigger #Check that self-heal has not written 0s to sink and made it non-sparse. USED_KB=`du -s $B0/${V0}0/FILE|cut -f1` TEST [ $USED_KB -lt 1000000 ] + +#Check that the non-sparse file consumes the same disk space in both bricks post +#self-heal +USED_KB1=`du -s $B0/${V0}0/zeroedfile|cut -f1` +USED_KB2=`du -s $B0/${V0}1/zeroedfile|cut -f1` +TEST [ $USED_KB1 -eq $USED_KB2 ] + TEST rm -f $M0/* #check the same tests with diff self-heal @@ -108,6 +120,9 @@ big2bigger_md5sum=$(md5sum $M0/big2bigger | awk '{print $1}') TEST dd if=/dev/urandom of=$M0/FILE count=1 bs=131072 TEST truncate -s 1G $M0/FILE +#Create a non-sparse file containing zeroes. 
+TEST dd if=/dev/zero of=$M0/zeroedfile bs=1024 count=1024 + $CLI volume start $V0 force EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status @@ -135,4 +150,10 @@ EXPECT "0" has_holes $B0/${V0}0/small USED_KB=`du -s $B0/${V0}0/FILE|cut -f1` TEST [ $USED_KB -lt 1000000 ] +#Check that the non-sparse file consumes the same disk space in both bricks post +#self-heal. +USED_KB1=`du -s $B0/${V0}0/zeroedfile|cut -f1` +USED_KB2=`du -s $B0/${V0}1/zeroedfile|cut -f1` +TEST [ $USED_KB1 -eq $USED_KB2 ] + cleanup diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 279f2faaaaf..6a3d6e13b1b 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -33,13 +33,18 @@ __checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dict_t *xdata) { afr_local_t *local = NULL; + struct afr_reply *replies = NULL; int i = (long) cookie; local = frame->local; - - local->replies[i].valid = 1; - local->replies[i].op_ret = op_ret; - local->replies[i].op_errno = op_errno; + replies = local->replies; + + replies[i].valid = 1; + replies[i].op_ret = op_ret; + replies[i].op_errno = op_errno; + if (xdata) + replies[i].buf_has_zeroes = dict_get_str_boolean (xdata, + "buf-has-zeroes", _gf_false); if (strong) memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); @@ -75,19 +80,23 @@ attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static gf_boolean_t -__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, - fd_t *fd, int source, - unsigned char *healed_sinks, - off_t offset, size_t size) +__afr_can_skip_data_block_heal (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct iatt *poststat) { afr_private_t *priv = NULL; afr_local_t *local = NULL; unsigned char *wind_subvols = NULL; + gf_boolean_t checksum_match 
= _gf_true; + dict_t *xdata = NULL; int i = 0; priv = this->private; local = frame->local; - + xdata = dict_new(); + if (xdata) + i = dict_set_int32 (xdata, "check-zero-filled", 1); wind_subvols = alloca0 (priv->child_count); for (i = 0; i < priv->child_count; i++) { if (i == source || healed_sinks[i]) @@ -95,7 +104,9 @@ __afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, } AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd, - offset, size, NULL); + offset, size, xdata); + if (xdata) + dict_unref (xdata); if (!local->replies[source].valid || local->replies[source].op_ret != 0) return _gf_false; @@ -106,12 +117,26 @@ __afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, if (local->replies[i].valid) { if (memcmp (local->replies[source].checksum, local->replies[i].checksum, - MD5_DIGEST_LENGTH)) - return _gf_false; + MD5_DIGEST_LENGTH)) { + checksum_match = _gf_false; + break; + } } } - return _gf_true; + if (checksum_match) { + if (HAS_HOLES (poststat)) + return _gf_true; + + /* For non-sparse files, we might be better off writing the + * zeroes to sinks to avoid mismatch of disk-usage in bricks. 
*/ + if (local->replies[source].buf_has_zeroes) + return _gf_false; + else + return _gf_true; + } + + return _gf_false; } @@ -225,7 +250,6 @@ __afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, return ret; } - static int afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, int source, unsigned char *healed_sinks, off_t offset, @@ -249,8 +273,9 @@ afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, } if (type == AFR_SELFHEAL_DATA_DIFF && - __afr_selfheal_data_checksums_match (frame, this, fd, source, - healed_sinks, offset, size)) { + __afr_can_skip_data_block_heal (frame, this, fd, source, + healed_sinks, offset, size, + &replies[source].poststat)) { ret = 0; goto unlock; } diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 9ee5ae36df0..c2fd1166d96 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -270,7 +270,9 @@ struct afr_reply { struct iatt preparent; struct iatt preparent2; struct iatt postparent2; + /* For rchecksum */ uint8_t checksum[MD5_DIGEST_LENGTH]; + gf_boolean_t buf_has_zeroes; }; typedef enum { diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 32eb18471e8..4a01e9f036f 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -5906,9 +5906,13 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this, int op_ret = -1; int op_errno = 0; int ret = 0; + ssize_t bytes_read = 0; int32_t weak_checksum = 0; + int32_t zerofillcheck = 0; unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; struct posix_private *priv = NULL; + dict_t *rsp_xdata = NULL; + gf_boolean_t buf_has_zeroes = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -5923,6 +5927,12 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this, goto out; } + rsp_xdata = dict_new(); + if (!rsp_xdata) { + op_errno = ENOMEM; + goto out; + } + ret = posix_fd_ctx_get (fd, this, &pfd); 
if (ret < 0) { gf_msg (this->name, GF_LOG_WARNING, -ret, P_MSG_PFD_NULL, @@ -5938,12 +5948,12 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this, if (priv->aio_capable && priv->aio_init_done) __posix_fd_set_odirect (fd, pfd, 0, offset, len); - ret = pread (_fd, buf, len, offset); - if (ret < 0) { + bytes_read = pread (_fd, buf, len, offset); + if (bytes_read < 0) { gf_msg (this->name, GF_LOG_WARNING, errno, P_MSG_PREAD_FAILED, - "pread of %d bytes returned %d ", - len, ret); + "pread of %d bytes returned %zd ", + len, bytes_read); op_errno = errno; } @@ -5951,17 +5961,34 @@ posix_rchecksum (call_frame_t *frame, xlator_t *this, } UNLOCK (&fd->lock); - if (ret < 0) + if (bytes_read < 0) goto out; + if (xdata && dict_get_int32 (xdata, "check-zero-filled", + &zerofillcheck) == 0) { + buf_has_zeroes = (mem_0filled (buf, bytes_read)) ? _gf_false : + _gf_true; + ret = dict_set_uint32 (rsp_xdata, "buf-has-zeroes", + buf_has_zeroes); + if (ret) { + gf_msg (this->name, GF_LOG_WARNING, -ret, + P_MSG_DICT_SET_FAILED, "%s: Failed to set " + "dictionary value for key: %s", + uuid_utoa (fd->inode->gfid), "buf-has-zeroes"); + op_errno = -ret; + goto out; + } + } - weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) ret); - gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) ret, (unsigned char *) strong_checksum); + weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, (size_t) bytes_read); + gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) bytes_read, + (unsigned char *) strong_checksum); op_ret = 0; out: STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, - weak_checksum, strong_checksum, NULL); - + weak_checksum, strong_checksum, rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); GF_FREE (alloc_buf); return 0; |