From 0f84f8e8048367737a2dd6ddf0c57403e757441d Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Fri, 23 Jan 2015 11:12:54 +0530 Subject: afr: Don't write to sparse regions of sink. Problem: When data-self-heal-algorithm is set to 'full', shd just reads from source and writes to sink. If source file happened to be sparse (VM workloads), we end up actually writing 0s to the corresponding regions of the sink causing it to lose its sparseness. Fix: If the source file is sparse, and the data read from source and sink are both zeros for that range, skip writing that range to the sink. Change-Id: I787b06a553803247f43a40c00139cb483a22f9ca BUG: 1166020 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/9480 Tested-by: Gluster Build System Reviewed-by: Pranith Kumar Karampuri Tested-by: Pranith Kumar Karampuri --- tests/basic/afr/sparse-file-self-heal.t | 17 ++++++++++++ xlators/cluster/afr/src/afr-self-heal-data.c | 41 ++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/tests/basic/afr/sparse-file-self-heal.t b/tests/basic/afr/sparse-file-self-heal.t index 726af0710d5..1bc915e062c 100644 --- a/tests/basic/afr/sparse-file-self-heal.t +++ b/tests/basic/afr/sparse-file-self-heal.t @@ -17,6 +17,7 @@ TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; TEST dd if=/dev/urandom of=$M0/small count=1 bs=1024k TEST dd if=/dev/urandom of=$M0/bigger2big count=1 bs=2048k TEST dd if=/dev/urandom of=$M0/big2bigger count=1 bs=1024k +TEST truncate -s 1G $M0/FILE TEST kill_brick $V0 $H0 $B0/${V0}0 @@ -38,6 +39,10 @@ bigger2big_md5sum=$(md5sum $M0/bigger2big | awk '{print $1}') TEST truncate -s 2M $M0/big2bigger big2bigger_md5sum=$(md5sum $M0/big2bigger | awk '{print $1}') +#Write data to file and restore its sparseness +TEST dd if=/dev/urandom of=$M0/FILE count=1 bs=131072 +TEST truncate -s 1G $M0/FILE + $CLI volume start $V0 force EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status @@ -66,6 +71,9 @@ EXPECT "0" has_holes $B0/${V0}0/small EXPECT "0" has_holes $B0/${V0}0/bigger2big EXPECT "1" has_holes $B0/${V0}0/big2bigger +#Check that self-heal has not written 0s to sink and made it non-sparse. +USED_KB=`du -s $B0/${V0}0/FILE|cut -f1` +TEST [ $USED_KB -lt 1000000 ] TEST rm -f $M0/* #check the same tests with diff self-heal @@ -74,6 +82,7 @@ TEST $CLI volume set $V0 data-self-heal-algorithm diff TEST dd if=/dev/urandom of=$M0/small count=1 bs=1024k TEST dd if=/dev/urandom of=$M0/big2bigger count=1 bs=1024k TEST dd if=/dev/urandom of=$M0/bigger2big count=1 bs=2048k +TEST truncate -s 1G $M0/FILE TEST kill_brick $V0 $H0 $B0/${V0}0 @@ -95,6 +104,10 @@ bigger2big_md5sum=$(md5sum $M0/bigger2big | awk '{print $1}') TEST truncate -s 2M $M0/big2bigger big2bigger_md5sum=$(md5sum $M0/big2bigger | awk '{print $1}') +#Write data to file and restore its sparseness +TEST dd if=/dev/urandom of=$M0/FILE count=1 bs=131072 +TEST truncate -s 1G $M0/FILE + $CLI volume start $V0 force EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status @@ -118,4 +131,8 @@ EXPECT "1" has_holes $B0/${V0}0/big2bigger EXPECT "0" has_holes $B0/${V0}0/bigger2big EXPECT "0" has_holes $B0/${V0}0/small +#Check that self-heal has not written 0s to sink and made it non-sparse. +USED_KB=`du -s $B0/${V0}0/FILE|cut -f1` +TEST [ $USED_KB -lt 1000000 ] + cleanup diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 45a099cec86..1b1d57d0048 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -112,11 +112,38 @@ __afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, } +static gf_boolean_t +__afr_is_sink_zero_filled (xlator_t *this, fd_t *fd, size_t size, + off_t offset, int sink) +{ + afr_private_t *priv = NULL; + struct iobref *iobref = NULL; + struct iovec *iovec = NULL; + int count = 0; + int ret = 0; + gf_boolean_t zero_filled = _gf_false; + + priv = this->private; + ret = syncop_readv (priv->children[sink], fd, size, offset, 0, &iovec, + &count, &iobref); + if (ret < 0) + goto out; + ret = iov_0filled (iovec, count); + if (!ret) + zero_filled = _gf_true; +out: + if (iovec) + GF_FREE (iovec); + if (iobref) + iobref_unref (iobref); + return zero_filled; +} + static int __afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, int source, unsigned char *healed_sinks, off_t offset, size_t size, - struct afr_reply *replies) + struct afr_reply *replies, int type) { struct iovec *iovec = NULL; int count = 0; @@ -166,6 +193,16 @@ __afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, (iov_0filled (iovec, count) == 0)) continue; + /* Avoid filling up sparse regions of the sink with 0-filled + * writes.*/ + if (type == AFR_SELFHEAL_DATA_FULL && + HAS_HOLES ((&replies[source].poststat)) && + ((offset + size) <= replies[i].poststat.ia_size) && + (iov_0filled (iovec, count) == 0) && + __afr_is_sink_zero_filled (this, fd, size, offset, i)) { + continue; + } + ret = syncop_writev (priv->children[i], fd, iovec, count, offset, iobref, 0); if (ret != iov_length (iovec, count)) { @@ -217,7 +254,7 @@ afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, ret = __afr_selfheal_data_read_write (frame, this, fd, source, healed_sinks, offset, size, - replies); + replies, type); } unlock: afr_selfheal_uninodelk (frame, this, fd->inode, this->name, -- cgit