summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
authorAshish Pandey <aspandey@redhat.com>2017-04-03 12:46:29 +0530
committerXavier Hernandez <xhernandez@datalab.es>2017-06-06 14:41:52 +0000
commit88c67b72b1d5843d11ce7cba27dd242bd0c23c6a (patch)
tree5df681630650442107f9c03f52cdc273f8872113 /xlators/cluster
parent4dba0d5f8d9ac13504087f553d7c730060f0f9c7 (diff)
cluster/ec: Update xattr and heal size properly
Problem-1 : Recursive healing of same file is happening when IO is going on even after data heal completes. Solution: RCA: At the end of the write, when ec_update_size_version gets called, we send it only on good bricks and not on healing brick. Due to this, xattr on healing brick will always remain out of sync and when the background heal check source and sink, it finds this brick to be healed and start healing from scratch. That involve ftruncate and writing all of the data again. To solve this, send xattrop on all the good bricks as well as healing bricks. Problem-2: The above fix exposes the data corruption during heal. If the write on a file is going on and heal finishes, we find that the file gets corrupted. RCA: The real problem happens in ec_rebuild_data(). Here we receive the 'size' argument which contains the real file size at the time of starting self-heal and it's assigned to heal->total_size. After that, a sequence of calls to ec_sync_heal_block() are done. Each call ends up calling ec_manager_heal_block(), which does the actual work of healing a block. First a lock on the inode is taken in state EC_STATE_INIT using ec_heal_inodelk(). When the lock is acquired, ec_heal_lock_cbk() is called. This function calls ec_set_inode_size() to store the real size of the inode (it uses heal->total_size). The next step is to read the block to be healed. This is done using a regular ec_readv(). One of the things this call does is to trim the returned size if the file is smaller than the requested size. In our case, when we read the last block of a file whose size was = 512 mod 1024 at the time of starting self-heal, ec_readv() will return only the first 512 bytes, not the whole 1024 bytes. This isn't a problem since the following ec_writev() sent from the heal code only attempts to write the amount of data read, so it shouldn't modify the remaining 512 bytes. However ec_writev() also checks the file size. If we are writing the last block of the file (determined by the size stored on the inode that we have set to heal->total_size), any data beyond the (imposed) end of file will be cleared with 0's. This causes the 512 bytes after the heal->total_size to be cleared. Since the file was written after heal started, the these bytes contained data, so the block written to the damaged brick will be incorrect. Solution: Align heal->total_size to a multiple of the stripe size. Thanks "Xavier Hernandez" <xhernandez@datalab.es> to find out the root cause and to fix the issue. Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e BUG: 1428673 Signed-off-by: Ashish Pandey <aspandey@redhat.com> Reviewed-on: https://review.gluster.org/16985 Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/ec/src/ec-common.c11
-rw-r--r--xlators/cluster/ec/src/ec-heal.c17
2 files changed, 19 insertions, 9 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 2231a8da1f5..284c2cd5a62 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1753,6 +1753,9 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
}
}
+ if (fop->healing) {
+ lock->healing = fop->healing & (fop->good | fop->remaining);
+ }
ec_lock_update_good(lock, fop);
lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
@@ -1950,6 +1953,8 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
ec_lock_t *lock;
ec_inode_t *ctx;
dict_t * dict;
+ uintptr_t update_on = 0;
+
int32_t err = -ENOMEM;
fop = link->fop;
@@ -2003,12 +2008,14 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
fop->frame->root->uid = 0;
fop->frame->root->gid = 0;
+ update_on = lock->good_mask | lock->healing;
+
if (link->lock->fd == NULL) {
- ec_xattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,
+ ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
ec_update_size_version_done, link, &link->lock->loc,
GF_XATTROP_ADD_ARRAY64, dict, NULL);
} else {
- ec_fxattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,
+ ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
ec_update_size_version_done, link, link->lock->fd,
GF_XATTROP_ADD_ARRAY64, dict, NULL);
}
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index afe7833f385..fae31778532 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -1905,7 +1905,7 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
heal->fd = fd_ref (fd);
heal->xl = ec->xl;
heal->data = &barrier;
- syncbarrier_init (heal->data);
+ size = ec_adjust_size (ec, size, 0);
heal->total_size = size;
heal->size = (128 * GF_UNIT_KB * (ec->self_heal_window_size));
/* We need to adjust the size to a multiple of the stripe size of the
@@ -1943,13 +1943,15 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
}
int
-__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
- unsigned char *healed_sinks, unsigned char *trim)
+__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec,
+ fd_t *fd, unsigned char *healed_sinks,
+ unsigned char *trim, uint64_t size)
{
default_args_cbk_t *replies = NULL;
unsigned char *output = NULL;
int ret = 0;
int i = 0;
+ off_t trim_offset = 0;
EC_REPLIES_ALLOC (replies, ec->nodes);
output = alloca0 (ec->nodes);
@@ -1958,9 +1960,9 @@ __ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
ret = 0;
goto out;
}
-
+ trim_offset = ec_adjust_size (ec, size, 1);
ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
- frame, ec->xl, fd, 0, NULL);
+ frame, ec->xl, fd, trim_offset, NULL);
for (i = 0; i < ec->nodes; i++) {
if (!output[i] && trim[i])
healed_sinks[i] = 0;
@@ -2014,7 +2016,7 @@ ec_data_undo_pending (call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&
(memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0) &&
- (size == 0)) {
+ (size_xattr == 0)) {
ret = 0;
goto out;
}
@@ -2220,7 +2222,8 @@ __ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
if (ret < 0)
goto unlock;
- ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks, trim);
+ ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks,
+ trim, size[source]);
}
unlock:
cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,