diff options
author | Pranith Kumar K <pkarampu@redhat.com> | 2015-04-23 08:30:11 +0530 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2015-04-24 22:39:23 -0700 |
commit | 7efa7e2116856b4cf37797218612a41bdd237e77 (patch) | |
tree | 3e492c2516e3381f34c1cb7b52aa9300e079ca9a /xlators/cluster | |
parent | de9d06cd7cfca0b42beffe003e7c1e09d469ca7e (diff) |
cluster/ec: Perform inode-write on healing subvols
If the version numbers do not match, writes are performed only on the bricks
that have the same version, and there must be at least N-R of them. However, to
heal files that are constantly being modified, writes must also be allowed on
subvols that are undergoing heal. Data heal sets bit 62 of the version while the
heal is in progress. When a data transaction sees that this bit is set, it must
perform the fop on that subvol regardless of whether the versions match. The fop
is considered successful only if it succeeds on at least N-R non-healing bricks.
Change-Id: I69a17582df397aaf6e8ca4b5e746c7ca802cbbde
BUG: 1215265
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/10372
Tested-by: NetBSD Build System
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 60 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 3 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-generic.c | 154 |
3 files changed, 88 insertions, 129 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 7354277da1b..a4bd8dafe28 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -333,6 +333,7 @@ void ec_complete(ec_fop_data_t * fop) { ec_cbk_data_t * cbk = NULL; int32_t resume = 0, update = 0; + int healing_count = 0; LOCK(&fop->lock); @@ -342,11 +343,15 @@ void ec_complete(ec_fop_data_t * fop) if (fop->answer == NULL) { if (!list_empty(&fop->cbk_list)) { cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); - if ((cbk->count >= fop->minimum) && - ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN))) { - fop->answer = cbk; - - update = 1; + healing_count = ec_bits_count (cbk->mask & fop->healing); + if ((cbk->count - healing_count) >= fop->minimum) { + /* fop shouldn't be treated as success if it is not + * successful on at least fop->minimum good copies*/ + if ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN)) { + fop->answer = cbk; + + update = 1; + } } } @@ -431,6 +436,8 @@ int32_t ec_child_select(ec_fop_data_t * fop) } ec->idx = first; + /*Unconditionally wind on healing subvolumes*/ + fop->mask |= fop->healing; fop->remaining = fop->mask; ec_trace("SELECT", fop, ""); @@ -560,17 +567,17 @@ void ec_dispatch_inc(ec_fop_data_t * fop) } } -void ec_dispatch_all(ec_fop_data_t * fop) +void +ec_dispatch_all (ec_fop_data_t *fop) { - ec_dispatch_start(fop); + ec_dispatch_start(fop); - if (ec_child_select(fop)) - { - fop->expected = ec_bits_count(fop->remaining); - fop->first = 0; + if (ec_child_select(fop)) { + fop->expected = ec_bits_count(fop->remaining); + fop->first = 0; - ec_dispatch_mask(fop, fop->remaining); - } + ec_dispatch_mask(fop, fop->remaining); + } } void ec_dispatch_min(ec_fop_data_t * fop) @@ -1052,9 +1059,27 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie, return 0; } -int32_t ec_prepare_update_cbk(call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t 
*dict, dict_t *xdata) +gf_boolean_t +ec_is_data_fop (glusterfs_fop_t fop) +{ + switch (fop) { + case GF_FOP_WRITE: + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: + case GF_FOP_ZEROFILL: + return _gf_true; + default: + return _gf_false; + } + return _gf_false; +} + +int32_t +ec_prepare_update_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) { ec_fop_data_t *fop = cookie, *parent; ec_lock_t *lock = NULL; @@ -1091,6 +1116,9 @@ int32_t ec_prepare_update_cbk(call_frame_t *frame, void *cookie, UNLOCK(&lock->loc.inode->lock); fop->parent->mask &= fop->good; + /*As of now only data healing marks bricks as healing*/ + if (ec_is_data_fop (fop->parent->id)) + fop->parent->healing |= fop->healing; fop->parent->pre_size = fop->parent->post_size = lock->size; fop->parent->have_size = 1; diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index ac59a6b2e14..80936aeaada 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -192,6 +192,9 @@ struct _ec_fop_data uint32_t flags; uint32_t first; uintptr_t mask; + uintptr_t healing; /*Dispatch is done but call is successful only + if fop->minimum number of subvolumes succeed + which are not healing*/ uintptr_t remaining; uintptr_t good; uintptr_t bad; diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index 73df0d89db8..f80770a3365 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -16,7 +16,9 @@ #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" +#include "byte-order.h" +#define EC_SELFHEAL_BIT 62 /* FOP: flush */ int32_t ec_flush_cbk(call_frame_t * frame, void * cookie, xlator_t * this, @@ -1308,69 +1310,57 @@ int32_t ec_combine_xattrop(ec_fop_data_t * fop, ec_cbk_data_t * dst, return 1; } -int32_t ec_xattrop_cbk(call_frame_t * frame, void * cookie, 
xlator_t * this, - int32_t op_ret, int32_t op_errno, dict_t * xattr, - dict_t * xdata) +int32_t +ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { - ec_fop_data_t * fop = NULL; - ec_cbk_data_t * cbk = NULL; - int32_t idx = (int32_t)(uintptr_t)cookie; + ec_fop_data_t *fop = NULL; + ec_cbk_data_t *cbk = NULL; + int32_t idx = (int32_t)(uintptr_t)cookie; + uint64_t version = 0; + uint64_t *version_xattr = 0; - VALIDATE_OR_GOTO(this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - GF_VALIDATE_OR_GOTO(this->name, frame->local, out); - GF_VALIDATE_OR_GOTO(this->name, this->private, out); + VALIDATE_OR_GOTO (this, out); + GF_VALIDATE_OR_GOTO (this->name, frame, out); + GF_VALIDATE_OR_GOTO (this->name, frame->local, out); + GF_VALIDATE_OR_GOTO (this->name, this->private, out); - fop = frame->local; + fop = frame->local; - ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, - frame, op_ret, op_errno); + ec_trace ("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, + frame, op_ret, op_errno); - cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_XATTROP, idx, op_ret, - op_errno); - if (cbk != NULL) - { - if (op_ret >= 0) - { - if (xattr != NULL) - { - cbk->dict = dict_ref(xattr); - if (cbk->dict == NULL) - { - gf_log(this->name, GF_LOG_ERROR, "Failed to reference a " - "dictionary."); + cbk = ec_cbk_data_allocate (frame, this, fop, fop->id, idx, op_ret, + op_errno); + if (!cbk) + goto out; - goto out; - } - } - } - if (xdata != NULL) - { - uint64_t dirty; + if (op_ret >= 0) { + uint64_t dirty; + cbk->dict = dict_ref (xattr); - cbk->xdata = dict_ref(xdata); - if (cbk->xdata == NULL) - { - gf_log(this->name, GF_LOG_ERROR, "Failed to reference a " - "dictionary."); + if (dict_get_bin (xattr, EC_XATTR_VERSION, + (void **)&version_xattr) == 0) { + version = ntoh64(version_xattr[0]); + if ((version >> EC_SELFHEAL_BIT) & 1) + fop->healing |= (1ULL<<idx); + } - 
goto out; - } - if (ec_dict_del_number(cbk->xdata, EC_XATTR_DIRTY, &dirty) == 0) { - cbk->dirty = dirty != 0; - } + if (ec_dict_del_number (xattr, EC_XATTR_DIRTY, &dirty) == 0) + cbk->dirty = dirty != 0; } - ec_combine(cbk, ec_combine_xattrop); - } + if (xdata) + cbk->xdata = dict_ref(xdata); + + ec_combine (cbk, ec_combine_xattrop); out: - if (fop != NULL) - { - ec_complete(fop); - } + if (fop) + ec_complete(fop); - return 0; + return 0; } void ec_wind_xattrop(ec_t * ec, ec_fop_data_t * fop, int32_t idx) @@ -1576,73 +1566,11 @@ out: } } -/* FOP: fxattrop */ - -int32_t ec_fxattrop_cbk(call_frame_t * frame, void * cookie, xlator_t * this, - int32_t op_ret, int32_t op_errno, dict_t * xattr, - dict_t * xdata) -{ - ec_fop_data_t * fop = NULL; - ec_cbk_data_t * cbk = NULL; - int32_t idx = (int32_t)(uintptr_t)cookie; - - VALIDATE_OR_GOTO(this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - GF_VALIDATE_OR_GOTO(this->name, frame->local, out); - GF_VALIDATE_OR_GOTO(this->name, this->private, out); - - fop = frame->local; - - ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx, - frame, op_ret, op_errno); - - cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FXATTROP, idx, op_ret, - op_errno); - if (cbk != NULL) - { - if (op_ret >= 0) - { - if (xattr != NULL) - { - cbk->dict = dict_ref(xattr); - if (cbk->dict == NULL) - { - gf_log(this->name, GF_LOG_ERROR, "Failed to reference a " - "dictionary."); - - goto out; - } - } - } - if (xdata != NULL) - { - cbk->xdata = dict_ref(xdata); - if (cbk->xdata == NULL) - { - gf_log(this->name, GF_LOG_ERROR, "Failed to reference a " - "dictionary."); - - goto out; - } - } - - ec_combine(cbk, ec_combine_xattrop); - } - -out: - if (fop != NULL) - { - ec_complete(fop); - } - - return 0; -} - void ec_wind_fxattrop(ec_t * ec, ec_fop_data_t * fop, int32_t idx) { ec_trace("WIND", fop, "idx=%d", idx); - STACK_WIND_COOKIE(fop->frame, ec_fxattrop_cbk, (void *)(uintptr_t)idx, + STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, 
(void *)(uintptr_t)idx, ec->xl_list[idx], ec->xl_list[idx]->fops->fxattrop, fop->fd, fop->xattrop_flags, fop->dict, fop->xdata); } |