diff options
author | Pranith Kumar K <pkarampu@redhat.com> | 2017-03-02 07:14:14 +0530 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2017-03-04 07:37:56 -0500 |
commit | 78c5c5637104cf79578d0fb9173647c9c3421177 (patch) | |
tree | 4d45320f5487f900dcc1a21b7087313162da9652 /xlators/cluster/ec | |
parent | aaa5b2ec2f0ef1a62047c9ab91d957c7b0a1552a (diff) |
cluster/ec: Introduce optimistic changelog in EC
Problem: Fix to https://bugzilla.redhat.com/show_bug.cgi?id=1316873 has made
changes to set dirty flag before every update fop, data or metadata, and unset
it after successful operation. That makes some of the fops very slow such as
entry operations or metadata operations.
Solution: File data operations are the only operation which take some time and
setting dirty flag before a fop and unsetting it after serves the purpose as
probability of failure of a fop is high when the time duration is more. For all
the other operations, set dirty flag at the end of the fop, if any brick is
down and need heal.
Providing following option to choose between high performance or better heal
marking for metadata and entry fops.
Set/Unset dirty flag for every update fop at the start of the fop. If ON, this
option impacts performance of entry operations or metadata operations as it
will set dirty flag at the start and unset it at the end of ALL update fop. If
OFF and all the bricks are good, dirty flag will be set at the start only for
file fops For metadata and entry fops dirty flag will not be set at the start,
if all the bricks are good. This does not impact performance for metadata
operations and entry operation but has a very small window to miss marking
entry as dirty in case it is required to be healed.
Thanks to Xavi and Ashish for the design
Picked the .t file from Ashish' patch https://review.gluster.org/16298
BUG: 1408809
Change-Id: I3ce860063f0e2901e50754dcfc3e4ed22daf819f
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://review.gluster.org/16821
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
Tested-by: Xavier Hernandez <xhernandez@datalab.es>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators/cluster/ec')
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 49 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-generic.c | 14 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-types.h | 4 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec.c | 22 |
4 files changed, 83 insertions, 6 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 823922542a0..3ae7f110d99 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -932,16 +932,19 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config) } gf_boolean_t -ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty) +ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, + uint64_t *dirty) { gf_boolean_t set_dirty = _gf_false; if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) { + if (!link->optimistic_changelog) dirty[EC_DATA_TXN] = 1; } if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) { + if (!link->optimistic_changelog) dirty[EC_METADATA_TXN] = 1; } @@ -962,6 +965,7 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie, ec_lock_link_t *link = fop->data; ec_lock_t *lock = NULL; ec_inode_t *ctx; + gf_boolean_t release = _gf_false; lock = link->lock; parent = link->fop; @@ -1055,6 +1059,26 @@ unlock: UNLOCK(&lock->loc.inode->lock); if (op_errno == 0) { + /* If the fop fails on any of the good bricks, it is important to mark + * it dirty and update versions right away if dirty was not set before. + */ + if (lock->good_mask & ~(fop->good | fop->remaining)) { + release = _gf_true; + } + + /* lock->release is a critical field that is checked and modified most + * of the time inside a locked region. This use here is safe because we + * are in a modifying fop and we currently don't allow two modifying + * fops to be processed concurrently, so no one else could be checking + * or modifying it.*/ + if (link->update[0] && !link->dirty[0]) { + lock->release |= release; + } + + if (link->update[1] && !link->dirty[1]) { + lock->release |= release; + } + /* We don't allow the main fop to be executed on bricks that have not * succeeded the initial xattrop. */ parent->mask &= fop->good; @@ -1097,6 +1121,7 @@ void ec_get_size_version(ec_lock_link_t *link) ec_inode_t *ctx; ec_fop_data_t *fop; dict_t *dict = NULL; + ec_t *ec = NULL; int32_t error = 0; gf_boolean_t getting_xattr; gf_boolean_t set_dirty = _gf_false; @@ -1105,6 +1130,17 @@ void ec_get_size_version(ec_lock_link_t *link) lock = link->lock; ctx = lock->ctx; fop = link->fop; + ec = fop->xl->private; + + if (ec->optimistic_changelog && + !(ec->node_mask & ~link->lock->good_mask) && !ec_is_data_fop (fop->id)) + link->optimistic_changelog = _gf_true; + + /* If ctx->have_info is false and lock->query is true, it means that we'll + * send the xattrop anyway, so we can use it to update dirty counts, even + * if it's not necessary to do it right now. */ + if (!ctx->have_info && lock->query) + link->optimistic_changelog = _gf_false; set_dirty = ec_set_dirty_flag (link, ctx, dirty); @@ -1714,6 +1750,13 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk, if (link->update[1]) { ctx->post_version[1]++; } + /* If the fop fails on any of the good bricks, it is important to mark + * it dirty and update versions right away. */ + if (link->update[0] || link->update[1]) { + if (lock->good_mask & ~(fop->good | fop->remaining)) { + lock->release = _gf_true; + } + } } ec_lock_update_good(lock, fop); @@ -2028,9 +2071,13 @@ ec_update_info(ec_lock_link_t *link) if (ctx->dirty[1] != 0) { dirty[1] = -1; } + } else { + link->optimistic_changelog = _gf_false; + ec_set_dirty_flag (link, ctx, dirty); } memset(ctx->dirty, 0, sizeof(ctx->dirty)); } + if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) || (dirty[1] != 0)) { ec_update_size_version(link, version, size, dirty); diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index 3ce3c2ab02a..ddb90ce39cc 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -697,6 +697,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this, ec_fop_data_t * fop = NULL; ec_cbk_data_t * cbk = NULL; int32_t idx = (int32_t)(uintptr_t)cookie; + uint64_t dirty[2] = {0}; VALIDATE_OR_GOTO(this, out); GF_VALIDATE_OR_GOTO(this->name, frame, out); @@ -746,8 +747,7 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this, goto out; } - ec_dict_del_array (xdata, EC_XATTR_DIRTY, cbk->dirty, - EC_VERSION_SIZE); + ec_dict_del_array (xdata, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); } ec_combine(cbk, ec_combine_lookup); @@ -1142,7 +1142,9 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dict_t *xdata) { ec_fop_data_t *fop = NULL; + ec_lock_link_t *link = NULL; ec_cbk_data_t *cbk = NULL; + uint64_t dirty[2] = {0}; data_t *data; uint64_t *version; int32_t idx = (int32_t)(uintptr_t)cookie; @@ -1178,8 +1180,14 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } } - ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty, + ec_dict_del_array (xattr, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE); + link = fop->data; + if (link) { + /*Keep a note of if the dirty is already set or not*/ + link->dirty[0] |= (dirty[0] != 0); + link->dirty[1] |= (dirty[1] != 0); + } } if (xdata) diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index d701fe7d25e..f184f459c2e 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -246,6 +246,8 @@ struct _ec_lock_link { struct list_head owner_list; struct list_head wait_list; gf_boolean_t update[2]; + gf_boolean_t dirty[2]; + gf_boolean_t optimistic_changelog; loc_t *base; uint64_t size; }; @@ -331,7 +333,6 @@ struct _ec_cbk_data { int32_t op_errno; int32_t count; uintptr_t mask; - uint64_t dirty[2]; dict_t *xdata; dict_t *dict; @@ -561,6 +562,7 @@ struct _ec { gf_timer_t *timer; gf_boolean_t shutdown; gf_boolean_t eager_lock; + gf_boolean_t optimistic_changelog; uint32_t background_heals; uint32_t heal_wait_qlen; struct list_head pending_fops; diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index e467fea28b8..01f1473f96d 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -301,6 +301,8 @@ reconfigure (xlator_t *this, dict_t *options) ret = -1; } + GF_OPTION_RECONF ("optimistic-change-log", ec->optimistic_changelog, + options, bool, failed); failed: return ret; } @@ -611,6 +613,7 @@ init (xlator_t *this) this->private = ec; ec->xl = this; + ec->optimistic_changelog = _gf_true; LOCK_INIT(&ec->lock); INIT_LIST_HEAD(&ec->pending_fops); @@ -669,6 +672,7 @@ init (xlator_t *this) GF_OPTION_INIT ("shd-max-threads", ec->shd.max_threads, uint32, failed); GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed); + GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed); this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this); if (!this->itable) @@ -1463,5 +1467,21 @@ struct volume_options options[] = .description = "force the cpu extensions to be used to accelerate the " "galois field computations." }, - { } + { .key = {"optimistic-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Set/Unset dirty flag for every update fop at the start" + "of the fop. If OFF, this option impacts performance of" + "entry operations or metadata operations as it will" + "set dirty flag at the start and unset it at the end of" + "ALL update fop. If ON and all the bricks are good," + "dirty flag will be set at the start only for file fops" + "For metadata and entry fops dirty flag will not be set" + "at the start, if all the bricks are good. This does" + "not impact performance for metadata operations and" + "entry operation but has a very small window to miss" + "marking entry as dirty in case it is required to be" + "healed" + }, + { .key = {NULL} } }; |