summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Pranith Kumar K <pkarampu@redhat.com> 2015-04-23 08:30:11 +0530
committer: Pranith Kumar Karampuri <pkarampu@redhat.com> 2015-05-07 03:28:06 -0700
commit: 3994afd6140c828edceca271974f5a90c4c70fad (patch)
tree: 1250c490ea7a2fb8789d7b2b6e0a471603fc31be
parent: b6693de93a6f430bae4bfa9f4e58cf061664249b (diff)
cluster/ec: Perform inode-write on healing subvols
Backport of http://review.gluster.org/10372. If the version numbers do not match, writes are performed only on the bricks (at least N-R of them) that have the same version. But to heal files that are constantly being modified, writes must also be allowed on subvolumes that are undergoing heal. Data healing sets bit 62 of the version while the heal is in progress. When a data transaction sees that this bit is set, it must perform the fop on that subvolume irrespective of whether the versions match. The fop is considered successful only if N-R non-healing bricks succeed. BUG: 1216303 Change-Id: I79aaf1ac86357c51547cdaaa56cf7338004cc512 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/10440 Tested-by: Gluster Build System <jenkins@build.gluster.com> Tested-by: NetBSD Build System
-rw-r--r-- xlators/cluster/ec/src/ec-common.c 60
-rw-r--r-- xlators/cluster/ec/src/ec-data.h 3
-rw-r--r-- xlators/cluster/ec/src/ec-generic.c 154
3 files changed, 88 insertions, 129 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 7354277da1b..a4bd8dafe28 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -333,6 +333,7 @@ void ec_complete(ec_fop_data_t * fop)
{
ec_cbk_data_t * cbk = NULL;
int32_t resume = 0, update = 0;
+ int healing_count = 0;
LOCK(&fop->lock);
@@ -342,11 +343,15 @@ void ec_complete(ec_fop_data_t * fop)
if (fop->answer == NULL) {
if (!list_empty(&fop->cbk_list)) {
cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
- if ((cbk->count >= fop->minimum) &&
- ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN))) {
- fop->answer = cbk;
-
- update = 1;
+ healing_count = ec_bits_count (cbk->mask & fop->healing);
+ if ((cbk->count - healing_count) >= fop->minimum) {
+ /* A fop must not be treated as successful unless it
+ * succeeded on at least fop->minimum non-healing copies. */
+ if ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN)) {
+ fop->answer = cbk;
+
+ update = 1;
+ }
}
}
@@ -431,6 +436,8 @@ int32_t ec_child_select(ec_fop_data_t * fop)
}
ec->idx = first;
+ /*Unconditionally wind on healing subvolumes*/
+ fop->mask |= fop->healing;
fop->remaining = fop->mask;
ec_trace("SELECT", fop, "");
@@ -560,17 +567,17 @@ void ec_dispatch_inc(ec_fop_data_t * fop)
}
}
-void ec_dispatch_all(ec_fop_data_t * fop)
+void
+ec_dispatch_all (ec_fop_data_t *fop)
{
- ec_dispatch_start(fop);
+ ec_dispatch_start(fop);
- if (ec_child_select(fop))
- {
- fop->expected = ec_bits_count(fop->remaining);
- fop->first = 0;
+ if (ec_child_select(fop)) {
+ fop->expected = ec_bits_count(fop->remaining);
+ fop->first = 0;
- ec_dispatch_mask(fop, fop->remaining);
- }
+ ec_dispatch_mask(fop, fop->remaining);
+ }
}
void ec_dispatch_min(ec_fop_data_t * fop)
@@ -1052,9 +1059,27 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,
return 0;
}
-int32_t ec_prepare_update_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+gf_boolean_t
+ec_is_data_fop (glusterfs_fop_t fop)
+{
+ switch (fop) {
+ case GF_FOP_WRITE:
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_FTRUNCATE:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
+ case GF_FOP_ZEROFILL:
+ return _gf_true;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
+
+int32_t
+ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
{
ec_fop_data_t *fop = cookie, *parent;
ec_lock_t *lock = NULL;
@@ -1091,6 +1116,9 @@ int32_t ec_prepare_update_cbk(call_frame_t *frame, void *cookie,
UNLOCK(&lock->loc.inode->lock);
fop->parent->mask &= fop->good;
+ /* For now only data healing marks bricks as healing, so only data fops inherit the healing mask. */
+ if (ec_is_data_fop (fop->parent->id))
+ fop->parent->healing |= fop->healing;
fop->parent->pre_size = fop->parent->post_size = lock->size;
fop->parent->have_size = 1;
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index ac59a6b2e14..80936aeaada 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -192,6 +192,9 @@ struct _ec_fop_data
uint32_t flags;
uint32_t first;
uintptr_t mask;
+ uintptr_t healing; /* Mask of subvolumes under heal: the fop is still
+ dispatched to them, but it succeeds only if at least
+ fop->minimum non-healing subvolumes succeed. */
uintptr_t remaining;
uintptr_t good;
uintptr_t bad;
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 948eaf4974c..c22fb0a950f 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -16,7 +16,9 @@
#include "ec-combine.h"
#include "ec-method.h"
#include "ec-fops.h"
+#include "byte-order.h"
+#define EC_SELFHEAL_BIT 62
/* FOP: flush */
int32_t ec_flush_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
@@ -1311,69 +1313,57 @@ int32_t ec_combine_xattrop(ec_fop_data_t * fop, ec_cbk_data_t * dst,
return 1;
}
-int32_t ec_xattrop_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, dict_t * xattr,
- dict_t * xdata)
+int32_t
+ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
{
- ec_fop_data_t * fop = NULL;
- ec_cbk_data_t * cbk = NULL;
- int32_t idx = (int32_t)(uintptr_t)cookie;
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+ uint64_t version = 0;
+ uint64_t *version_xattr = 0;
- VALIDATE_OR_GOTO(this, out);
- GF_VALIDATE_OR_GOTO(this->name, frame, out);
- GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ VALIDATE_OR_GOTO (this, out);
+ GF_VALIDATE_OR_GOTO (this->name, frame, out);
+ GF_VALIDATE_OR_GOTO (this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
- fop = frame->local;
+ fop = frame->local;
- ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
- frame, op_ret, op_errno);
+ ec_trace ("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
+ frame, op_ret, op_errno);
- cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_XATTROP, idx, op_ret,
- op_errno);
- if (cbk != NULL)
- {
- if (op_ret >= 0)
- {
- if (xattr != NULL)
- {
- cbk->dict = dict_ref(xattr);
- if (cbk->dict == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
+ cbk = ec_cbk_data_allocate (frame, this, fop, fop->id, idx, op_ret,
+ op_errno);
+ if (!cbk)
+ goto out;
- goto out;
- }
- }
- }
- if (xdata != NULL)
- {
- uint64_t dirty;
+ if (op_ret >= 0) {
+ uint64_t dirty;
+ cbk->dict = dict_ref (xattr);
- cbk->xdata = dict_ref(xdata);
- if (cbk->xdata == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
+ if (dict_get_bin (xattr, EC_XATTR_VERSION,
+ (void **)&version_xattr) == 0) {
+ version = ntoh64(version_xattr[0]);
+ if ((version >> EC_SELFHEAL_BIT) & 1)
+ fop->healing |= (1ULL<<idx);
+ }
- goto out;
- }
- if (ec_dict_del_number(cbk->xdata, EC_XATTR_DIRTY, &dirty) == 0) {
- cbk->dirty = dirty != 0;
- }
+ if (ec_dict_del_number (xattr, EC_XATTR_DIRTY, &dirty) == 0)
+ cbk->dirty = dirty != 0;
}
- ec_combine(cbk, ec_combine_xattrop);
- }
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+
+ ec_combine (cbk, ec_combine_xattrop);
out:
- if (fop != NULL)
- {
- ec_complete(fop);
- }
+ if (fop)
+ ec_complete(fop);
- return 0;
+ return 0;
}
void ec_wind_xattrop(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
@@ -1579,73 +1569,11 @@ out:
}
}
-/* FOP: fxattrop */
-
-int32_t ec_fxattrop_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, dict_t * xattr,
- dict_t * xdata)
-{
- ec_fop_data_t * fop = NULL;
- ec_cbk_data_t * cbk = NULL;
- int32_t idx = (int32_t)(uintptr_t)cookie;
-
- VALIDATE_OR_GOTO(this, out);
- GF_VALIDATE_OR_GOTO(this->name, frame, out);
- GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
-
- fop = frame->local;
-
- ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
- frame, op_ret, op_errno);
-
- cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FXATTROP, idx, op_ret,
- op_errno);
- if (cbk != NULL)
- {
- if (op_ret >= 0)
- {
- if (xattr != NULL)
- {
- cbk->dict = dict_ref(xattr);
- if (cbk->dict == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
-
- goto out;
- }
- }
- }
- if (xdata != NULL)
- {
- cbk->xdata = dict_ref(xdata);
- if (cbk->xdata == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
-
- goto out;
- }
- }
-
- ec_combine(cbk, ec_combine_xattrop);
- }
-
-out:
- if (fop != NULL)
- {
- ec_complete(fop);
- }
-
- return 0;
-}
-
void ec_wind_fxattrop(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
{
ec_trace("WIND", fop, "idx=%d", idx);
- STACK_WIND_COOKIE(fop->frame, ec_fxattrop_cbk, (void *)(uintptr_t)idx,
+ STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, (void *)(uintptr_t)idx,
ec->xl_list[idx], ec->xl_list[idx]->fops->fxattrop,
fop->fd, fop->xattrop_flags, fop->dict, fop->xdata);
}