summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2015-04-23 08:30:11 +0530
committerVijay Bellur <vbellur@redhat.com>2015-04-24 22:39:23 -0700
commit7efa7e2116856b4cf37797218612a41bdd237e77 (patch)
tree3e492c2516e3381f34c1cb7b52aa9300e079ca9a
parentde9d06cd7cfca0b42beffe003e7c1e09d469ca7e (diff)
cluster/ec: Perform inode-write on healing subvols
If the version numbers do not match, then writes are performed only on at least N-R bricks which have same version. But if we want to do healing of files which are constantly modified we need to allow writes on subvols that are undergoing heal. Data healing will mark 62nd bit while the heal is going on. When the data transaction sees that this bit is set it needs to perform the fop on that subvol irrespective of whether the versions match or do not match. Fop is considered successful only if N-R non-healing bricks succeed. Change-Id: I69a17582df397aaf6e8ca4b5e746c7ca802cbbde BUG: 1215265 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/10372 Tested-by: NetBSD Build System Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r--xlators/cluster/ec/src/ec-common.c60
-rw-r--r--xlators/cluster/ec/src/ec-data.h3
-rw-r--r--xlators/cluster/ec/src/ec-generic.c154
3 files changed, 88 insertions, 129 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 7354277da1b..a4bd8dafe28 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -333,6 +333,7 @@ void ec_complete(ec_fop_data_t * fop)
{
ec_cbk_data_t * cbk = NULL;
int32_t resume = 0, update = 0;
+ int healing_count = 0;
LOCK(&fop->lock);
@@ -342,11 +343,15 @@ void ec_complete(ec_fop_data_t * fop)
if (fop->answer == NULL) {
if (!list_empty(&fop->cbk_list)) {
cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
- if ((cbk->count >= fop->minimum) &&
- ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN))) {
- fop->answer = cbk;
-
- update = 1;
+ healing_count = ec_bits_count (cbk->mask & fop->healing);
+ if ((cbk->count - healing_count) >= fop->minimum) {
+ /* fop shouldn't be treated as success if it is not
+ * successful on at least fop->minimum good copies*/
+ if ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN)) {
+ fop->answer = cbk;
+
+ update = 1;
+ }
}
}
@@ -431,6 +436,8 @@ int32_t ec_child_select(ec_fop_data_t * fop)
}
ec->idx = first;
+ /*Unconditionally wind on healing subvolumes*/
+ fop->mask |= fop->healing;
fop->remaining = fop->mask;
ec_trace("SELECT", fop, "");
@@ -560,17 +567,17 @@ void ec_dispatch_inc(ec_fop_data_t * fop)
}
}
-void ec_dispatch_all(ec_fop_data_t * fop)
+void
+ec_dispatch_all (ec_fop_data_t *fop)
{
- ec_dispatch_start(fop);
+ ec_dispatch_start(fop);
- if (ec_child_select(fop))
- {
- fop->expected = ec_bits_count(fop->remaining);
- fop->first = 0;
+ if (ec_child_select(fop)) {
+ fop->expected = ec_bits_count(fop->remaining);
+ fop->first = 0;
- ec_dispatch_mask(fop, fop->remaining);
- }
+ ec_dispatch_mask(fop, fop->remaining);
+ }
}
void ec_dispatch_min(ec_fop_data_t * fop)
@@ -1052,9 +1059,27 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,
return 0;
}
-int32_t ec_prepare_update_cbk(call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+gf_boolean_t
+ec_is_data_fop (glusterfs_fop_t fop)
+{
+ switch (fop) {
+ case GF_FOP_WRITE:
+ case GF_FOP_TRUNCATE:
+ case GF_FOP_FTRUNCATE:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
+ case GF_FOP_ZEROFILL:
+ return _gf_true;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
+}
+
+int32_t
+ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
{
ec_fop_data_t *fop = cookie, *parent;
ec_lock_t *lock = NULL;
@@ -1091,6 +1116,9 @@ int32_t ec_prepare_update_cbk(call_frame_t *frame, void *cookie,
UNLOCK(&lock->loc.inode->lock);
fop->parent->mask &= fop->good;
+ /*As of now only data healing marks bricks as healing*/
+ if (ec_is_data_fop (fop->parent->id))
+ fop->parent->healing |= fop->healing;
fop->parent->pre_size = fop->parent->post_size = lock->size;
fop->parent->have_size = 1;
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index ac59a6b2e14..80936aeaada 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -192,6 +192,9 @@ struct _ec_fop_data
uint32_t flags;
uint32_t first;
uintptr_t mask;
+ uintptr_t healing; /*Dispatch is done but call is successful only
+ if fop->minimum number of subvolumes succeed
+ which are not healing*/
uintptr_t remaining;
uintptr_t good;
uintptr_t bad;
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 73df0d89db8..f80770a3365 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -16,7 +16,9 @@
#include "ec-combine.h"
#include "ec-method.h"
#include "ec-fops.h"
+#include "byte-order.h"
+#define EC_SELFHEAL_BIT 62
/* FOP: flush */
int32_t ec_flush_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
@@ -1308,69 +1310,57 @@ int32_t ec_combine_xattrop(ec_fop_data_t * fop, ec_cbk_data_t * dst,
return 1;
}
-int32_t ec_xattrop_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, dict_t * xattr,
- dict_t * xdata)
+int32_t
+ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
{
- ec_fop_data_t * fop = NULL;
- ec_cbk_data_t * cbk = NULL;
- int32_t idx = (int32_t)(uintptr_t)cookie;
+ ec_fop_data_t *fop = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ int32_t idx = (int32_t)(uintptr_t)cookie;
+ uint64_t version = 0;
+ uint64_t *version_xattr = 0;
- VALIDATE_OR_GOTO(this, out);
- GF_VALIDATE_OR_GOTO(this->name, frame, out);
- GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ VALIDATE_OR_GOTO (this, out);
+ GF_VALIDATE_OR_GOTO (this->name, frame, out);
+ GF_VALIDATE_OR_GOTO (this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
- fop = frame->local;
+ fop = frame->local;
- ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
- frame, op_ret, op_errno);
+ ec_trace ("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
+ frame, op_ret, op_errno);
- cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_XATTROP, idx, op_ret,
- op_errno);
- if (cbk != NULL)
- {
- if (op_ret >= 0)
- {
- if (xattr != NULL)
- {
- cbk->dict = dict_ref(xattr);
- if (cbk->dict == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
+ cbk = ec_cbk_data_allocate (frame, this, fop, fop->id, idx, op_ret,
+ op_errno);
+ if (!cbk)
+ goto out;
- goto out;
- }
- }
- }
- if (xdata != NULL)
- {
- uint64_t dirty;
+ if (op_ret >= 0) {
+ uint64_t dirty;
+ cbk->dict = dict_ref (xattr);
- cbk->xdata = dict_ref(xdata);
- if (cbk->xdata == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
+ if (dict_get_bin (xattr, EC_XATTR_VERSION,
+ (void **)&version_xattr) == 0) {
+ version = ntoh64(version_xattr[0]);
+ if ((version >> EC_SELFHEAL_BIT) & 1)
+ fop->healing |= (1ULL<<idx);
+ }
- goto out;
- }
- if (ec_dict_del_number(cbk->xdata, EC_XATTR_DIRTY, &dirty) == 0) {
- cbk->dirty = dirty != 0;
- }
+ if (ec_dict_del_number (xattr, EC_XATTR_DIRTY, &dirty) == 0)
+ cbk->dirty = dirty != 0;
}
- ec_combine(cbk, ec_combine_xattrop);
- }
+ if (xdata)
+ cbk->xdata = dict_ref(xdata);
+
+ ec_combine (cbk, ec_combine_xattrop);
out:
- if (fop != NULL)
- {
- ec_complete(fop);
- }
+ if (fop)
+ ec_complete(fop);
- return 0;
+ return 0;
}
void ec_wind_xattrop(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
@@ -1576,73 +1566,11 @@ out:
}
}
-/* FOP: fxattrop */
-
-int32_t ec_fxattrop_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
- int32_t op_ret, int32_t op_errno, dict_t * xattr,
- dict_t * xdata)
-{
- ec_fop_data_t * fop = NULL;
- ec_cbk_data_t * cbk = NULL;
- int32_t idx = (int32_t)(uintptr_t)cookie;
-
- VALIDATE_OR_GOTO(this, out);
- GF_VALIDATE_OR_GOTO(this->name, frame, out);
- GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
- GF_VALIDATE_OR_GOTO(this->name, this->private, out);
-
- fop = frame->local;
-
- ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
- frame, op_ret, op_errno);
-
- cbk = ec_cbk_data_allocate(frame, this, fop, GF_FOP_FXATTROP, idx, op_ret,
- op_errno);
- if (cbk != NULL)
- {
- if (op_ret >= 0)
- {
- if (xattr != NULL)
- {
- cbk->dict = dict_ref(xattr);
- if (cbk->dict == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
-
- goto out;
- }
- }
- }
- if (xdata != NULL)
- {
- cbk->xdata = dict_ref(xdata);
- if (cbk->xdata == NULL)
- {
- gf_log(this->name, GF_LOG_ERROR, "Failed to reference a "
- "dictionary.");
-
- goto out;
- }
- }
-
- ec_combine(cbk, ec_combine_xattrop);
- }
-
-out:
- if (fop != NULL)
- {
- ec_complete(fop);
- }
-
- return 0;
-}
-
void ec_wind_fxattrop(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
{
ec_trace("WIND", fop, "idx=%d", idx);
- STACK_WIND_COOKIE(fop->frame, ec_fxattrop_cbk, (void *)(uintptr_t)idx,
+ STACK_WIND_COOKIE(fop->frame, ec_xattrop_cbk, (void *)(uintptr_t)idx,
ec->xl_list[idx], ec->xl_list[idx]->fops->fxattrop,
fop->fd, fop->xattrop_flags, fop->dict, fop->xdata);
}