summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2015-04-26 14:28:00 +0530
committerVijay Bellur <vbellur@redhat.com>2015-05-08 05:56:11 -0700
commit02f9835d24aa07bd4e9fcb39cb7ace343f31924f (patch)
tree43ff543742f500f24237a84330cb321deca4c288 /xlators/cluster
parentbf8250bcca7f484269f64b6a73f9330d843b320b (diff)
cluster/ec: Change meaning of trusted.ec.dirty
- With this change, the xattr indicates whether the file needs to be healed. It has separate values for data/entry changes and for metadata changes.
- Fixed inode ref leaks and dict_set_dynstr-related leaks.
- Added support for choosing trylock or lock in data heal, depending on whether or not the heal comes from heal-cmd execution.
- Made fixes to pass regression runs.

Change-Id: I9d8def4c2badde18a76b7898816fecfac113737a
BUG: 1215265
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/10385
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/ec/src/ec-common.c102
-rw-r--r--xlators/cluster/ec/src/ec-data.h5
-rw-r--r--xlators/cluster/ec/src/ec-generic.c15
-rw-r--r--xlators/cluster/ec/src/ec-heal.c607
-rw-r--r--xlators/cluster/ec/src/ec-heald.c14
-rw-r--r--xlators/cluster/ec/src/ec-helpers.c14
-rw-r--r--xlators/cluster/ec/src/ec-inode-read.c6
7 files changed, 555 insertions, 208 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 5422944cfef..383c460bb32 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1078,6 +1078,23 @@ ec_is_data_fop (glusterfs_fop_t fop)
return _gf_false;
}
+gf_boolean_t
+ec_is_metadata_fop (glusterfs_fop_t fop) /* Tells whether 'fop' is one of the attribute/xattr-modifying fops listed in the switch below. */
+{
+ switch (fop) {
+ case GF_FOP_SETATTR:
+ case GF_FOP_FSETATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+ case GF_FOP_REMOVEXATTR:
+ case GF_FOP_FREMOVEXATTR:
+ return _gf_true; /* all six cases above fall through to this return */
+ default:
+ return _gf_false; /* every other fop id is not treated as a metadata fop */
+ }
+ return _gf_false; /* not reached: every path through the switch returns */
+}
+
int32_t
ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
@@ -1098,7 +1115,10 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
}
lock = parent->locks[0].lock;
- lock->is_dirty = _gf_true;
+ if (ec_is_metadata_fop (fop->parent->id))
+ lock->is_dirty[EC_METADATA_TXN] = _gf_true;
+ else
+ lock->is_dirty[EC_DATA_TXN] = _gf_true;
if (lock->loc.inode->ia_type == IA_IFREG) {
if (!ec_config_check(fop, dict) ||
@@ -1147,7 +1167,7 @@ void ec_get_size_version(ec_fop_data_t * fop)
uid_t uid;
gid_t gid;
int32_t error = ENOMEM;
- uint64_t version[EC_VERSION_SIZE] = {0, 0};
+ uint64_t allzero[EC_VERSION_SIZE] = {0, 0};
if (fop->have_size)
{
@@ -1177,10 +1197,11 @@ void ec_get_size_version(ec_fop_data_t * fop)
goto out;
}
if ((ec_dict_set_array(xdata, EC_XATTR_VERSION,
- version, EC_VERSION_SIZE) != 0) ||
+ allzero, EC_VERSION_SIZE) != 0) ||
(ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) ||
(ec_dict_set_number(xdata, EC_XATTR_CONFIG, 0) != 0) ||
- (ec_dict_set_number(xdata, EC_XATTR_DIRTY, 0) != 0))
+ (ec_dict_set_array(xdata, EC_XATTR_DIRTY, allzero,
+ EC_VERSION_SIZE) != 0))
{
goto out;
}
@@ -1244,16 +1265,19 @@ void ec_prepare_update(ec_fop_data_t *fop)
dict_t *xdata;
ec_fop_data_t *tmp;
ec_lock_t *lock;
+ ec_t *ec;
uid_t uid;
gid_t gid;
uint64_t version[2] = {0, 0};
+ uint64_t dirty[2] = {0, 0};
int32_t error = ENOMEM;
tmp = fop;
while ((tmp != NULL) && (tmp->locks[0].lock == NULL)) {
tmp = tmp->parent;
}
- if ((tmp != NULL) && tmp->locks[0].lock->is_dirty) {
+ if ((tmp != NULL) &&
+ (tmp->locks[0].lock->is_dirty[0] || tmp->locks[0].lock->is_dirty[1])) {
lock = tmp->locks[0].lock;
fop->pre_size = fop->post_size = lock->size;
@@ -1269,6 +1293,16 @@ void ec_prepare_update(ec_fop_data_t *fop)
memset(&loc, 0, sizeof(loc));
+ ec = fop->xl->private;
+ if (ec_bits_count (fop->mask) >= ec->fragments) {
+ /* The fop is treated as changing data only if it targets at least
+ * ec->fragments bricks. Otherwise it is probably a heal. */
+ if (ec_is_metadata_fop (fop->id))
+ dirty[EC_METADATA_TXN] = 1;
+ else
+ dirty[EC_DATA_TXN] = 1;
+ }
+
xdata = dict_new();
if (xdata == NULL) {
goto out;
@@ -1277,7 +1311,8 @@ void ec_prepare_update(ec_fop_data_t *fop)
version, EC_VERSION_SIZE) != 0) ||
(ec_dict_set_number(xdata, EC_XATTR_SIZE, 0) != 0) ||
(ec_dict_set_number(xdata, EC_XATTR_CONFIG, 0) != 0) ||
- (ec_dict_set_number(xdata, EC_XATTR_DIRTY, 1) != 0)) {
+ (ec_dict_set_array(xdata, EC_XATTR_DIRTY, dirty,
+ EC_VERSION_SIZE) != 0)) {
goto out;
}
@@ -1391,12 +1426,38 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
return 0;
}
-void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2],
- uint64_t size, gf_boolean_t dirty, ec_lock_t *lock)
+uint64_t
+ec_get_dirty_value (ec_t *ec, uintptr_t fop_mask, uint64_t version_delta, /* Computes the value to store for one slot of the 'dirty' xattr. */
+ gf_boolean_t dirty) /* Returns 0 (no change), 1, or (uint64_t)-1; see the branches below. */
{
+ uint64_t dirty_val = 0; /* stays 0 when version_delta is 0 or no adjustment applies */
+
+ if (version_delta) {
+ if (~fop_mask & ec->node_mask) { /* some bit set in ec->node_mask is missing from fop_mask */
+ /* fop didn't succeed on all subvols so 'dirty' xattr
+ * shouldn't be cleared */
+ if (!dirty)
+ dirty_val = 1;
+ } else {
+ /* fop succeeded on all subvols so 'dirty' xattr
+ * should be cleared */
+ if (dirty)
+ dirty_val = -1; /* wraps to UINT64_MAX in a uint64_t; NOTE(review): presumably applied as a -1 delta to the stored xattr -- confirm */
+ }
+ }
+ return dirty_val;
+}
+
+void
+ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2],
+ uint64_t size, gf_boolean_t dirty[2], ec_lock_t *lock)
+{
+ ec_t *ec = fop->xl->private;
dict_t * dict;
uid_t uid;
gid_t gid;
+ uint64_t dirty_values[2] = {0};
+ int i = 0;
if (fop->parent != NULL)
{
@@ -1425,8 +1486,15 @@ void ec_update_size_version(ec_fop_data_t *fop, loc_t *loc, uint64_t version[2],
goto out;
}
}
- if (dirty) {
- if (ec_dict_set_number(dict, EC_XATTR_DIRTY, -1) != 0) {
+
+ for (i = 0; i < sizeof (dirty_values)/sizeof (dirty_values[0]); i++) {
+ dirty_values[i] = ec_get_dirty_value (ec, fop->mask, version[i],
+ dirty[i]);
+ }
+
+ if (dirty_values[0] || dirty_values[1]) {
+ if (ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty_values,
+ EC_VERSION_SIZE) != 0) {
goto out;
}
}
@@ -1469,7 +1537,8 @@ void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock)
{
ec_trace("UNLOCK_NOW", fop, "lock=%p", lock);
- if ((lock->version_delta != 0) || lock->is_dirty) {
+ if ((lock->version_delta[0] != 0) || (lock->version_delta[1] != 0) ||
+ lock->is_dirty[0] || lock->is_dirty[1]) {
ec_update_size_version(fop, &lock->loc, lock->version_delta,
lock->size_delta, lock->is_dirty, lock);
} else {
@@ -1578,6 +1647,7 @@ void ec_flush_size_version(ec_fop_data_t * fop)
{
ec_lock_t * lock;
uint64_t version[2], delta;
+ gf_boolean_t dirty[2] = {_gf_false, _gf_false};
GF_ASSERT(fop->lock_count == 1);
@@ -1589,16 +1659,20 @@ void ec_flush_size_version(ec_fop_data_t * fop)
version[0] = lock->version_delta[0];
version[1] = lock->version_delta[1];
+ dirty[0] = lock->is_dirty[0];
+ dirty[1] = lock->is_dirty[1];
delta = lock->size_delta;
lock->version_delta[0] = 0;
lock->version_delta[1] = 0;
lock->size_delta = 0;
+ lock->is_dirty[0] = _gf_false;
+ lock->is_dirty[1] = _gf_false;
UNLOCK(&lock->loc.inode->lock);
- if (version > 0)
+ if (version[0] > 0 || version[1] > 0 || dirty[0] || dirty[1])
{
- ec_update_size_version(fop, &lock->loc, version, delta, _gf_false,
+ ec_update_size_version(fop, &lock->loc, version, delta, dirty,
NULL);
}
}
@@ -1626,7 +1700,7 @@ void ec_lock_reuse(ec_fop_data_t *fop)
if (((fop->locks_update >> i) & 1) != 0) {
if (fop->error == 0)
{
- if (fop->id == GF_FOP_SETXATTR || fop->id == GF_FOP_SETATTR) {
+ if (ec_is_metadata_fop (fop->id)) {
lock->version_delta[1]++;
} else {
lock->version_delta[0]++;
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 85037f62bb4..9e5c92dd5b8 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -145,6 +145,7 @@ struct _ec_lock
uint64_t size_delta;
uint64_t version[2];
uint64_t version_delta[2];
+ gf_boolean_t is_dirty[2];
ec_fop_data_t *owner;
loc_t loc;
union
@@ -152,7 +153,6 @@ struct _ec_lock
entrylk_type type;
struct gf_flock flock;
};
- gf_boolean_t is_dirty;
};
struct _ec_lock_link
@@ -257,7 +257,7 @@ struct _ec_cbk_data
struct gf_flock flock;
struct iovec * vector;
struct iobref * buffers;
- gf_boolean_t dirty;
+ uint64_t dirty[2];
};
struct _ec_heal
@@ -282,6 +282,7 @@ struct _ec_heal
uintptr_t fixed;
uint64_t offset;
uint64_t size;
+ uint64_t total_size;
uint64_t version[2];
uint64_t raw_size;
};
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 50169771476..d957bf6533d 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -759,7 +759,6 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
for (i = 0, ans = cbk; (ans != NULL) && (i < ec->fragments);
ans = ans->next)
{
- if (!ans->dirty) {
data = dict_get(ans->xdata, GF_CONTENT_KEY);
if (data != NULL)
{
@@ -770,7 +769,6 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)
}
i++;
}
- }
}
if (i >= ec->fragments)
@@ -878,8 +876,6 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
}
if (xdata != NULL)
{
- uint64_t dirty;
-
cbk->xdata = dict_ref(xdata);
if (cbk->xdata == NULL)
{
@@ -888,9 +884,8 @@ int32_t ec_lookup_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
goto out;
}
- if (ec_dict_del_number(cbk->xdata, EC_XATTR_DIRTY, &dirty) == 0) {
- cbk->dirty = dirty != 0;
- }
+ ec_dict_del_array (xdata, EC_XATTR_DIRTY, cbk->dirty,
+ EC_VERSION_SIZE);
}
ec_combine(cbk, ec_combine_lookup);
@@ -1341,7 +1336,6 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
if (op_ret >= 0) {
- uint64_t dirty;
cbk->dict = dict_ref (xattr);
if (dict_get_bin (xattr, EC_XATTR_VERSION,
@@ -1350,9 +1344,8 @@ ec_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if ((version >> EC_SELFHEAL_BIT) & 1)
fop->healing |= (1ULL<<idx);
}
-
- if (ec_dict_del_number (xattr, EC_XATTR_DIRTY, &dirty) == 0)
- cbk->dirty = dirty != 0;
+ ec_dict_del_array (xattr, EC_XATTR_DIRTY, cbk->dirty,
+ EC_VERSION_SIZE);
}
if (xdata)
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index b7b910502f8..315de8765ad 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -76,6 +76,11 @@ out:
return _gf_false;
}
+static gf_boolean_t
+ec_sh_key_match (dict_t *dict, char *key, data_t *val, void *mdata) /* Key-match callback: the logical negation of ec_ignorable_key_match(). */
+{
+ return !ec_ignorable_key_match (dict, key, val, mdata); /* all four arguments are forwarded unchanged */
+}
/* FOP: heal */
void ec_heal_exclude(ec_heal_t * heal, uintptr_t mask)
@@ -1058,8 +1063,15 @@ ec_heal_writev_cbk (call_frame_t *frame, void *cookie,
struct iatt *prebuf, struct iatt *postbuf,
dict_t *xdata)
{
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+
ec_trace("WRITE_CBK", cookie, "ret=%d, errno=%d", op_ret, op_errno);
+ gf_log (fop->xl->name, GF_LOG_DEBUG, "%s: write op_ret %d, op_errno %s"
+ " at %"PRIu64, uuid_utoa (heal->fd->inode->gfid), op_ret,
+ strerror (op_errno), heal->offset);
+
ec_heal_update(cookie, 0);
return 0;
@@ -1080,12 +1092,19 @@ int32_t ec_heal_readv_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
if (op_ret > 0)
{
+ gf_log (fop->xl->name, GF_LOG_DEBUG, "%s: read succeeded, proceeding "
+ "to write at %"PRIu64, uuid_utoa (heal->fd->inode->gfid),
+ heal->offset);
ec_writev(heal->fop->frame, heal->xl, heal->bad, EC_MINIMUM_ONE,
ec_heal_writev_cbk, heal, heal->fd, vector, count,
heal->offset, 0, iobref, NULL);
}
else
{
+ gf_log (fop->xl->name, GF_LOG_DEBUG, "%s: read failed %s, failing "
+ "to heal block at %"PRIu64,
+ uuid_utoa (heal->fd->inode->gfid), strerror (op_errno),
+ heal->offset);
heal->done = 1;
}
@@ -1529,8 +1548,8 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)
}
}
-void ec_heal(call_frame_t * frame, xlator_t * this, uintptr_t target,
- int32_t minimum, fop_heal_cbk_t func, void * data, loc_t * loc,
+void ec_heal2(call_frame_t *frame, xlator_t *this, uintptr_t target,
+ int32_t minimum, fop_heal_cbk_t func, void *data, loc_t *loc,
int32_t partial, dict_t *xdata)
{
ec_cbk_t callback = { .heal = func };
@@ -1647,19 +1666,15 @@ ec_char_array_to_mask (unsigned char *array, int numsubvols)
}
int
-ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
+ec_heal_entry_find_direction (ec_t *ec, default_args_cbk_t *replies,
uint64_t *versions, uint64_t *dirty,
unsigned char *sources, unsigned char *healed_sinks)
{
- void *ptr = NULL;
- uint64_t *value = NULL;
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
int source = -1;
uint64_t max_version = 0;
- int32_t len = 0;
int ret = 0;
int i = 0;
- struct iatt source_ia = {0};
- struct iatt child_ia = {0};
for (i = 0; i < ec->nodes; i++) {
if (!replies[i].valid)
@@ -1671,22 +1686,21 @@ ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
if (source == -1)
source = i;
- ret = dict_get_ptr_and_len (replies[i].xdata, EC_XATTR_VERSION,
- &ptr, &len);
+ ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_VERSION,
+ xattr, EC_VERSION_SIZE);
if (ret == 0) {
- value = ptr;
- versions[i] = ntoh64(value[type]);
+ versions[i] = xattr[EC_DATA_TXN];
if (max_version < versions[i]) {
max_version = versions[i];
source = i;
}
}
- ret = dict_get_ptr_and_len (replies[i].xdata, EC_XATTR_DIRTY,
- &ptr, &len);
+ memset (xattr, 0, sizeof(xattr));
+ ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_DIRTY,
+ xattr, EC_VERSION_SIZE);
if (ret == 0) {
- value = ptr;
- dirty[i] = ntoh64(value[type]);
+ dirty[i] = xattr[EC_DATA_TXN];
}
}
@@ -1706,29 +1720,13 @@ ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
healed_sinks[i] = 1;
}
- if (type == EC_METADATA_TXN) {
- source_ia = replies[source].stat;
- for (i = 0; i < ec->nodes; i++) {
- if (!sources[i])
- continue;
- child_ia = replies[i].stat;
- if (!IA_EQUAL(source_ia, child_ia, gfid) ||
- !IA_EQUAL(source_ia, child_ia, type) ||
- !IA_EQUAL(source_ia, child_ia, prot) ||
- !IA_EQUAL(source_ia, child_ia, uid) ||
- !IA_EQUAL(source_ia, child_ia, gid)) {
- sources[i] = 0;
- healed_sinks[i] = 1;
- }
- }
- }
out:
return source;
}
int
-ec_adjust_versions (call_frame_t *frame, ec_t *ec, ec_txn_t type, inode_t *inode, int source,
- unsigned char *sources,
+ec_adjust_versions (call_frame_t *frame, ec_t *ec, ec_txn_t type,
+ inode_t *inode, int source, unsigned char *sources,
unsigned char *healed_sinks, uint64_t *versions,
uint64_t *dirty)
{
@@ -1798,39 +1796,127 @@ out:
}
int
-__ec_heal_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode,
- unsigned char *locked_on, default_args_cbk_t *replies,
- uint64_t *versions, uint64_t *dirty, unsigned char *sources,
- unsigned char *healed_sinks, ec_txn_t type)
-{
- loc_t loc = {0};
- unsigned char *output = NULL;
- dict_t *xdata = NULL;
- int ret = 0;
- int source = 0;
+ec_heal_metadata_find_direction (ec_t *ec, default_args_cbk_t *replies,
+ uint64_t *versions, uint64_t *dirty,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
+ int same_count = 0;
+ int max_same_count = 0;
+ int same_source = -1;
+ int ret = 0;
+ int i = 0;
+ int j = 0;
+ int *groups = NULL;
+ struct iatt source_ia = {0};
+ struct iatt child_ia = {0};
- xdata = dict_new ();
- if (!xdata) {
- ret = -ENOMEM;
- goto out;
+ groups = alloca0 (ec->nodes * sizeof(*groups));
+ for (i = 0; i < ec->nodes; i++)
+ groups[i] = -1;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!replies[i].valid)
+ continue;
+ ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_VERSION,
+ xattr, EC_VERSION_SIZE);
+ if (ret == 0) {
+ versions[i] = xattr[EC_METADATA_TXN];
+ }
+
+ memset (xattr, 0, sizeof (xattr));
+ ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_DIRTY,
+ xattr, EC_VERSION_SIZE);
+ if (ret == 0) {
+ dirty[i] = xattr[EC_METADATA_TXN];
+ }
+ if (groups[i] >= 0) /*Already part of group*/
+ continue;
+ groups[i] = i;
+ same_count = 1;
+ source_ia = replies[i].stat;
+ for (j = i + 1; j < ec->nodes; j++) {
+ child_ia = replies[j].stat;
+ if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+ !IA_EQUAL(source_ia, child_ia, type) ||
+ !IA_EQUAL(source_ia, child_ia, prot) ||
+ !IA_EQUAL(source_ia, child_ia, uid) ||
+ !IA_EQUAL(source_ia, child_ia, gid))
+ continue;
+ if (!are_dicts_equal(replies[i].xdata, replies[j].xdata,
+ ec_sh_key_match, NULL))
+ continue;
+ groups[j] = i; /*If iatts match put them into a group*/
+ same_count++;
+ }
+
+ if (max_same_count < same_count) {
+ max_same_count = same_count;
+ same_source = i;
+ }
}
- if (dict_set_uint64(xdata, "list-xattr", 0)) {
- ret = -ENOMEM;
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
goto out;
}
+ for (i = 0; i < ec->nodes; i++) {
+ if (groups[i] == groups[same_source])
+ sources[i] = 1;
+ else if (replies[i].valid)
+ healed_sinks[i] = 1;
+ }
+ ret = same_source;
+out:
+ return ret;
+}
+
+int
+__ec_heal_metadata_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *locked_on, default_args_cbk_t *replies,
+ uint64_t *versions, uint64_t *dirty, unsigned char *sources,
+ unsigned char *healed_sinks)
+{
+ loc_t loc = {0};
+ unsigned char *output = NULL;
+ unsigned char *lookup_on = NULL;
+ int ret = 0;
+ int source = 0;
+ default_args_cbk_t *greplies = NULL;
+ int i = 0;
+
+ EC_REPLIES_ALLOC (greplies, ec->nodes);
+
loc.inode = inode_ref (inode);
gf_uuid_copy (loc.gfid, inode->gfid);
output = alloca0 (ec->nodes);
+ lookup_on = alloca0 (ec->nodes);
ret = cluster_lookup (ec->xl_list, locked_on, ec->nodes, replies,
- output, frame, ec->xl, &loc, xdata);
+ output, frame, ec->xl, &loc, NULL);
if (ret <= ec->fragments) {
ret = -ENOTCONN;
goto out;
}
- source = ec_heal_find_direction (ec, type, replies, versions,
+ memcpy (lookup_on, output, ec->nodes);
+ /* Use getxattr to fetch the xattrs with the internal xattrs filtered out */
+ ret = cluster_getxattr (ec->xl_list, lookup_on, ec->nodes, greplies,
+ output, frame, ec->xl, &loc, NULL, NULL);
+ for (i = 0; i < ec->nodes; i++) {
+ if (lookup_on[i] && !output[i]) {
+ replies[i].valid = 0;
+ continue;
+ }
+ if (replies[i].xdata) {
+ dict_unref (replies[i].xdata);
+ replies[i].xdata = NULL;
+ if (greplies[i].xattr)
+ replies[i].xdata = dict_ref (greplies[i].xattr);
+ }
+ }
+
+ source = ec_heal_metadata_find_direction (ec, replies, versions,
dirty, sources, healed_sinks);
if (source < 0) {
ret = -EIO;
@@ -1838,9 +1924,7 @@ __ec_heal_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode,
}
ret = source;
out:
- if (xdata)
- dict_unref (xdata);
-
+ cluster_replies_wipe (greplies, ec->nodes);
loc_wipe (&loc);
return ret;
}
@@ -1864,14 +1948,14 @@ __ec_removexattr_sinks (call_frame_t *frame, ec_t *ec, inode_t *inode,
continue;
if (!sources[i] && !healed_sinks[i])
continue;
- ret = dict_foreach (replies[i].xattr, ec_heal_xattr_clean,
- replies[source].xattr);
+ ret = dict_foreach (replies[i].xdata, ec_heal_xattr_clean,
+ replies[source].xdata);
if (ret < 0) {
sources[i] = 0;
healed_sinks[i] = 0;
}
- if (replies[i].xattr->count == 0) {
+ if (replies[i].xdata->count == 0) {
continue;
} else if (sources[i]) {
/* This can happen if setxattr/removexattr succeeds on
@@ -1883,7 +1967,7 @@ __ec_removexattr_sinks (call_frame_t *frame, ec_t *ec, inode_t *inode,
}
ret = syncop_removexattr (ec->xl_list[i], &loc, "",
- replies[i].xattr, NULL);
+ replies[i].xdata, NULL);
if (ret < 0)
healed_sinks[i] = 0;
}
@@ -1896,39 +1980,46 @@ __ec_removexattr_sinks (call_frame_t *frame, ec_t *ec, inode_t *inode,
int
__ec_heal_metadata (call_frame_t *frame, ec_t *ec, inode_t *inode,
- unsigned char *locked_on)
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *healed_sinks)
{
loc_t loc = {0};
int ret = 0;
int source = 0;
default_args_cbk_t *replies = NULL;
+ default_args_cbk_t *sreplies = NULL;
uint64_t *versions = NULL;
uint64_t *dirty = NULL;
- unsigned char *sources = NULL;
- unsigned char *healed_sinks = NULL;
unsigned char *output = NULL;
dict_t *source_dict = NULL;
struct iatt source_buf = {0};
EC_REPLIES_ALLOC (replies, ec->nodes);
+ EC_REPLIES_ALLOC (sreplies, ec->nodes);
loc.inode = inode_ref (inode);
gf_uuid_copy (loc.gfid, inode->gfid);
output = alloca0 (ec->nodes);
versions = alloca0 (ec->nodes * sizeof (*versions));
dirty = alloca0 (ec->nodes * sizeof (*dirty));
- sources = alloca0 (ec->nodes);
- healed_sinks = alloca0 (ec->nodes);
- source = __ec_heal_prepare (frame, ec, inode, locked_on, replies,
- versions, dirty, sources, healed_sinks,
- EC_METADATA_TXN);
+ source = __ec_heal_metadata_prepare (frame, ec, inode, locked_on, replies,
+ versions, dirty, sources, healed_sinks);
if (source < 0) {
ret = -EIO;
goto out;
}
+ if (EC_COUNT (sources, ec->nodes) == ec->nodes) {
+ ret = 0;
+ goto erase_dirty;
+ }
+
+ if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
source_buf = replies[source].stat;
- ret = cluster_setattr (ec->xl_list, healed_sinks, ec->nodes, replies,
+ ret = cluster_setattr (ec->xl_list, healed_sinks, ec->nodes, sreplies,
output, frame, ec->xl, &loc,
&source_buf, GF_SET_ATTR_MODE |
GF_SET_ATTR_UID | GF_SET_ATTR_GID, NULL);
@@ -1939,22 +2030,12 @@ __ec_heal_metadata (call_frame_t *frame, ec_t *ec, inode_t *inode,
goto out;
}
- ret = cluster_getxattr (ec->xl_list, locked_on, ec->nodes, replies,
- output, frame, ec->xl, &loc, NULL, NULL);
- EC_INTERSECT (sources, sources, output, ec->nodes);
- EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
- EC_ADJUST_SOURCE (source, sources, ec->nodes);
- if ((EC_COUNT (healed_sinks, ec->nodes) == 0) || (source < 0)) {
- ret = -ENOTCONN;
- goto out;
- }
-
ret = __ec_removexattr_sinks (frame, ec, inode, source, sources,
healed_sinks, replies);
if (ret < 0)
goto out;
- source_dict = dict_ref (replies[source].xattr);
+ source_dict = dict_ref (replies[source].xdata);
if (dict_foreach_match (source_dict, ec_ignorable_key_match, NULL,
dict_remove_foreach_fn, NULL) == -1) {
ret = -ENOMEM;
@@ -1971,6 +2052,7 @@ __ec_heal_metadata (call_frame_t *frame, ec_t *ec, inode_t *inode,
goto out;
}
+erase_dirty:
ret = ec_adjust_versions (frame, ec, EC_METADATA_TXN, inode, source,
sources, healed_sinks, versions, dirty);
out:
@@ -1979,29 +2061,21 @@ out:
loc_wipe (&loc);
cluster_replies_wipe (replies, ec->nodes);
+ cluster_replies_wipe (sreplies, ec->nodes);
return ret;
}
int
-ec_heal_metadata (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+ec_heal_metadata (call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks)
{
unsigned char *locked_on = NULL;
unsigned char *up_subvols = NULL;
unsigned char *output = NULL;
int ret = 0;
default_args_cbk_t *replies = NULL;
- call_frame_t *frame = NULL;
EC_REPLIES_ALLOC (replies, ec->nodes);
- frame = copy_frame (req_frame);
- if (!frame) {
- ret = -ENOMEM;
- goto out;
- }
-
- /*Do heal as root*/
- frame->root->uid = 0;
- frame->root->gid = 0;
locked_on = alloca0(ec->nodes);
output = alloca0(ec->nodes);
up_subvols = alloca0(ec->nodes);
@@ -2017,15 +2091,13 @@ ec_heal_metadata (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
ret = -ENOTCONN;
goto unlock;
}
- ret = __ec_heal_metadata (frame, ec, inode, locked_on);
+ ret = __ec_heal_metadata (frame, ec, inode, locked_on, sources,
+ healed_sinks);
}
unlock:
cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
frame, ec->xl, ec->xl->name, inode, 0, 0);
-out:
cluster_replies_wipe (replies, ec->nodes);
- if (frame)
- STACK_DESTROY (frame->root);
return ret;
}
@@ -2036,24 +2108,47 @@ __ec_heal_entry_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode,
uint64_t *dirty, unsigned char *sources,
unsigned char *healed_sinks)
{
- int source = 0;
- default_args_cbk_t *replies = NULL;
loc_t loc = {0};
+ int source = 0;
int ret = 0;
+ default_args_cbk_t *replies = NULL;
+ unsigned char *output = NULL;
+ dict_t *xdata = NULL;
EC_REPLIES_ALLOC (replies, ec->nodes);
loc.inode = inode_ref (inode);
gf_uuid_copy (loc.gfid, inode->gfid);
- source = __ec_heal_prepare (frame, ec, inode, locked_on, replies,
- versions, dirty, sources, healed_sinks,
- EC_DATA_TXN);
+ xdata = dict_new ();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (dict_set_uint64(xdata, EC_XATTR_VERSION, 0) ||
+ dict_set_uint64(xdata, EC_XATTR_DIRTY, 0)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ output = alloca0 (ec->nodes);
+ ret = cluster_lookup (ec->xl_list, locked_on, ec->nodes, replies,
+ output, frame, ec->xl, &loc, xdata);
+ if (ret <= ec->fragments) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ source = ec_heal_entry_find_direction (ec, replies, versions,
+ dirty, sources, healed_sinks);
if (source < 0) {
ret = -EIO;
goto out;
}
ret = source;
out:
+ if (xdata)
+ dict_unref (xdata);
loc_wipe (&loc);
cluster_replies_wipe (replies, ec->nodes);
return ret;
@@ -2156,6 +2251,11 @@ ec_delete_stale_name (dict_t *gfid_db, char *key, data_t *d, void *data)
/*This will help in making decisions about creating names*/
dict_del (gfid_db, key);
out:
+ if (ret < 0) {
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s/%s: heal failed %s",
+ uuid_utoa (name_data->parent->gfid), name_data->name,
+ strerror (-ret));
+ }
cluster_replies_wipe (replies, ec->nodes);
loc_wipe (&loc);
return ret;
@@ -2320,9 +2420,12 @@ ec_create_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
ret = 0;
out:
+ if (ret < 0)
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s/%s: heal failed %s",
+ uuid_utoa (parent->gfid), name, strerror (-ret));
+ cluster_replies_wipe (replies, ec->nodes);
loc_wipe (&loc);
loc_wipe (&srcloc);
- EC_REPLIES_ALLOC (replies, ec->nodes);
if (xdata)
dict_unref (xdata);
return ret;
@@ -2345,6 +2448,7 @@ __ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
unsigned char *same = NULL;
unsigned char *gfidless = NULL;
+ EC_REPLIES_ALLOC (replies, ec->nodes);
loc.parent = inode_ref (parent);
loc.inode = inode_new (parent->table);
gf_uuid_copy (loc.pargfid, parent->gfid);
@@ -2365,7 +2469,6 @@ __ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
output = alloca0 (ec->nodes);
gfidless = alloca0 (ec->nodes);
enoent = alloca0 (ec->nodes);
- EC_REPLIES_ALLOC (replies, ec->nodes);
ret = cluster_lookup (ec->xl_list, participants, ec->nodes, replies,
output, frame, ec->xl, &loc, NULL);
for (i = 0; i < ec->nodes; i++) {
@@ -2464,9 +2567,10 @@ ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
NULL);
{
if (ret <= ec->fragments) {
- gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
- "as only %d number of subvolumes could "
- "be locked", uuid_utoa (parent->gfid), ret);
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s/%s: Skipping "
+ "heal as only %d number of subvolumes could "
+ "be locked", uuid_utoa (parent->gfid), name,
+ ret);
ret = -ENOTCONN;
goto unlock;
}
@@ -2534,19 +2638,19 @@ ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
if (EC_COUNT (participants, ec->nodes) <= ec->fragments)
return -ENOTCONN;
}
+ loc_wipe (&loc);
return 0;
}
int
__ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
- unsigned char *heal_on)
+ unsigned char *heal_on, unsigned char *sources,
+ unsigned char *healed_sinks)
{
unsigned char *locked_on = NULL;
unsigned char *output = NULL;
uint64_t *versions = NULL;
uint64_t *dirty = NULL;
- unsigned char *sources = NULL;
- unsigned char *healed_sinks = NULL;
unsigned char *participants = NULL;
default_args_cbk_t *replies = NULL;
int ret = 0;
@@ -2557,8 +2661,6 @@ __ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
output = alloca0(ec->nodes);
versions = alloca0 (ec->nodes * sizeof (*versions));
dirty = alloca0 (ec->nodes * sizeof (*dirty));
- sources = alloca0 (ec->nodes);
- healed_sinks = alloca0 (ec->nodes);
EC_REPLIES_ALLOC (replies, ec->nodes);
ret = cluster_entrylk (ec->xl_list, heal_on, ec->nodes, replies,
@@ -2608,7 +2710,8 @@ out:
}
int
-ec_heal_entry (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks)
{
unsigned char *locked_on = NULL;
unsigned char *up_subvols = NULL;
@@ -2616,21 +2719,12 @@ ec_heal_entry (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
char selfheal_domain[1024] = {0};
int ret = 0;
default_args_cbk_t *replies = NULL;
- call_frame_t *frame = NULL;
EC_REPLIES_ALLOC (replies, ec->nodes);
locked_on = alloca0(ec->nodes);
output = alloca0(ec->nodes);
up_subvols = alloca0(ec->nodes);
- frame = copy_frame (req_frame);
- if (!frame) {
- ret = -ENOMEM;
- goto out;
- }
- /*Do heal as root*/
- frame->root->uid = 0;
- frame->root->gid = 0;
sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);
ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
/*If other processes are already doing the heal, don't block*/
@@ -2645,15 +2739,13 @@ ec_heal_entry (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
ret = -ENOTCONN;
goto unlock;
}
- ret = __ec_heal_entry (frame, ec, inode, locked_on);
+ ret = __ec_heal_entry (frame, ec, inode, locked_on,
+ sources, healed_sinks);
}
unlock:
cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output,
frame, ec->xl, selfheal_domain, inode, NULL);
-out:
cluster_replies_wipe (replies, ec->nodes);
- if (frame)
- STACK_DESTROY (frame->root);
return ret;
}
@@ -2664,12 +2756,10 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
uint64_t *size, unsigned char *sources,
unsigned char *healed_sinks)
{
+ uint64_t xattr[EC_VERSION_SIZE] = {0};
char version_size[64] = {0};
- uint64_t *value = NULL;
dict_t *version_size_db = NULL;
unsigned char *same = NULL;
- void *ptr = NULL;
- int len = 0;
int max_same_count = 0;
int source = 0;
int i = 0;
@@ -2686,25 +2776,20 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
continue;
if (replies[i].op_ret < 0)
continue;
- ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_VERSION,
- &ptr, &len);
+ ret = ec_dict_del_array (replies[i].xattr, EC_XATTR_VERSION,
+ xattr, EC_VERSION_SIZE);
if (ret == 0) {
- value = ptr;
- versions[i] = ntoh64(value[EC_DATA_TXN]);
+ versions[i] = xattr[EC_DATA_TXN];
}
- ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_DIRTY,
- &ptr, &len);
- if (ret == 0) {
- value = ptr;
- dirty[i] = ntoh64(value[EC_DATA_TXN]);
- }
- ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_SIZE,
- &ptr, &len);
+ memset (xattr, 0, sizeof (xattr));
+ ret = ec_dict_del_array (replies[i].xattr, EC_XATTR_DIRTY,
+ xattr, EC_VERSION_SIZE);
if (ret == 0) {
- value = ptr;
- size[i] = ntoh64(*value);
+ dirty[i] = xattr[EC_DATA_TXN];
}
+ ret = ec_dict_del_number (replies[i].xattr, EC_XATTR_SIZE,
+ &size[i]);
/*Build a db of same version, size*/
snprintf (version_size, sizeof (version_size),
"%"PRIu64"-%"PRIu64, versions[i], size[i]);
@@ -2749,10 +2834,7 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
healed_sinks[i] = 1;
}
}
- if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
- ret = -ENOTCONN;
- goto out;
- }
+
ret = source;
out:
if (version_size_db)
@@ -2812,8 +2894,7 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,
output, frame, ec->xl, fd, NULL);
EC_INTERSECT (sources, sources, output, ec->nodes);
EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
- if ((EC_COUNT (sources, ec->nodes) < ec->fragments) ||
- (EC_COUNT (healed_sinks, ec->nodes) == 0)) {
+ if (EC_COUNT (sources, ec->nodes) < ec->fragments) {
ret = -ENOTCONN;
goto out;
}
@@ -2826,6 +2907,7 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,
sources[i] = 0;
healed_sinks[i] = 1;
} else if (stbuf) {
+ source = i;
*stbuf = replies[i].stat;
}
}
@@ -2841,11 +2923,24 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,
goto out;
}
+ if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
+ ret = -ENOTCONN;
+ goto out;
+ }
ret = source;
out:
if (xattrs)
dict_unref (xattrs);
cluster_replies_wipe (replies, ec->nodes);
+ if (ret < 0) {
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: heal failed %s",
+ uuid_utoa (fd->inode->gfid), strerror (-ret));
+ } else {
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: sources: %d, sinks: "
+ "%d", uuid_utoa (fd->inode->gfid),
+ EC_COUNT (sources, ec->nodes),
+ EC_COUNT (healed_sinks, ec->nodes));
+ }
return ret;
}
@@ -2910,6 +3005,9 @@ out:
cluster_replies_wipe (replies, ec->nodes);
if (xattrs)
dict_unref (xattrs);
+ if (ret < 0)
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: heal failed %s",
+ uuid_utoa (fd->inode->gfid), strerror (-ret));
return ret;
}
@@ -2928,6 +3026,8 @@ ec_manager_heal_block (ec_fop_data_t *fop, int32_t state)
return EC_STATE_HEAL_DATA_COPY;
case EC_STATE_HEAL_DATA_COPY:
+ gf_log (fop->xl->name, GF_LOG_DEBUG, "%s: read/write starting",
+ uuid_utoa (heal->fd->inode->gfid));
ec_heal_data_block (heal);
return EC_STATE_HEAL_DATA_UNLOCK;
@@ -2986,6 +3086,8 @@ ec_heal_block (call_frame_t *frame, xlator_t *this, uintptr_t target,
if (fop == NULL)
goto out;
+ fop->pre_size = fop->post_size = heal->total_size;
+ fop->have_size = 1;
error = 0;
out:
@@ -3039,6 +3141,7 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
heal->data = &barrier;
syncbarrier_init (heal->data);
pool = ec->xl->ctx->iobuf_pool;
+ heal->total_size = size;
heal->size = iobpool_default_pagesize (pool);
heal->bad = ec_char_array_to_mask (healed_sinks, ec->nodes);
heal->good = ec_char_array_to_mask (sources, ec->nodes);
@@ -3047,6 +3150,12 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
for (heal->offset = 0; (heal->offset < size) && !heal->done;
heal->offset += heal->size) {
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: sources: %d, sinks: "
+ "%d, offset: %"PRIu64" bsize: %"PRIu64,
+ uuid_utoa (fd->inode->gfid),
+ EC_COUNT (sources, ec->nodes),
+ EC_COUNT (healed_sinks, ec->nodes), heal->offset,
+ heal->size);
ret = ec_sync_heal_block (frame, ec->xl, heal);
if (ret < 0)
break;
@@ -3055,6 +3164,9 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
fd_unref (heal->fd);
LOCK_DESTROY (&heal->lock);
syncbarrier_destroy (heal->data);
+ if (ret < 0)
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: heal failed %s",
+ uuid_utoa (fd->inode->gfid), strerror (-ret));
return ret;
}
@@ -3089,6 +3201,9 @@ __ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
out:
cluster_replies_wipe (replies, ec->nodes);
+ if (ret < 0)
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: heal failed %s",
+ uuid_utoa (fd->inode->gfid), strerror (-ret));
return ret;
}
@@ -3281,15 +3396,14 @@ unlock:
}
int
-__ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on)
+__ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
+ unsigned char *sources, unsigned char *healed_sinks)
{
unsigned char *locked_on = NULL;
unsigned char *output = NULL;
uint64_t *versions = NULL;
uint64_t *dirty = NULL;
uint64_t *size = NULL;
- unsigned char *sources = NULL;
- unsigned char *healed_sinks = NULL;
unsigned char *trim = NULL;
default_args_cbk_t *replies = NULL;
int ret = 0;
@@ -3297,8 +3411,6 @@ __ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on)
locked_on = alloca0(ec->nodes);
output = alloca0(ec->nodes);
- sources = alloca0 (ec->nodes);
- healed_sinks = alloca0 (ec->nodes);
trim = alloca0 (ec->nodes);
versions = alloca0 (ec->nodes * sizeof (*versions));
dirty = alloca0 (ec->nodes * sizeof (*dirty));
@@ -3337,6 +3449,11 @@ unlock:
if (ret < 0)
goto out;
+ gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: sources: %d, sinks: "
+ "%d", uuid_utoa (fd->inode->gfid),
+ EC_COUNT (sources, ec->nodes),
+ EC_COUNT (healed_sinks, ec->nodes));
+
ret = ec_rebuild_data (frame, ec, fd, size[source], sources,
healed_sinks);
if (ret < 0)
@@ -3351,13 +3468,13 @@ out:
}
int
-ec_heal_data2 (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+ec_heal_data (call_frame_t *frame, ec_t *ec, gf_boolean_t block, inode_t *inode,
+ unsigned char *sources, unsigned char *healed_sinks)
{
unsigned char *locked_on = NULL;
unsigned char *up_subvols = NULL;
unsigned char *output = NULL;
default_args_cbk_t *replies = NULL;
- call_frame_t *frame = NULL;
fd_t *fd = NULL;
loc_t loc = {0};
char selfheal_domain[1024] = {0};
@@ -3368,7 +3485,7 @@ ec_heal_data2 (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
locked_on = alloca0(ec->nodes);
output = alloca0(ec->nodes);
up_subvols = alloca0(ec->nodes);
- loc. inode = inode_ref (inode);
+ loc.inode = inode_ref (inode);
gf_uuid_copy (loc.gfid, inode->gfid);
fd = fd_create (inode, 0);
@@ -3378,14 +3495,6 @@ ec_heal_data2 (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
}
ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
- frame = copy_frame (req_frame);
- if (!frame) {
- ret = -ENOMEM;
- goto out;
- }
- /*Do heal as root*/
- frame->root->uid = 0;
- frame->root->gid = 0;
ret = cluster_open (ec->xl_list, up_subvols, ec->nodes, replies, output,
frame, ec->xl, &loc, O_RDWR|O_LARGEFILE, fd, NULL);
@@ -3397,9 +3506,15 @@ ec_heal_data2 (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
fd_bind (fd);
sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);
/*If other processes are already doing the heal, don't block*/
- ret = cluster_tryinodelk (ec->xl_list, output, ec->nodes, replies,
- locked_on, frame, ec->xl, selfheal_domain, inode,
- 0, 0);
+ if (block) {
+ ret = cluster_inodelk (ec->xl_list, output, ec->nodes, replies,
+ locked_on, frame, ec->xl,
+ selfheal_domain, inode, 0, 0);
+ } else {
+ ret = cluster_tryinodelk (ec->xl_list, output, ec->nodes,
+ replies, locked_on, frame, ec->xl,
+ selfheal_domain, inode, 0, 0);
+ }
{
if (ret <= ec->fragments) {
gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
@@ -3408,7 +3523,8 @@ ec_heal_data2 (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
ret = -ENOTCONN;
goto unlock;
}
- ret = __ec_heal_data (frame, ec, fd, locked_on);
+ ret = __ec_heal_data (frame, ec, fd, locked_on, sources,
+ healed_sinks);
}
unlock:
cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
@@ -3418,7 +3534,162 @@ out:
fd_unref (fd);
loc_wipe (&loc);
cluster_replies_wipe (replies, ec->nodes);
- if (frame)
- STACK_DESTROY (frame->root);
return ret;
}
+
+/*
+ * ec_heal_do - run name, metadata and data/entry heal for one object.
+ *
+ * @this:    the ec xlator; this->private is used as the ec_t.
+ * @data:    the heal ec_fop_data_t; its req_frame and cbks.heal are read.
+ * @loc:     object to heal. loc->name/loc->parent drive the name heal,
+ *           loc->inode drives the metadata and data/entry heals.
+ * @partial: when non-zero, entry heal is not attempted for directories.
+ *
+ * A private frame is created and the heal steps are issued on it with
+ * uid/gid 0. If fop->cbks.heal is set it is invoked exactly once before
+ * returning, including when the frame could not be created, so a requester
+ * waiting on the heal result is always answered.
+ */
+void
+ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial)
+{
+        call_frame_t *frame = NULL;
+        unsigned char *participants = NULL;
+        unsigned char *msources = NULL;
+        unsigned char *mhealed_sinks = NULL;
+        unsigned char *sources = NULL;
+        unsigned char *healed_sinks = NULL;
+        ec_t *ec = NULL;
+        int ret = 0;
+        int op_ret = 0;
+        int op_errno = 0;
+        intptr_t mgood = 0;
+        intptr_t mbad = 0;
+        intptr_t good = 0;
+        intptr_t bad = 0;
+        ec_fop_data_t *fop = data;
+        gf_boolean_t blocking = _gf_false;
+
+        ec = this->private;
+
+        /* If it is heal request from getxattr, complete the heal and then
+         * unwind, if it is ec_heal with NULL as frame then no need to block
+         * the heal as the caller doesn't care about its completion*/
+        if (fop->req_frame)
+                blocking = _gf_true;
+
+        /* Built before the frame is created so that the callback at 'out'
+         * always has a valid array to convert, even on early failure. */
+        participants = alloca0(ec->nodes);
+        ec_mask_to_char_array (ec->xl_up, participants, ec->nodes);
+
+        frame = create_frame (this, this->ctx->pool);
+        if (!frame) {
+                /* Do not return silently: the requester must still be
+                 * told that the heal did not happen. */
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        ec_owner_set(frame, frame->root);
+        /*Do heal as root*/
+        frame->root->uid = 0;
+        frame->root->gid = 0;
+        if (loc->name && strlen (loc->name)) {
+                ret = ec_heal_name (frame, ec, loc->parent, (char *)loc->name,
+                                    participants);
+                if (ret == 0) {
+                        gf_log (this->name, GF_LOG_INFO, "%s: name heal "
+                                "successful on %lX", loc->path,
+                                ec_char_array_to_mask (participants, ec->nodes));
+                } else {
+                        gf_log (this->name, GF_LOG_INFO, "%s: name heal "
+                                "failed on %s", loc->path, strerror (-ret));
+                }
+        }
+
+        msources = alloca0(ec->nodes);
+        mhealed_sinks = alloca0(ec->nodes);
+        ret = ec_heal_metadata (frame, ec, loc->inode, msources, mhealed_sinks);
+        if (ret == 0) {
+                mgood = ec_char_array_to_mask (msources, ec->nodes);
+                mbad = ec_char_array_to_mask (mhealed_sinks, ec->nodes);
+        } else {
+                op_ret = -1;
+                op_errno = -ret;
+        }
+        sources = alloca0(ec->nodes);
+        healed_sinks = alloca0(ec->nodes);
+        if (IA_ISREG (loc->inode->ia_type)) {
+                ret = ec_heal_data (frame, ec, blocking, loc->inode, sources,
+                                    healed_sinks);
+        } else if (IA_ISDIR (loc->inode->ia_type) && !partial) {
+                ret = ec_heal_entry (frame, ec, loc->inode, sources,
+                                     healed_sinks);
+        } else {
+                /* Nothing beyond metadata to heal here: report every
+                 * participant as both source and healed sink. */
+                ret = 0;
+                memcpy (sources, participants, ec->nodes);
+                memcpy (healed_sinks, participants, ec->nodes);
+        }
+
+        if (ret == 0) {
+                good = ec_char_array_to_mask (sources, ec->nodes);
+                bad = ec_char_array_to_mask (healed_sinks, ec->nodes);
+        } else {
+                op_ret = -1;
+                op_errno = -ret;
+        }
+
+out:
+        /* The reported good/bad masks are the intersection of the metadata
+         * result and the data/entry result; both are 0 on early failure. */
+        if (fop->cbks.heal) {
+                fop->cbks.heal (fop->req_frame, fop, fop->xl, op_ret,
+                                op_errno, ec_char_array_to_mask (participants,
+                                                                 ec->nodes),
+                                mgood & good, mbad & bad, NULL);
+        }
+        if (frame)
+                STACK_DESTROY (frame->root);
+        return;
+}
+
+/*
+ * Synctask entry point handed to synctask_new by ec_heal: unpacks the heal
+ * fop carried in @opaque and runs ec_heal_do on it. Always returns 0.
+ */
+int
+ec_synctask_heal_wrap (void *opaque)
+{
+        ec_fop_data_t *heal_fop = opaque;
+        loc_t *target = &heal_fop->loc[0];
+
+        ec_heal_do (heal_fop->xl, heal_fop, target, heal_fop->int32);
+
+        return 0;
+}
+
+/*
+ * Completion callback handed to synctask_new by ec_heal: releases the fop
+ * carried in @opaque, if there is one. @ret and @heal are not used.
+ * Always returns 0.
+ */
+int
+ec_heal_done (int ret, call_frame_t *heal, void *opaque)
+{
+        if (opaque == NULL) {
+                return 0;
+        }
+
+        ec_fop_data_release (opaque);
+
+        return 0;
+}
+
+/*
+ * ec_heal - queue a heal of @loc to run as a synctask.
+ *
+ * @frame:   requester's frame; may be NULL. A frame whose ->local is
+ *           already set is rejected.
+ * @this:    the ec xlator.
+ * @target, @minimum, @data: forwarded to ec_fop_data_allocate.
+ * @func:    heal callback stored in the fop; also called directly on failure.
+ * @loc:     object to heal; must carry an inode with a non-null gfid.
+ * @partial: stored in fop->int32 and later passed to ec_heal_do.
+ * @xdata:   optional dict; a reference is taken when non-NULL.
+ *
+ * On success the fop is handed to synctask_new with ec_synctask_heal_wrap
+ * as the task and ec_heal_done as its completion callback. On any failure
+ * the fop (if allocated) is released and @func, when non-NULL, is called
+ * with op_ret -1 and op_errno EIO.
+ */
+void
+ec_heal (call_frame_t *frame, xlator_t *this, uintptr_t target,
+         int32_t minimum, fop_heal_cbk_t func, void *data, loc_t *loc,
+         int32_t partial, dict_t *xdata)
+{
+        ec_cbk_t callback = { .heal = func };
+        ec_fop_data_t *fop = NULL;
+        int ret = 0;
+
+        gf_log("ec", GF_LOG_TRACE, "EC(HEAL) %p", frame);
+
+        VALIDATE_OR_GOTO(this, fail);
+        GF_VALIDATE_OR_GOTO(this->name, this->private, fail);
+
+        if (!loc || !loc->inode || gf_uuid_is_null (loc->inode->gfid))
+                goto fail;
+
+        if (frame && frame->local)
+                goto fail;
+        fop = ec_fop_data_allocate (frame, this, EC_FOP_HEAL,
+                                    EC_FLAG_UPDATE_LOC_INODE, target, minimum,
+                                    ec_wind_heal, ec_manager_heal, callback,
+                                    data);
+        if (fop == NULL)
+                goto fail;
+
+        fop->int32 = partial;
+
+        /* loc is known to be non-NULL here: it was validated above. */
+        if (loc_copy(&fop->loc[0], loc) != 0)
+                goto fail;
+
+        if (xdata)
+                fop->xdata = dict_ref(xdata);
+
+        ret = synctask_new (this->ctx->env, ec_synctask_heal_wrap,
+                            ec_heal_done, NULL, fop);
+        if (ret < 0)
+                goto fail;
+        return;
+fail:
+        if (fop)
+                ec_fop_data_release (fop);
+        if (func)
+                func (frame, NULL, this, -1, EIO, 0, 0, 0, NULL);
+}
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index 53b3996590c..a7cf8f7bd30 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -18,7 +18,7 @@
#include "syncop-utils.h"
#include "protocol-common.h"
-#define SHD_INODE_LRU_LIMIT 2048
+#define SHD_INODE_LRU_LIMIT 10
#define ASSERT_LOCAL(this, healer) \
do { \
if (!ec_shd_is_subvol_local (this, healer->subvol)) { \
@@ -224,8 +224,8 @@ ec_shd_index_heal (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
void *data)
{
struct subvol_healer *healer = data;
- ec_t *ec = NULL;
- loc_t loc = {0};
+ ec_t *ec = NULL;
+ loc_t loc = {0};
int ret = 0;
ec = healer->this->private;
@@ -254,6 +254,8 @@ ec_shd_index_heal (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
ec_shd_selfheal (healer, healer->subvol, &loc);
out:
+ if (loc.inode)
+ inode_forget (loc.inode, 0);
loc_wipe (&loc);
return 0;
@@ -280,7 +282,7 @@ ec_shd_index_sweep (struct subvol_healer *healer)
ret = syncop_dir_scan (subvol, &loc, GF_CLIENT_PID_AFR_SELF_HEALD,
healer, ec_shd_index_heal);
- inode_forget (loc.inode, 1);
+ inode_forget (loc.inode, 0);
loc_wipe (&loc);
return ret;
@@ -318,10 +320,12 @@ ec_shd_full_heal (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
ec_shd_selfheal (healer, healer->subvol, &loc);
- loc_wipe (&loc);
ret = 0;
out:
+ if (loc.inode)
+ inode_forget (loc.inode, 0);
+ loc_wipe (&loc);
return ret;
}
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
index e9d842fcfa9..2b497efd166 100644
--- a/xlators/cluster/ec/src/ec-helpers.c
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -181,6 +181,7 @@ int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t value[],
void *ptr;
int32_t len;
int32_t vindex;
+ int32_t old_size = 0;
if ((dict == NULL) || (dict_get_ptr_and_len(dict, key, &ptr, &len) != 0)) {
return -1;
@@ -192,11 +193,18 @@ int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t value[],
memset (value, 0, size * sizeof(uint64_t));
/* 3.6 version ec would have stored version in 64 bit. In that case treat
- * metadata versions as 0*/
- size = min (size, len/sizeof(uint64_t));
- for (vindex = 0; vindex < size; vindex++) {
+ * metadata versions same as data*/
+ old_size = min (size, len/sizeof(uint64_t));
+ for (vindex = 0; vindex < old_size; vindex++) {
value[vindex] = ntoh64(*((uint64_t *)ptr + vindex));
}
+
+ if (old_size < size) {
+ for (vindex = old_size; vindex < size; vindex++) {
+ value[vindex] = value[old_size-1];
+ }
+ }
+
dict_del(dict, key);
return 0;
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index f87df4016c0..7372c0a0599 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -394,7 +394,7 @@ int32_t ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
goto out;
}
- if (dict_set_str(dict, EC_XATTR_HEAL, str) != 0) {
+ if (dict_set_dynstr(dict, EC_XATTR_HEAL, str) != 0) {
GF_FREE(str);
dict_unref(dict);
dict = NULL;
@@ -1202,10 +1202,6 @@ out:
int32_t ec_combine_readv(ec_fop_data_t * fop, ec_cbk_data_t * dst,
ec_cbk_data_t * src)
{
- if (src->dirty) {
- return 0;
- }
-
if (!ec_vector_compare(dst->vector, dst->int32, src->vector, src->int32))
{
gf_log(fop->xl->name, GF_LOG_NOTICE, "Mismatching vector in "