From 33fdc310700da74a4142dab48d00c4753100904b Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Thu, 16 Apr 2015 09:25:31 +0530 Subject: cluster/ec: metadata/name/entry heal implementation for ec Metadata self-heal: 1) Take inode lock in domain 'this->name' on 0-0 range (full file) 2) perform lookup and get the xattrs on all the bricks 3) Choose the brick with highest version as source 4) Setattr uid/gid/permissions 5) removexattr stale xattrs 6) Setxattr existing/new xattrs 7) xattrop with -ve values of 'dirty' and difference of highest and its own version values for version xattr 8) unlock lock acquired in 1) Entry self-heal: 1) take directory lock in domain 'this->name:self-heal' on 'NULL' to prevent more than one self-heal 2) we take directory lock in domain 'this->name' on 'NULL' 3) Perform lookup on version, dirty and remember the values 4) unlock lock acquired in 2) 5) readdir on all the bricks and trigger name heals 6) xattrop with -ve values of 'dirty' and difference of highest and its own version values for version xattr 7) unlock lock acquired in 1) Name heal: 1) Take 'name' lock in 'this->name' on 'NULL' 2) Perform lookup on 'name' and get stat and xattr structures 3) Build gfid_db where for each gfid we know what subvolumes/bricks have a file with 'name' 4) Delete all the stale files i.e. the file does not exist on more than ec->redundancy number of bricks 5) On all the subvolumes/bricks with missing entry create 'name' with same type,gfid,permissions etc. 6) Unlock lock acquired in 1) Known limitation: At the moment with present design, it conservatively preserves the 'name' in case it can not decide whether to delete it. this can happen in the following scenario: 1) we have 3=2+1 (bricks: A, B, C) ec volume and 1 brick is down (Lets say A) 2) rename d1/f1 -> d2/f2 is performed but the rename is successful only on one of the bricks (Lets say B) 3) Now name self-heal on d1 and d2 would re-create the file on both d1 and d2 resulting in d1/f1 and d2/f2. Because we wanted to prevent data loss in the case above, the following scenario is not healable, i.e. it needs manual intervention: 1) we have 3=2+1 (bricks: A, B, C) ec volume and 1 brick is down (Lets say A) 2) We have two hard links: d1/a, d2/b and another file d3/c even before the brick went down 3) rename d3/c -> d2/b is performed 4) Now name self-heal on d2/b doesn't heal because d2/b with older gfid will not be deleted. One could think why not delete the link if there is more than 1 hardlink, but that leads to similar data loss issue I described earlier: Scenario: 1) we have 3=2+1 (bricks: A, B, C) ec volume and 1 brick is down (Lets say A) 2) We have two hard links: d1/a, d2/b 3) rename d1/a -> d3/c, d2/b -> d4/d is performed and both the operations are successful only on one of the bricks (Lets say B) 4) Now name self-heal on the 'names' above which can happen in parallel can decide to delete the file thinking it has 2 links but after all the self-heals do unlinks we are left with data loss. Change-Id: I3a68218a47bb726bd684604efea63cf11cfd11be BUG: 1213358 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.org/10298 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/ec/src/ec-common.h | 5 + xlators/cluster/ec/src/ec-heal.c | 1053 ++++++++++++++++++++++++++++++++++++ 2 files changed, 1058 insertions(+) (limited to 'xlators/cluster/ec') diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index aaae16e71c3..ba009040b71 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -15,6 +15,11 @@ #include "ec-data.h" +typedef enum { + EC_DATA_TXN, + EC_METADATA_TXN +} ec_txn_t; + #define EC_CONFIG_VERSION 0 #define EC_CONFIG_ALGORITHM 0 diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 6148df904a4..1e19cf57e1b 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -20,6 +20,35 @@ #include "ec-mem-types.h" #include "ec-data.h" +#include "byte-order.h" +#include "syncop.h" +#include "syncop-utils.h" +#include "cluster-syncop.h" + +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; }) +#define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; }) +#define EC_INTERSECT(dst, src1, src2, max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i]; }) +#define EC_ADJUST_SOURCE(source, sources, max) ({int __i; if (sources[source] == 0) {source = -1; for (__i = 0; __i < max; __i++) if (sources[__i]) source = __i; } }) +#define IA_EQUAL(f, s, field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0) +#define EC_REPLIES_ALLOC(replies, numsubvols) do { \ + int __i = 0; \ + replies = alloca0(numsubvols * sizeof (*replies)); \ + for (__i = 0; __i < numsubvols; __i++) \ + INIT_LIST_HEAD (&replies[__i].entries.list); \ + } while (0) + + +struct ec_name_data { + call_frame_t *frame; + unsigned char *participants; + unsigned char *failed_on; + unsigned char *gfidless; + unsigned char *enoent; + unsigned char *same; + char *name; + inode_t *parent; + default_args_cbk_t *replies; +}; static char *ec_ignore_xattrs[] = { GF_SELINUX_XATTR_KEY, @@ -35,6 +64,9 @@ ec_ignorable_key_match (dict_t *dict, char *key, data_t *val, void *mdata) if (!key) goto out; + if (strncmp (key, EC_XATTR_PREFIX, strlen (EC_XATTR_PREFIX)) == 0) + return _gf_true; + for (i = 0; ec_ignore_xattrs[i]; i++) { if (!strcmp (key, ec_ignore_xattrs[i])) return _gf_true; @@ -1590,3 +1622,1024 @@ void ec_fheal(call_frame_t * frame, xlator_t * this, uintptr_t target, xdata); } } + +/* Common heal code */ +void +ec_mask_to_char_array (uintptr_t mask, unsigned char *array, int numsubvols) +{ + int i = 0; + + for (i = 0; i < numsubvols; i++) + array[i] = ((mask >> i) & 1); +} + +int +ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies, + uint64_t *versions, uint64_t *dirty, + unsigned char *sources, unsigned char *healed_sinks) +{ + void *ptr = NULL; + uint64_t *value = NULL; + uint64_t max_version = 0; + int source = -1; + int32_t len = 0; + int ret = 0; + int i = 0; + struct iatt source_ia = {0}; + struct iatt child_ia = {0}; + + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == -1) + continue; + + if (source == -1) + source = i; + + ret = dict_get_ptr_and_len (replies[i].xdata, EC_XATTR_VERSION, + &ptr, &len); + if (ret == 0) { + value = ptr; + versions[i] = ntoh64(value[type]); + if (max_version < versions[i]) { + max_version = versions[i]; + source = i; + } + } + + ret = dict_get_ptr_and_len (replies[i].xdata, EC_XATTR_DIRTY, + &ptr, &len); + if (ret == 0) { + value = ptr; + dirty[i] = ntoh64(value[type]); + } + } + + if (source < 0) + goto out; + + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == -1) + continue; + + if (versions[i] == versions[source]) + sources[i] = 1; + else + healed_sinks[i] = 1; + } + + if (type == EC_METADATA_TXN) { + source_ia = replies[source].stat; + for (i = 0; i < ec->nodes; i++) { + if (!sources[i]) + continue; + child_ia = replies[i].stat; + if (!IA_EQUAL(source_ia, child_ia, gfid) || + !IA_EQUAL(source_ia, child_ia, type) || + !IA_EQUAL(source_ia, child_ia, prot) || + !IA_EQUAL(source_ia, child_ia, uid) || + !IA_EQUAL(source_ia, child_ia, gid)) { + sources[i] = 0; + healed_sinks[i] = 1; + } + } + } +out: + return source; +} + +int +ec_adjust_versions (call_frame_t *frame, ec_t *ec, ec_txn_t type, inode_t *inode, int source, + unsigned char *sources, + unsigned char *healed_sinks, uint64_t *versions, + uint64_t *dirty) +{ + int i = 0; + int ret = 0; + dict_t *xattr = NULL; + int op_ret = 0; + loc_t loc = {0}; + gf_boolean_t erase_dirty = _gf_false; + uint64_t versions_xattr[2] = {0}; + uint64_t dirty_xattr[2] = {0}; + uint64_t allzero[2] = {0}; + + loc.inode = inode_ref (inode); + gf_uuid_copy (loc.gfid, inode->gfid); + xattr = dict_new (); + if (!xattr) + goto out; + + /* dirty xattr represents if the file/dir needs heal. Unless all the + * copies are healed, don't erase it */ + if (EC_COUNT (sources, ec->nodes) + + EC_COUNT (healed_sinks, ec->nodes) == ec->nodes) + erase_dirty = _gf_true; + + for (i = 0; i < ec->nodes; i++) { + if (!sources[i] && !healed_sinks[i]) + continue; + versions_xattr[type] = hton64(versions[source] - versions[i]); + ret = dict_set_static_bin (xattr, EC_XATTR_VERSION, + versions_xattr, + sizeof (versions_xattr)); + if (ret < 0) { + op_ret = -ENOTCONN; + continue; + } + + if (erase_dirty) { + dirty_xattr[type] = hton64(-dirty[i]); + ret = dict_set_static_bin (xattr, EC_XATTR_DIRTY, + dirty_xattr, + sizeof (dirty_xattr)); + if (ret < 0) { + op_ret = -ENOTCONN; + continue; + } + } + + if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) && + (memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0)) + continue; + + ret = syncop_xattrop (ec->xl_list[i], &loc, + GF_XATTROP_ADD_ARRAY64, xattr, NULL, + NULL); + if (ret < 0) { + op_ret = -ret; + continue; + } + } + +out: + if (xattr) + dict_unref (xattr); + loc_wipe (&loc); + return op_ret; +} + +int +__ec_heal_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *locked_on, default_args_cbk_t *replies, + uint64_t *versions, uint64_t *dirty, unsigned char *sources, + unsigned char *healed_sinks, ec_txn_t type) +{ + loc_t loc = {0}; + unsigned char *output = NULL; + dict_t *xdata = NULL; + int ret = 0; + int source = 0; + + xdata = dict_new (); + if (!xdata) { + ret = -ENOMEM; + goto out; + } + + if (dict_set_uint64(xdata, "list-xattr", 0)) { + ret = -ENOMEM; + goto out; + } + + loc.inode = inode_ref (inode); + gf_uuid_copy (loc.gfid, inode->gfid); + output = alloca0 (ec->nodes); + ret = cluster_lookup (ec->xl_list, locked_on, ec->nodes, replies, + output, frame, ec->xl, &loc, xdata); + if (ret <= ec->fragments) { + ret = -ENOTCONN; + goto out; + } + + source = ec_heal_find_direction (ec, type, replies, versions, + dirty, sources, healed_sinks); + if (source < 0) { + ret = -EIO; + goto out; + } + ret = source; +out: + if (xdata) + dict_unref (xdata); + + loc_wipe (&loc); + return ret; +} + +/* Metadata heal */ +int +__ec_removexattr_sinks (call_frame_t *frame, ec_t *ec, inode_t *inode, + int source, unsigned char *sources, + unsigned char *healed_sinks, + default_args_cbk_t *replies) +{ + int i = 0; + int ret = 0; + loc_t loc = {0}; + + loc.inode = inode_ref (inode); + gf_uuid_copy (loc.gfid, inode->gfid); + + for (i = 0; i < ec->nodes; i++) { + if (i == source) + continue; + if (!sources[i] && !healed_sinks[i]) + continue; + ret = dict_foreach (replies[i].xattr, ec_heal_xattr_clean, + replies[source].xattr); + if (ret < 0) { + sources[i] = 0; + healed_sinks[i] = 0; + } + + if (replies[i].xattr->count == 0) { + continue; + } else if (sources[i]) { + /* This can happen if setxattr/removexattr succeeds on + * the bricks but fails to update the version. This + * will make sure that the xattrs are made equal after + * heal*/ + sources[i] = 0; + healed_sinks[i] = 1; + } + + ret = syncop_removexattr (ec->xl_list[i], &loc, "", + replies[i].xattr, NULL); + if (ret < 0) + healed_sinks[i] = 0; + } + + loc_wipe (&loc); + if (EC_COUNT (healed_sinks, ec->nodes) == 0) + return -ENOTCONN; + return 0; +} + +int +__ec_heal_metadata (call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *locked_on) +{ + loc_t loc = {0}; + int ret = 0; + int source = 0; + default_args_cbk_t *replies = NULL; + uint64_t *versions = NULL; + uint64_t *dirty = NULL; + unsigned char *sources = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *output = NULL; + dict_t *source_dict = NULL; + struct iatt source_buf = {0}; + + EC_REPLIES_ALLOC (replies, ec->nodes); + + loc.inode = inode_ref (inode); + gf_uuid_copy (loc.gfid, inode->gfid); + output = alloca0 (ec->nodes); + versions = alloca0 (ec->nodes * sizeof (*versions)); + dirty = alloca0 (ec->nodes * sizeof (*dirty)); + sources = alloca0 (ec->nodes); + healed_sinks = alloca0 (ec->nodes); + source = __ec_heal_prepare (frame, ec, inode, locked_on, replies, + versions, dirty, sources, healed_sinks, + EC_METADATA_TXN); + if (source < 0) { + ret = -EIO; + goto out; + } + + source_buf = replies[source].stat; + ret = cluster_setattr (ec->xl_list, healed_sinks, ec->nodes, replies, + output, frame, ec->xl, &loc, + &source_buf, GF_SET_ATTR_MODE | + GF_SET_ATTR_UID | GF_SET_ATTR_GID, NULL); + /*In case the operation fails on some of the subvols*/ + memcpy (healed_sinks, output, ec->nodes); + if (EC_COUNT (healed_sinks, ec->nodes) == 0) { + ret = -ENOTCONN; + goto out; + } + + ret = cluster_getxattr (ec->xl_list, locked_on, ec->nodes, replies, + output, frame, ec->xl, &loc, NULL, NULL); + EC_INTERSECT (sources, sources, output, ec->nodes); + EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes); + EC_ADJUST_SOURCE (source, sources, ec->nodes); + if ((EC_COUNT (healed_sinks, ec->nodes) == 0) || (source < 0)) { + ret = -ENOTCONN; + goto out; + } + + ret = __ec_removexattr_sinks (frame, ec, inode, source, sources, + healed_sinks, replies); + if (ret < 0) + goto out; + + source_dict = dict_ref (replies[source].xattr); + if (dict_foreach_match (source_dict, ec_ignorable_key_match, NULL, + dict_remove_foreach_fn, NULL) == -1) { + ret = -ENOMEM; + goto out; + } + + ret = cluster_setxattr (ec->xl_list, healed_sinks, ec->nodes, + replies, output, frame, ec->xl, &loc, + source_dict, 0, NULL); + + EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes); + if (EC_COUNT (healed_sinks, ec->nodes) == 0) { + ret = -ENOTCONN; + goto out; + } + + ret = ec_adjust_versions (frame, ec, EC_METADATA_TXN, inode, source, + sources, healed_sinks, versions, dirty); +out: + if (source_dict) + dict_unref (source_dict); + + loc_wipe (&loc); + cluster_replies_wipe (replies, ec->nodes); + return ret; +} + +int +ec_heal_metadata (call_frame_t *req_frame, ec_t *ec, inode_t *inode) +{ + unsigned char *locked_on = NULL; + unsigned char *up_subvols = NULL; + unsigned char *output = NULL; + int ret = 0; + default_args_cbk_t *replies = NULL; + call_frame_t *frame = NULL; + + EC_REPLIES_ALLOC (replies, ec->nodes); + frame = copy_frame (req_frame); + if (!frame) { + ret = -ENOMEM; + goto out; + } + + /*Do heal as root*/ + frame->root->uid = 0; + frame->root->gid = 0; + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + up_subvols = alloca0(ec->nodes); + ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes); + ret = cluster_inodelk (ec->xl_list, up_subvols, ec->nodes, replies, + locked_on, frame, ec->xl, ec->xl->name, inode, 0, + 0); + { + if (ret <= ec->fragments) { + gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", uuid_utoa (inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + ret = __ec_heal_metadata (frame, ec, inode, locked_on); + } +unlock: + cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, ec->xl->name, inode, 0, 0); +out: + cluster_replies_wipe (replies, ec->nodes); + if (frame) + STACK_DESTROY (frame->root); + return ret; +} + +/*entry heal*/ +int +__ec_heal_entry_prepare (call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *locked_on, uint64_t *versions, + uint64_t *dirty, unsigned char *sources, + unsigned char *healed_sinks) +{ + int source = 0; + default_args_cbk_t *replies = NULL; + loc_t loc = {0}; + int ret = 0; + + EC_REPLIES_ALLOC (replies, ec->nodes); + + loc.inode = inode_ref (inode); + gf_uuid_copy (loc.gfid, inode->gfid); + source = __ec_heal_prepare (frame, ec, inode, locked_on, replies, + versions, dirty, sources, healed_sinks, + EC_DATA_TXN); + if (source < 0) { + ret = -EIO; + goto out; + } + ret = source; +out: + loc_wipe (&loc); + cluster_replies_wipe (replies, ec->nodes); + return ret; +} + +/*Name heal*/ +int +ec_delete_stale_name (dict_t *gfid_db, char *key, data_t *d, void *data) +{ + struct ec_name_data *name_data = data; + struct iatt *ia = NULL; + ec_t *ec = NULL; + loc_t loc = {0}; + unsigned char *same = data_to_bin (d); + default_args_cbk_t *replies = NULL; + unsigned char *output = NULL; + int ret = 0; + int estale_count = 0; + int i = 0; + call_frame_t *frame = name_data->frame; + + ec = name_data->frame->this->private; + EC_REPLIES_ALLOC (replies, ec->nodes); + if (EC_COUNT (same, ec->nodes) >= ec->fragments) { + ret = 0; + goto out; + } + + loc.inode = inode_new (name_data->parent->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + gf_uuid_parse (key, loc.gfid); + output = alloca0(ec->nodes); + ret = cluster_lookup (ec->xl_list, name_data->participants, ec->nodes, + replies, output, name_data->frame, ec->xl, &loc, + NULL); + + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) { + if (replies[i].op_errno == ESTALE || + replies[i].op_errno == ENOENT) + estale_count++; + else + name_data->participants[i] = 0; + } + } + + if (estale_count <= ec->redundancy) { + /* We have at least ec->fragments number of fragments, so the + * file is recoverable, so don't delete it*/ + + /* Please note that the lookup call above could fail with + * ENOTCONN on all subvoumes and still this branch will be + * true, but in those cases conservatively we decide to not + * delete the file until we are sure*/ + ret = 0; + goto out; + } + + /*Noway to recover, delete the name*/ + loc_wipe (&loc); + loc.parent = inode_ref (name_data->parent); + gf_uuid_copy (loc.pargfid, loc.parent->gfid); + loc.name = name_data->name; + for (i = 0; i < ec->nodes; i++) { + if (same[i] && replies[i].valid && (replies[i].op_ret == 0)) { + ia = &replies[i].stat; + break; + } + } + + if (!ia) { + ret = -ENOTCONN; + goto out; + } + + if (IA_ISDIR (ia->ia_type)) { + ret = cluster_rmdir (ec->xl_list, same, ec->nodes, replies, + output, frame, ec->xl, &loc, 1, NULL); + } else { + ret = cluster_unlink (ec->xl_list, same, ec->nodes, replies, + output, frame, ec->xl, &loc, 0, NULL); + } + + for (i = 0; i < ec->nodes; i++) { + if (output[i]) { + same[i] = 0; + name_data->enoent[i] = 1; + } else { + /*op failed*/ + if (same[i]) + name_data->participants[i] = 0; + } + } + ret = 0; + /*This will help in making decisions about creating names*/ + dict_del (gfid_db, key); +out: + cluster_replies_wipe (replies, ec->nodes); + loc_wipe (&loc); + return ret; +} + +int +ec_delete_stale_names (call_frame_t *frame, ec_t *ec, inode_t *parent, + char *name, default_args_cbk_t *replies, dict_t *gfid_db, + unsigned char *enoent, unsigned char *gfidless, + unsigned char *participants) +{ + struct ec_name_data name_data = {0}; + + name_data.enoent = enoent; + name_data.gfidless = gfidless; + name_data.participants = participants; + name_data.name = name; + name_data.parent = parent; + name_data.frame = frame; + name_data.replies = replies; + return dict_foreach (gfid_db, ec_delete_stale_name, &name_data); +} + +int +_assign_same (dict_t *dict, char *key, data_t *value, void *data) +{ + struct ec_name_data *name_data = data; + + name_data->same = data_to_bin (value); + return 0; +} + +int +ec_create_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, + default_args_cbk_t *lookup_replies, dict_t *gfid_db, + unsigned char *enoent, unsigned char *participants) +{ + int ret = 0; + int i = 0; + struct ec_name_data name_data = {0}; + struct iatt *ia = NULL; + unsigned char *output = 0; + unsigned char *output1 = 0; + default_args_cbk_t *replies = NULL; + loc_t loc = {0}; + loc_t srcloc = {0}; + unsigned char *link = NULL; + unsigned char *create = NULL; + dict_t *xdata = NULL; + char *linkname = NULL; + + /* There should be just one gfid key */ + EC_REPLIES_ALLOC (replies, ec->nodes); + if (gfid_db->count != 1) { + ret = -EINVAL; + goto out; + } + + ret = dict_foreach (gfid_db, _assign_same, &name_data); + if (ret < 0) + goto out; + /*There should at least be one valid success reply with gfid*/ + for (i = 0; i < ec->nodes; i++) + if (name_data.same[i]) + break; + + if (i == ec->nodes) { + ret = -EINVAL; + goto out; + } + + ia = &lookup_replies[i].stat; + xdata = dict_new (); + loc.parent = inode_ref (parent); + gf_uuid_copy (loc.pargfid, parent->gfid); + loc.inode = inode_new (parent->table); + if (loc.inode) + srcloc.inode = inode_ref (loc.inode); + gf_uuid_copy (srcloc.gfid, ia->ia_gfid); + if (!loc.inode || !xdata || dict_set_static_bin (xdata, "gfid-req", + ia->ia_gfid, + sizeof (ia->ia_gfid))) { + ret = -ENOMEM; + goto out; + } + loc.name = name; + link = alloca0 (ec->nodes); + create = alloca0 (ec->nodes); + output = alloca0 (ec->nodes); + output1 = alloca0 (ec->nodes); + switch (ia->ia_type) { + case IA_IFDIR: + ret = cluster_mkdir (ec->xl_list, enoent, ec->nodes, + replies, output, frame, ec->xl, &loc, + st_mode_from_ia (ia->ia_prot, + ia->ia_type), 0, xdata); + break; + + case IA_IFLNK: + /*Check for hard links and create/link*/ + ret = cluster_lookup (ec->xl_list, enoent, ec->nodes, + replies, output, frame, ec->xl, + &srcloc, NULL); + for (i = 0; i < ec->nodes; i++) { + if (output[i]) { + link[i] = 1; + } else { + if (replies[i].op_errno == ENOENT || + replies[i].op_errno == ESTALE) { + create[i] = 1; + } + } + } + + if (EC_COUNT (link, ec->nodes)) { + cluster_link (ec->xl_list, link, ec->nodes, + replies, output1, frame, ec->xl, + &srcloc, &loc, NULL); + } + + if (EC_COUNT (create, ec->nodes)) { + cluster_readlink (ec->xl_list, name_data.same, + ec->nodes, replies, output, + frame, ec->xl, &srcloc, 4096, + NULL); + if (EC_COUNT (output, ec->nodes) == 0) { + ret = -ENOTCONN; + goto out; + } + + for (i = 0; i < ec->nodes; i++) { + if (output[i]) + break; + } + linkname = alloca0 (strlen(replies[i].buf) + 1); + strcpy (linkname, replies[i].buf); + cluster_symlink (ec->xl_list, create, ec->nodes, + replies, output, frame, ec->xl, + linkname, &loc, 0, xdata); + } + for (i = 0; i < ec->nodes; i++) + if (output1[i]) + output[i] = 1; + break; + default: + ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, + 1); + if (ret) + goto out; + ret = cluster_mknod (ec->xl_list, enoent, ec->nodes, + replies, output, frame, ec->xl, + &loc, st_mode_from_ia (ia->ia_prot, + ia->ia_type), + ia->ia_rdev, 0, xdata); + break; + } + + for (i = 0; i < ec->nodes; i++) { + if (enoent[i] && !output[i]) + participants[i] = 0; + } + + ret = 0; +out: + loc_wipe (&loc); + loc_wipe (&srcloc); + EC_REPLIES_ALLOC (replies, ec->nodes); + if (xdata) + dict_unref (xdata); + return ret; +} + +int +__ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, + unsigned char *participants) +{ + unsigned char *output = NULL; + unsigned char *enoent = NULL; + default_args_cbk_t *replies = NULL; + dict_t *xdata = NULL; + dict_t *gfid_db = NULL; + int ret = 0; + loc_t loc = {0}; + int i = 0; + struct iatt *ia = NULL; + char gfid[64] = {0}; + unsigned char *same = NULL; + unsigned char *gfidless = NULL; + + loc.parent = inode_ref (parent); + loc.inode = inode_new (parent->table); + gf_uuid_copy (loc.pargfid, parent->gfid); + loc.name = name; + xdata = dict_new (); + gfid_db = dict_new (); + if (!xdata || !gfid_db || !loc.inode) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_int32 (xdata, GF_GFIDLESS_LOOKUP, 1); + if (ret) { + ret = -ENOMEM; + goto out; + } + + output = alloca0 (ec->nodes); + gfidless = alloca0 (ec->nodes); + enoent = alloca0 (ec->nodes); + EC_REPLIES_ALLOC (replies, ec->nodes); + ret = cluster_lookup (ec->xl_list, participants, ec->nodes, replies, + output, frame, ec->xl, &loc, NULL); + for (i = 0; i < ec->nodes; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == -1) { + /*If ESTALE comes here, that means parent dir is not + * present, nothing to do there, so reset participants + * for that brick*/ + if (replies[i].op_errno == ENOENT) + enoent[i] = 1; + else + participants[i] = 0; + continue; + } + ia = &replies[i].stat; + if (gf_uuid_is_null (ia->ia_gfid)) { + if (IA_ISDIR (ia->ia_type) || ia->ia_size == 0) + gfidless[i] = 1; + else + participants[i] = 0; + } else { + uuid_utoa_r (ia->ia_gfid, gfid); + ret = dict_get_bin (gfid_db, gfid, (void **)&same); + if (ret < 0) { + same = alloca0(ec->nodes); + } + same[i] = 1; + if (ret < 0) { + ret = dict_set_static_bin (gfid_db, gfid, same, + ec->nodes); + } + if (ret < 0) + goto out; + } + } + + ret = ec_delete_stale_names (frame, ec, parent, name, replies, gfid_db, + enoent, gfidless, participants); + + if (gfid_db->count == 0) { + /* All entries seem to be stale entries and deleted, + * nothing more to do.*/ + goto out; + } + + if (gfid_db->count > 1) { + gf_log (ec->xl->name, GF_LOG_INFO, "%s/%s: Not able to heal", + uuid_utoa (parent->gfid), name); + memset (participants, 0, ec->nodes); + goto out; + } + + EC_INTERSECT (enoent, enoent, participants, ec->nodes); + if (EC_COUNT (enoent, ec->nodes) == 0) { + ret = 0; + goto out; + } + + ret = ec_create_name (frame, ec, parent, name, replies, gfid_db, enoent, + participants); +out: + cluster_replies_wipe (replies, ec->nodes); + loc_wipe (&loc); + if (xdata) + dict_unref (xdata); + if (gfid_db) + dict_unref (gfid_db); + return ret; +} + +int +ec_heal_name (call_frame_t *frame, ec_t *ec, inode_t *parent, char *name, + unsigned char *participants) +{ + int ret = 0; + default_args_cbk_t *replies = NULL; + unsigned char *output = NULL; + unsigned char *locked_on = NULL; + loc_t loc = {0}; + + loc.parent = inode_ref (parent); + loc.name = name; + loc.inode = inode_new (parent->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + EC_REPLIES_ALLOC (replies, ec->nodes); + output = alloca0 (ec->nodes); + locked_on = alloca0 (ec->nodes); + ret = cluster_entrylk (ec->xl_list, participants, ec->nodes, replies, + locked_on, frame, ec->xl, ec->xl->name, parent, + NULL); + { + if (ret <= ec->fragments) { + gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", uuid_utoa (parent->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + EC_INTERSECT (participants, participants, locked_on, ec->nodes); + ret = __ec_heal_name (frame, ec, parent, name, participants); + } +unlock: + cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, ec->xl->name, parent, NULL); +out: + cluster_replies_wipe (replies, ec->nodes); + loc_wipe (&loc); + return ret; +} + +int +ec_name_heal_handler (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct ec_name_data *name_data = data; + xlator_t *this = THIS; + ec_t *ec = this->private; + unsigned char *name_on = alloca0 (ec->nodes); + int i = 0; + int ret = 0; + + memcpy (name_on, name_data->participants, ec->nodes); + ret = ec_heal_name (name_data->frame, ec, parent->inode, + entry->d_name, name_on); + + if (ret < 0) + memset (name_on, 0, ec->nodes); + + for (i = 0; i < ec->nodes; i++) + if (name_data->participants[i] && !name_on[i]) + name_data->failed_on[i] = 1; + return 0; +} + +int +ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *participants) +{ + int i = 0; + int j = 0; + loc_t loc = {0}; + struct ec_name_data name_data = {0}; + + loc.inode = inode_ref (inode); + gf_uuid_copy (loc.gfid, inode->gfid); + name_data.frame = frame; + name_data.participants = participants; + name_data.failed_on = alloca0(ec->nodes);; + + for (i = 0; i < ec->nodes; i++) { + if (!participants[i]) + continue; + syncop_dir_scan (ec->xl_list[i], &loc, + GF_CLIENT_PID_AFR_SELF_HEALD, &name_data, + ec_name_heal_handler); + for (j = 0; j < ec->nodes; j++) + if (name_data.failed_on[j]) + participants[j] = 0; + + if (EC_COUNT (participants, ec->nodes) <= ec->fragments) + return -ENOTCONN; + } + return 0; +} + +int +__ec_heal_entry (call_frame_t *frame, ec_t *ec, inode_t *inode, + unsigned char *heal_on) +{ + unsigned char *locked_on = NULL; + unsigned char *output = NULL; + uint64_t *versions = NULL; + uint64_t *dirty = NULL; + unsigned char *sources = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *participants = NULL; + default_args_cbk_t *replies = NULL; + int ret = 0; + int source = 0; + int i = 0; + + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + versions = alloca0 (ec->nodes * sizeof (*versions)); + dirty = alloca0 (ec->nodes * sizeof (*dirty)); + sources = alloca0 (ec->nodes); + healed_sinks = alloca0 (ec->nodes); + + EC_REPLIES_ALLOC (replies, ec->nodes); + ret = cluster_entrylk (ec->xl_list, heal_on, ec->nodes, replies, + locked_on, frame, ec->xl, ec->xl->name, inode, + NULL); + { + if (ret <= ec->fragments) { + gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", uuid_utoa (inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + ret = __ec_heal_entry_prepare (frame, ec, inode, locked_on, + versions, dirty, sources, + healed_sinks); + source = ret; + } +unlock: + cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, ec->xl->name, inode, NULL); + if (ret < 0) + goto out; + + participants = alloca0 (ec->nodes); + for (i = 0; i < ec->nodes; i++) { + if (sources[i] || healed_sinks[i]) + participants[i] = 1; + } + ret = ec_heal_names (frame, ec, inode, participants); + + if (EC_COUNT (participants, ec->nodes) <= ec->fragments) + goto out; + + for (i = 0; i < ec->nodes; i++) { + if (!participants[i]) { + sources[i] = 0; + healed_sinks[i] = 0; + } + } + + ec_adjust_versions (frame, ec, EC_DATA_TXN, inode, source, + sources, healed_sinks, versions, dirty); +out: + cluster_replies_wipe (replies, ec->nodes); + return ret; +} + +int +ec_heal_entry (call_frame_t *req_frame, ec_t *ec, inode_t *inode) +{ + unsigned char *locked_on = NULL; + unsigned char *up_subvols = NULL; + unsigned char *output = NULL; + char selfheal_domain[1024] = {0}; + int ret = 0; + default_args_cbk_t *replies = NULL; + call_frame_t *frame = NULL; + + EC_REPLIES_ALLOC (replies, ec->nodes); + locked_on = alloca0(ec->nodes); + output = alloca0(ec->nodes); + up_subvols = alloca0(ec->nodes); + + frame = copy_frame (req_frame); + if (!frame) { + ret = -ENOMEM; + goto out; + } + /*Do heal as root*/ + frame->root->uid = 0; + frame->root->gid = 0; + sprintf (selfheal_domain, "%s:self-heal", ec->xl->name); + ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes); + /*If other processes are already doing the heal, don't block*/ + ret = cluster_entrylk (ec->xl_list, up_subvols, ec->nodes, replies, + locked_on, frame, ec->xl, selfheal_domain, inode, + NULL); + { + if (ret <= ec->fragments) { + gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal " + "as only %d number of subvolumes could " + "be locked", uuid_utoa (inode->gfid), ret); + ret = -ENOTCONN; + goto unlock; + } + ret = __ec_heal_entry (frame, ec, inode, locked_on); + } +unlock: + cluster_unentrylk (ec->xl_list, locked_on, ec->nodes, replies, output, + frame, ec->xl, selfheal_domain, inode, NULL); +out: + cluster_replies_wipe (replies, ec->nodes); + if (frame) + STACK_DESTROY (frame->root); + return ret; +} -- cgit