diff options
author | Krutika Dhananjay <kdhananj@redhat.com> | 2015-10-14 14:14:51 +0530 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2016-04-29 18:21:56 -0700 |
commit | 84c8cc9c5936a2a7539f343c180f06312c8f6d39 (patch) | |
tree | 0a89b67bde2e03dafa9f61ffea34f19d11cc9938 /xlators/cluster/afr | |
parent | f0fb05d2cefae08c143f2bfdef151084f5ddb498 (diff) |
cluster/afr: Entry self-heal performance enhancements
Change-Id: I52da41dff5619492b656c2217f4716a6cdadebe0
BUG: 1269461
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/12442
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/cluster/afr')
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 13 |
-rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 13 |
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 95 |
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-entry.c | 305 |
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-name.c | 2 |
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 5 |
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heald.c | 2 |
-rw-r--r-- | xlators/cluster/afr/src/afr.c | 10 |
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 11 |
9 files changed, 408 insertions, 48 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index fda9785bdda..160170e035c 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4284,6 +4284,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } + local->need_full_crawl = _gf_false; + INIT_LIST_HEAD (&local->healer); return 0; out: @@ -4535,9 +4537,11 @@ afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending, int **changelog = NULL; int idx = -1; int m_idx = 0; + int d_idx = 0; int ret = 0; m_idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); + d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); idx = afr_index_from_ia_type (iat); @@ -4552,6 +4556,11 @@ afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending, changelog[i][m_idx] = hton32(1); if (idx != -1) changelog[i][idx] = hton32(1); + /* If the newentry marking is on a newly created directory, + * then mark it with the full-heal indicator. 
+ */ + if ((IA_ISDIR (iat)) && (priv->esh_granular)) + changelog[i][d_idx] = hton32(1); } ret = afr_set_pending_dict (priv, xattr, changelog); if (ret < 0) { @@ -4764,12 +4773,12 @@ afr_selfheal_locked_entry_inspect (call_frame_t *frame, xlator_t *this, *esh = afr_decide_heal_info (priv, sources, ret); } afr_selfheal_unentrylk (frame, this, inode, this->name, NULL, - data_lock); + data_lock, NULL); } unlock: if (!granular_locks) afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, - NULL, locked_on); + NULL, locked_on, NULL); out: if (locked_replies) afr_replies_wipe (locked_replies, priv->child_count); diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 15bae87a4f4..f240b5eec39 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -1104,12 +1104,13 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame, afr_transaction_type type, char *op_type) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - unsigned char *locked_nodes = NULL; int count = 0; int ret = -ENOMEM; int idx = -1; + int d_idx = -1; + unsigned char *locked_nodes = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; priv = this->private; local = frame->local; @@ -1117,6 +1118,7 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame, locked_nodes = alloca0 (priv->child_count); idx = afr_index_for_transaction_type (type); + d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); local->pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); @@ -1125,6 +1127,9 @@ _afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame, local->pending[empty_index][idx] = hton32 (1); + if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION)) + local->pending[empty_index][d_idx] = hton32 (1); + local->xdata_req = dict_new (); if (!local->xdata_req) goto out; @@ -1165,7 +1170,7 @@ _afr_handle_empty_brick_type (xlator_t *this, 
call_frame_t *frame, unlock: if (AFR_ENTRY_TRANSACTION == type) { afr_selfheal_unentrylk (frame, this, loc->inode, this->name, - NULL, locked_nodes); + NULL, locked_nodes, NULL); } else { afr_selfheal_uninodelk (frame, this, loc->inode, this->name, LLONG_MAX - 1, 0, locked_nodes); diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 68b5bb06799..0b92f616030 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -34,7 +34,7 @@ afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, - int subvol, dict_t *xattr) + int subvol, dict_t *xattr, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -48,7 +48,7 @@ afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], priv->children[subvol]->fops->xattrop, &loc, - GF_XATTROP_ADD_ARRAY, xattr, NULL); + GF_XATTROP_ADD_ARRAY, xattr, xdata); syncbarrier_wait (&local->barrier, 1); @@ -80,18 +80,22 @@ afr_check_stale_error (struct afr_reply *replies, afr_private_t *priv) dict_t * -afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, - int *output_dirty, int **output_matrix, int subvol) +afr_selfheal_output_xattr (xlator_t *this, gf_boolean_t is_full_crawl, + afr_transaction_type type, int *output_dirty, + int **output_matrix, int subvol, + int **full_heal_mtx_out) { - dict_t *xattr = NULL; - afr_private_t *priv = NULL; - int j = 0; - int idx = 0; - int ret = 0; - int *raw = 0; + int j = 0; + int idx = 0; + int d_idx = 0; + int ret = 0; + int *raw = 0; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; priv = this->private; idx = afr_index_for_transaction_type (type); + d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); xattr = dict_new (); if (!xattr) @@ -118,6 +122,8 @@ 
afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, goto err; raw[idx] = hton32 (output_matrix[subvol][j]); + if (is_full_crawl) + raw[d_idx] = hton32 (full_heal_mtx_out[subvol][j]); ret = dict_set_bin (xattr, priv->pending_key[j], raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); @@ -142,37 +148,57 @@ afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, struct afr_reply *replies, unsigned char *locked_on) { afr_private_t *priv = NULL; + afr_local_t *local = NULL; int i = 0; int j = 0; unsigned char *pending = NULL; int *input_dirty = NULL; int **input_matrix = NULL; + int **full_heal_mtx_in = NULL; + int **full_heal_mtx_out = NULL; int *output_dirty = NULL; int **output_matrix = NULL; dict_t *xattr = NULL; + dict_t *xdata = NULL; priv = this->private; + local = frame->local; pending = alloca0 (priv->child_count); input_dirty = alloca0 (priv->child_count * sizeof (int)); input_matrix = ALLOC_MATRIX (priv->child_count, int); + full_heal_mtx_in = ALLOC_MATRIX (priv->child_count, int); + full_heal_mtx_out = ALLOC_MATRIX (priv->child_count, int); output_dirty = alloca0 (priv->child_count * sizeof (int)); output_matrix = ALLOC_MATRIX (priv->child_count, int); + xdata = dict_new (); + if (!xdata) + return -1; + afr_selfheal_extract_xattr (this, replies, type, input_dirty, input_matrix); + if (local->need_full_crawl) + afr_selfheal_extract_xattr (this, replies, AFR_DATA_TRANSACTION, + NULL, full_heal_mtx_in); + for (i = 0; i < priv->child_count; i++) if (sinks[i] && !healed_sinks[i]) pending[i] = 1; for (i = 0; i < priv->child_count; i++) { for (j = 0; j < priv->child_count; j++) { - if (pending[j]) + if (pending[j]) { output_matrix[i][j] = 1; - else + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = 1; + } else { output_matrix[i][j] = -input_matrix[i][j]; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j]; + } } } @@ -188,17 +214,30 @@ afr_selfheal_undo_pending (call_frame_t *frame, 
xlator_t *this, inode_t *inode, */ continue; - xattr = afr_selfheal_output_xattr (this, type, output_dirty, - output_matrix, i); + xattr = afr_selfheal_output_xattr (this, local->need_full_crawl, + type, output_dirty, + output_matrix, i, + full_heal_mtx_out); if (!xattr) { continue; } - afr_selfheal_post_op (frame, this, inode, i, xattr); + if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) { + if (xdata && + dict_set_int8 (xdata, GF_XATTROP_PURGE_INDEX, 1)) + gf_msg (this->name, GF_LOG_WARNING, 0, + AFR_MSG_DICT_SET_FAILED, "Failed to set" + " dict value for %s", + GF_XATTROP_PURGE_INDEX); + } + afr_selfheal_post_op (frame, this, inode, i, xattr, xdata); dict_unref (xattr); } + if (xdata) + dict_unref (xdata); + return 0; } @@ -242,6 +281,9 @@ afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, void *pending_raw = NULL; int pending[3] = {0, }; + if (!dirty) + return 0; + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) return -1; @@ -267,6 +309,9 @@ afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, priv = this->private; + if (!matrix) + return 0; + for (i = 0; i < priv->child_count; i++) { if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) continue; @@ -1150,7 +1195,7 @@ afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, local->replies[i].op_errno == EAGAIN) { afr_locked_fill (frame, this, locked_on); afr_selfheal_unentrylk (frame, this, inode, dom, name, - locked_on); + locked_on, NULL); AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); @@ -1189,7 +1234,7 @@ afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this, if (lock_count > priv->child_count/2 && eagain_count) { afr_locked_fill (frame, this, locked_on); afr_selfheal_unentrylk (frame, this, inode, dom, name, - locked_on); + locked_on, NULL); AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); @@ -1203,7 +1248,8 @@ 
afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this, int afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, - char *dom, const char *name, unsigned char *locked_on) + char *dom, const char *name, unsigned char *locked_on, + dict_t *xdata) { loc_t loc = {0,}; @@ -1211,7 +1257,7 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, gf_uuid_copy (loc.gfid, inode->gfid); AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, - dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); + dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); loc_wipe (&loc); @@ -1316,7 +1362,12 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, if (replies[i].op_ret == -1) continue; - if (data_selfheal && afr_is_data_set (this, replies[i].xdata)) + /* The data segment of the changelog can be non-zero to indicate + * the directory needs a full heal. So the check below ensures + * it's not a directory before setting the data_selfheal boolean. 
+ */ + if (data_selfheal && !IA_ISDIR (replies[i].poststat.ia_type) && + afr_is_data_set (this, replies[i].xdata)) *data_selfheal = _gf_true; if (metadata_selfheal && @@ -1326,7 +1377,7 @@ afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, if (entry_selfheal && afr_is_entry_set (this, replies[i].xdata)) *entry_selfheal = _gf_true; - valid_cnt ++; + valid_cnt++; if (valid_cnt == 1) { first = replies[i].poststat; continue; @@ -1500,7 +1551,7 @@ afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, for (i = 0; i < priv->child_count; i++) { if (!sources[i]) continue; - afr_selfheal_post_op (frame, this, inode, i, xattr); + afr_selfheal_post_op (frame, this, inode, i, xattr, NULL); } out: if (changelog) diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index fccffa7dbac..00af8e9f2e6 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -14,6 +14,7 @@ #include "byte-order.h" #include "afr-transaction.h" #include "afr-messages.h" +#include "syncop-utils.h" /* Max file name length is 255 this filename is of length 256. No file with * this name can ever come, entry-lock with this name is going to prevent @@ -349,6 +350,82 @@ __afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, return ret; } +static gf_boolean_t +is_full_heal_marker_present (xlator_t *this, dict_t *xdata, int idx) +{ + int i = 0; + int pending[3] = {0,}; + void *pending_raw = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + if (!xdata) + return _gf_false; + + /* Iterate over each of the priv->pending_keys[] elements and then + * see if any of them have data segment non-zero. If they do, return + * true. Else return false. 
+ */ + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) + continue; + + if (!pending_raw) + continue; + + memcpy (pending, pending_raw, sizeof (pending)); + if (ntoh32 (pending[idx])) + return _gf_true; + } + + return _gf_false; +} + +static gf_boolean_t +afr_need_full_heal (xlator_t *this, struct afr_reply *replies, int source, + unsigned char *healed_sinks, afr_transaction_type type) +{ + int i = 0; + int idx = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + if (!priv->esh_granular) + return _gf_true; + + if (type != AFR_ENTRY_TRANSACTION) + return _gf_true; + + priv = this->private; + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + + /* If there is a clear source, check whether the full-heal-indicator + * is present in its xdata. Otherwise, we need to examine all the + * participating bricks and then figure if *even* one of them has a + * full-heal-indicator. + */ + + if (source != -1) { + if (is_full_heal_marker_present (this, replies[source].xdata, + idx)) + return _gf_true; + } + + /* else ..*/ + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + if (is_full_heal_marker_present (this, replies[i].xdata, idx)) + return _gf_true; + } + + return _gf_false; +} + static int __afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources, unsigned char *healed_sinks, @@ -431,7 +508,8 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, static int afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, - fd_t *fd, char *name) + fd_t *fd, char *name, inode_t *parent_idx_inode, + xlator_t *subvol, gf_boolean_t full_crawl) { int ret = 0; int source = -1; @@ -486,10 +564,15 @@ afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode, source, sources, healed_sinks, locked_on, replies); + + if ((ret == 0) && (priv->esh_granular) && (!full_crawl)) + 
ret = afr_shd_index_purge (subvol, parent_idx_inode, + name); } + unlock: afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, - locked_on); + locked_on, NULL); if (inode) inode_unref (inode); if (replies) @@ -513,12 +596,16 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, xlator_t *subvol = NULL; afr_private_t *priv = NULL; gf_boolean_t mismatch = _gf_false; + afr_local_t *iter_local = NULL; + afr_local_t *local = NULL; priv = this->private; subvol = priv->children[child]; INIT_LIST_HEAD (&entries.list); + local = frame->local; + iter_frame = afr_copy_frame (frame); if (!iter_frame) return -ENOMEM; @@ -539,7 +626,9 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, continue; ret = afr_selfheal_entry_dirent (iter_frame, this, fd, - entry->d_name); + entry->d_name, NULL, + NULL, + local->need_full_crawl); AFR_STACK_RESET (iter_frame); if (iter_frame->local == NULL) { ret = -ENOTCONN; @@ -567,36 +656,210 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, return ret; } +static inode_t * +afr_shd_entry_changes_index_inode (xlator_t *this, xlator_t *subvol, + uuid_t pargfid) +{ + int ret = -1; + void *index_gfid = NULL; + loc_t rootloc = {0,}; + loc_t loc = {0,}; + dict_t *xattr = NULL; + inode_t *inode = NULL; + struct iatt iatt = {0,}; + + rootloc.inode = inode_ref (this->itable->root); + gf_uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr (xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid); + if (ret) { + errno = EINVAL; + goto out; + } + + loc.inode = inode_new (this->itable); + if (!loc.inode) { + errno = ENOMEM; + goto out; + } + + gf_uuid_copy (loc.pargfid, index_gfid); + loc.name = gf_strdup (uuid_utoa (pargfid)); + + ret = syncop_lookup (subvol, &loc, &iatt, NULL, NULL, NULL); + if (ret < 0) { + errno = -ret; + goto 
out; + } + + inode = inode_link (loc.inode, NULL, NULL, &iatt); + +out: + if (xattr) + dict_unref (xattr); + loc_wipe (&rootloc); + GF_FREE ((char *)loc.name); + loc_wipe (&loc); + + return inode; +} + +static int +afr_selfheal_entry_granular_dirent (xlator_t *subvol, gf_dirent_t *entry, + loc_t *parent, void *data) +{ + int ret = 0; + loc_t loc = {0,}; + struct iatt iatt = {0,}; + afr_granular_esh_args_t *args = data; + + /* Look up the actual inode associated with entry. If the lookup returns + * ESTALE or ENOENT, then it means we have a stale index. Remove it. + * This is analogous to the check in afr_shd_index_heal() except that + * here it is achieved through LOOKUP and in afr_shd_index_heal() through + * a GETXATTR. + */ + + loc.inode = inode_new (args->xl->itable); + loc.parent = inode_ref (args->heal_fd->inode); + gf_uuid_copy (loc.pargfid, loc.parent->gfid); + loc.name = entry->d_name; + + ret = syncop_lookup (args->xl, &loc, &iatt, NULL, NULL, NULL); + if ((ret == -ENOENT) || (ret == -ESTALE)) { + afr_shd_index_purge (subvol, parent->inode, entry->d_name); + ret = 0; + goto out; + } + /* TBD: afr_shd_zero_xattrop? */ + + ret = afr_selfheal_entry_dirent (args->frame, args->xl, args->heal_fd, + entry->d_name, parent->inode, subvol, + _gf_false); + AFR_STACK_RESET (args->frame); + if (args->frame->local == NULL) + ret = -ENOTCONN; + + if (ret == -1) + args->mismatch = _gf_true; + +out: + loc_wipe (&loc); + return 0; +} + +static int +afr_selfheal_entry_granular (call_frame_t *frame, xlator_t *this, fd_t *fd, + int subvol_idx, gf_boolean_t is_src) +{ + int ret = 0; + loc_t loc = {0,}; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + afr_granular_esh_args_t args = {0,}; + + priv = this->private; + subvol = priv->children[subvol_idx]; + + args.frame = afr_copy_frame (frame); + args.xl = this; + /* args.heal_fd represents the fd associated with the original directory + * on which entry heal is being attempted. 
+ */ + args.heal_fd = fd; + + /* @subvol here represents the subvolume of AFR where + * indices/entry-changes/<pargfid> will be processed + */ + loc.inode = afr_shd_entry_changes_index_inode (this, subvol, + fd->inode->gfid); + if (!loc.inode) { + /* If granular heal failed on the sink (as it might sometimes + * because it is the src that would mostly contain the granular + * changelogs and the sink's entry-changes would be empty), + * do not treat heal as failure. + */ + if (is_src) + return -errno; + else + return 0; + } + + ret = syncop_dir_scan (subvol, &loc, GF_CLIENT_PID_SELF_HEALD, + &args, afr_selfheal_entry_granular_dirent); + + loc_wipe (&loc); + + if (args.mismatch == _gf_true) + ret = -1; + + return ret; +} + static int afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd, int source, unsigned char *sources, unsigned char *healed_sinks) { - int i = 0; - afr_private_t *priv = NULL; - gf_boolean_t mismatch = _gf_false; - int ret = 0; + int i = 0; + int ret = 0; + gf_boolean_t mismatch = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; priv = this->private; + local = frame->local; gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, "performing entry selfheal on %s", uuid_utoa (fd->inode->gfid)); for (i = 0; i < priv->child_count; i++) { + /* Expunge */ if (!healed_sinks[i]) continue; - ret = afr_selfheal_entry_do_subvol (frame, this, fd, i); + + if (!local->need_full_crawl) + /* Why call afr_selfheal_entry_granular() on a "healed sink", + * given that it is the source that contains the granular + * indices? + * If the index for this directory is non-existent or empty on + * this subvol (=> clear sink), then it will return early + * without failure status. + * If the index is non-empty and it is yet a 'healed sink', then + * it is due to a split-brain in which case we anyway need to + * crawl the indices/entry-changes/pargfid directory. 
+ */ + ret = afr_selfheal_entry_granular (frame, this, fd, i, + _gf_false); + else + ret = afr_selfheal_entry_do_subvol (frame, this, fd, i); + if (ret == -1) { /* gfid or type mismatch. */ mismatch = _gf_true; ret = 0; } - if (ret) - break; + if (ret) + break; } - if (!ret && source != -1) - ret = afr_selfheal_entry_do_subvol (frame, this, fd, source); + + if (!ret && source != -1) { + /* Impunge */ + if (local->need_full_crawl) + ret = afr_selfheal_entry_do_subvol (frame, this, fd, + source); + else + ret = afr_selfheal_entry_granular (frame, this, fd, + source, _gf_true); + } if (mismatch == _gf_true) /* undo pending will be skipped */ @@ -616,10 +879,12 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, unsigned char *postop_lock = NULL; unsigned char *healed_sinks = NULL; struct afr_reply *locked_replies = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; gf_boolean_t did_sh = _gf_true; priv = this->private; + local = frame->local; sources = alloca0 (priv->child_count); sinks = alloca0 (priv->child_count); @@ -651,10 +916,16 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, did_sh = _gf_false; goto unlock; } + + local->need_full_crawl = afr_need_full_heal (this, + locked_replies, + source, + healed_sinks, + AFR_ENTRY_TRANSACTION); } unlock: afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, - data_lock); + data_lock, NULL); if (ret < 0) goto out; @@ -695,7 +966,7 @@ unlock: } postop_unlock: afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, - postop_lock); + postop_lock, NULL); out: if (did_sh) afr_log_selfheal (fd->inode->gfid, this, ret, "entry", source, @@ -796,10 +1067,12 @@ afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode) } if (!granular_locks) afr_selfheal_unentrylk (frame, this, inode, this->name, - LONG_FILENAME, long_name_locked); + LONG_FILENAME, long_name_locked, + NULL); } unlock: - afr_selfheal_unentrylk (frame, this, inode, 
priv->sh_domain, NULL, locked_on); + afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, + locked_on, NULL); if (fd) fd_unref (fd); diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index 9f7a5b1ff0f..3445ecccf9c 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -618,7 +618,7 @@ afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, } unlock: afr_selfheal_unentrylk (frame, this, parent, this->name, bname, - locked_on); + locked_on, NULL); if (inode) inode_unref (inode); diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index afc086c0560..becbe67e084 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -137,7 +137,8 @@ afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this, int afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, - char *dom, const char *name, unsigned char *locked_on); + char *dom, const char *name, unsigned char *locked_on, + dict_t *xdata); int afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, @@ -177,7 +178,7 @@ afr_selfheal_recreate_entry (xlator_t *this, int dst, int source, inode_t *dir, int afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, - int subvol, dict_t *xattr); + int subvol, dict_t *xattr, dict_t *xdata); call_frame_t * afr_frame_create (xlator_t *this); diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 1ae4f18e764..2ec9d9ce686 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -265,7 +265,7 @@ afr_shd_zero_xattrop (xlator_t *this, uuid_t gfid) /*Send xattrop to all bricks. 
Doing a lookup to see if bricks are up or * has valid repies for this gfid seems a bit of an overkill.*/ for (i = 0; i < priv->child_count; i++) - afr_selfheal_post_op (frame, this, inode, i, xattr); + afr_selfheal_post_op (frame, this, inode, i, xattr, NULL); out: if (frame) diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 0930a081965..d01a806fe86 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -186,6 +186,8 @@ reconfigure (xlator_t *this, dict_t *options) out); GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str, out); + GF_OPTION_RECONF ("granular-entry-heal", priv->esh_granular, options, + bool, out); GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); @@ -379,6 +381,7 @@ init (xlator_t *this) GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out); + GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out); GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); GF_OPTION_INIT ("quorum-type", qtype, str, out); @@ -897,5 +900,12 @@ struct volume_options options[] = { "stop being compatible with afr-v1, which helps afr " "be more granular while self-healing", }, + { .key = {"granular-entry-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .description = "If this option is enabled, self-heal will resort to " + "granular way of recording changelogs and doing entry " + "self-heal.", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 0a872a98284..f16f9b4b4ac 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -139,6 +139,7 @@ typedef struct _afr_private { void *pump_private; gf_boolean_t use_afr_in_pump; char *locking_scheme; + gf_boolean_t esh_granular; } afr_private_t; @@ -755,6 +756,8 @@ typedef struct 
_afr_local { /* For client side background heals. */ struct list_head healer; call_frame_t *heal_frame; + + gf_boolean_t need_full_crawl; } afr_local_t; @@ -789,6 +792,14 @@ typedef struct afr_read_subvol_args { uuid_t gfid; } afr_read_subvol_args_t; +typedef struct afr_granular_esh_args { + fd_t *heal_fd; + xlator_t *xl; + call_frame_t *frame; + gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid + mismatch */ +} afr_granular_esh_args_t; + /* did a call fail due to a child failing? */ #define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ ((op_errno == ENOTCONN) || \ |