Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-entry.c')
 xlators/cluster/afr/src/afr-self-heal-entry.c | 305 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 289 insertions(+), 16 deletions(-)
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index fccffa7dbac..00af8e9f2e6 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -14,6 +14,7 @@
#include "byte-order.h"
#include "afr-transaction.h"
#include "afr-messages.h"
+#include "syncop-utils.h"
/* Max file name length is 255; this filename is of length 256. No file with
 * this name can ever exist; an entry-lock with this name is going to prevent
@@ -349,6 +350,82 @@ __afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
return ret;
}
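+/* Check every brick's AFR pending xattr present in @xdata and return
+ * true if the counter at position @idx (the index corresponding to the
+ * transaction type) is non-zero on any of them. The xattr value is an
+ * array of network-byte-order counters, hence the ntoh32().
+ */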
+static gf_boolean_t
+is_full_heal_marker_present (xlator_t *this, dict_t *xdata, int idx)
+{
+ int i = 0;
+ int pending[3] = {0,};
+ void *pending_raw = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (!xdata)
+ return _gf_false;
+
+ /* Iterate over each of the priv->pending_key[] elements and check
+ * whether any of them has a non-zero value at index @idx. If one
+ * does, return true; else return false.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw))
+ continue;
+
+ if (!pending_raw)
+ continue;
+
+ memcpy (pending, pending_raw, sizeof (pending));
+ if (ntoh32 (pending[idx]))
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
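+/* Decide whether entry self-heal must fall back to a full readdir()
+ * crawl. A granular (index-based) heal is attempted only when
+ * granular-entry-heal is enabled, the transaction is an entry
+ * transaction, and no participating brick carries the full-heal
+ * indicator in the data segment of its pending xattr.
+ */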
+static gf_boolean_t
+afr_need_full_heal (xlator_t *this, struct afr_reply *replies, int source,
+ unsigned char *healed_sinks, afr_transaction_type type)
+{
+ int i = 0;
+ int idx = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (!priv->esh_granular)
+ return _gf_true;
+
+ if (type != AFR_ENTRY_TRANSACTION)
+ return _gf_true;
+
+ idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
+
+ /* If there is a clear source, check whether the full-heal indicator
+ * is present in its xdata. Otherwise, examine all the participating
+ * bricks and check whether even one of them carries the full-heal
+ * indicator.
+ */
+
+ if (source != -1) {
+ if (is_full_heal_marker_present (this, replies[source].xdata,
+ idx))
+ return _gf_true;
+ }
+
+ /* Also check each of the healed sinks. */
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+
+ if (is_full_heal_marker_present (this, replies[i].xdata, idx))
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
static int
__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
unsigned char *healed_sinks,
@@ -431,7 +508,8 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this,
static int
afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this,
- fd_t *fd, char *name)
+ fd_t *fd, char *name, inode_t *parent_idx_inode,
+ xlator_t *subvol, gf_boolean_t full_crawl)
{
int ret = 0;
int source = -1;
@@ -486,10 +564,15 @@ afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this,
ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode,
source, sources, healed_sinks,
locked_on, replies);
+
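+ /* In granular mode the name being healed was picked off the
+ * entry-changes index; once it is healed, purge its index entry
+ * so it is not processed again on subsequent crawls.
+ */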
+ if ((ret == 0) && (priv->esh_granular) && (!full_crawl))
+ ret = afr_shd_index_purge (subvol, parent_idx_inode,
+ name);
}
+
unlock:
afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- locked_on);
+ locked_on, NULL);
if (inode)
inode_unref (inode);
if (replies)
@@ -513,12 +596,16 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
afr_private_t *priv = NULL;
gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
priv = this->private;
subvol = priv->children[child];
INIT_LIST_HEAD (&entries.list);
+ local = frame->local;
+
iter_frame = afr_copy_frame (frame);
if (!iter_frame)
return -ENOMEM;
@@ -539,7 +626,9 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
continue;
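+ /* Full-crawl path: the name came from readdir() and not from
+ * an index, so there is no index inode or index subvol to
+ * purge from; hence the NULL arguments below.
+ */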
ret = afr_selfheal_entry_dirent (iter_frame, this, fd,
- entry->d_name);
+ entry->d_name, NULL,
+ NULL,
+ local->need_full_crawl);
AFR_STACK_RESET (iter_frame);
if (iter_frame->local == NULL) {
ret = -ENOTCONN;
@@ -567,36 +656,210 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
return ret;
}
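+/* Resolve the inode of the entry-changes index directory for @pargfid
+ * on @subvol: fetch the virtual GF_XATTROP_ENTRY_CHANGES_GFID xattr off
+ * the root to learn the gfid of the base index directory, then look up
+ * the directory named after @pargfid under it.
+ */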
+static inode_t *
+afr_shd_entry_changes_index_inode (xlator_t *this, xlator_t *subvol,
+ uuid_t pargfid)
+{
+ int ret = -1;
+ void *index_gfid = NULL;
+ loc_t rootloc = {0,};
+ loc_t loc = {0,};
+ dict_t *xattr = NULL;
+ inode_t *inode = NULL;
+ struct iatt iatt = {0,};
+
+ rootloc.inode = inode_ref (this->itable->root);
+ gf_uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr (subvol, &rootloc, &xattr,
+ GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
+
+ ret = dict_get_ptr (xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid);
+ if (ret) {
+ errno = EINVAL;
+ goto out;
+ }
+
+ loc.inode = inode_new (this->itable);
+ if (!loc.inode) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ gf_uuid_copy (loc.pargfid, index_gfid);
+ loc.name = gf_strdup (uuid_utoa (pargfid));
+
+ ret = syncop_lookup (subvol, &loc, &iatt, NULL, NULL, NULL);
+ if (ret < 0) {
+ errno = -ret;
+ goto out;
+ }
+
+ inode = inode_link (loc.inode, NULL, NULL, &iatt);
+
+out:
+ if (xattr)
+ dict_unref (xattr);
+ loc_wipe (&rootloc);
+ GF_FREE ((char *)loc.name);
+ loc_wipe (&loc);
+
+ return inode;
+}
+
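+/* Per-entry callback driven by syncop_dir_scan() over the entry-changes
+ * index: validate each indexed name with a lookup, purge names that
+ * turn out to be stale, and run entry heal on the rest.
+ */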
+static int
+afr_selfheal_entry_granular_dirent (xlator_t *subvol, gf_dirent_t *entry,
+ loc_t *parent, void *data)
+{
+ int ret = 0;
+ loc_t loc = {0,};
+ struct iatt iatt = {0,};
+ afr_granular_esh_args_t *args = data;
+
+ /* Look up the actual inode associated with entry. If the lookup returns
+ * ESTALE or ENOENT, then it means we have a stale index. Remove it.
+ * This is analogous to the check in afr_shd_index_heal() except that
+ * here it is achieved through LOOKUP and in afr_shd_index_heal() through
+ * a GETXATTR.
+ */
+
+ loc.inode = inode_new (args->xl->itable);
+ loc.parent = inode_ref (args->heal_fd->inode);
+ gf_uuid_copy (loc.pargfid, loc.parent->gfid);
+ loc.name = entry->d_name;
+
+ ret = syncop_lookup (args->xl, &loc, &iatt, NULL, NULL, NULL);
+ if ((ret == -ENOENT) || (ret == -ESTALE)) {
+ afr_shd_index_purge (subvol, parent->inode, entry->d_name);
+ ret = 0;
+ goto out;
+ }
+ /* TBD: afr_shd_zero_xattrop? */
+
+ ret = afr_selfheal_entry_dirent (args->frame, args->xl, args->heal_fd,
+ entry->d_name, parent->inode, subvol,
+ _gf_false);
+ AFR_STACK_RESET (args->frame);
+ if (args->frame->local == NULL)
+ ret = -ENOTCONN;
+
+ if (ret == -1)
+ args->mismatch = _gf_true;
+
+out:
+ loc_wipe (&loc);
+ return 0;
+}
+
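+/* Granular entry self-heal: instead of a full crawl of the directory on
+ * the subvolume at @subvol_idx, crawl only the entries recorded under
+ * indices/entry-changes/<pargfid> and heal those names.
+ */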
+static int
+afr_selfheal_entry_granular (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int subvol_idx, gf_boolean_t is_src)
+{
+ int ret = 0;
+ loc_t loc = {0,};
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ afr_granular_esh_args_t args = {0,};
+
+ priv = this->private;
+ subvol = priv->children[subvol_idx];
+
+ args.frame = afr_copy_frame (frame);
+ if (!args.frame)
+ return -ENOMEM;
+ args.xl = this;
+ /* args.heal_fd represents the fd associated with the original directory
+ * on which entry heal is being attempted.
+ */
+ args.heal_fd = fd;
+
+ /* @subvol here represents the subvolume of AFR where
+ * indices/entry-changes/<pargfid> will be processed
+ */
+ loc.inode = afr_shd_entry_changes_index_inode (this, subvol,
+ fd->inode->gfid);
+ if (!loc.inode) {
+ /* Failure to resolve the index is expected on a sink: it is
+ * the source that holds the granular changelogs, while the
+ * sink's entry-changes index would be empty. Do not treat
+ * this as a heal failure in that case.
+ */
+ ret = is_src ? -errno : 0;
+ AFR_STACK_DESTROY (args.frame);
+ return ret;
+ }
+
+ ret = syncop_dir_scan (subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
+ &args, afr_selfheal_entry_granular_dirent);
+
+ loc_wipe (&loc);
+
+ if (args.mismatch == _gf_true)
+ ret = -1;
+
+ AFR_STACK_DESTROY (args.frame);
+
+ return ret;
+}
+
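+/* Entry self-heal driver: first crawl each healed sink to expunge
+ * entries that must not exist there, then crawl the source to impunge
+ * (recreate) the entries the sinks are missing. Each crawl is either
+ * granular (index-based) or full, depending on need_full_crawl.
+ */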
static int
afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
int source, unsigned char *sources,
unsigned char *healed_sinks)
{
- int i = 0;
- afr_private_t *priv = NULL;
- gf_boolean_t mismatch = _gf_false;
- int ret = 0;
+ int i = 0;
+ int ret = 0;
+ gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
priv = this->private;
+ local = frame->local;
gf_msg (this->name, GF_LOG_INFO, 0,
AFR_MSG_SELF_HEAL_INFO, "performing entry selfheal on %s",
uuid_utoa (fd->inode->gfid));
for (i = 0; i < priv->child_count; i++) {
+ /* Expunge */
if (!healed_sinks[i])
continue;
- ret = afr_selfheal_entry_do_subvol (frame, this, fd, i);
+
+ if (!local->need_full_crawl)
+ /* Why call afr_selfheal_entry_granular() on a "healed sink",
+ * given that it is the source that contains the granular
+ * indices?
+ * If the index for this directory is non-existent or empty on
+ * this subvol (i.e., a clear sink), the call returns early
+ * without a failure status.
+ * If the index is non-empty and yet this is a 'healed sink',
+ * that indicates a split-brain, in which case the
+ * indices/entry-changes/pargfid directory needs to be crawled
+ * anyway.
+ */
+ ret = afr_selfheal_entry_granular (frame, this, fd, i,
+ _gf_false);
+ else
+ ret = afr_selfheal_entry_do_subvol (frame, this, fd, i);
+
if (ret == -1) {
/* gfid or type mismatch. */
mismatch = _gf_true;
ret = 0;
}
- if (ret)
- break;
+ if (ret)
+ break;
}
- if (!ret && source != -1)
- ret = afr_selfheal_entry_do_subvol (frame, this, fd, source);
+
+ if (!ret && source != -1) {
+ /* Impunge */
+ if (local->need_full_crawl)
+ ret = afr_selfheal_entry_do_subvol (frame, this, fd,
+ source);
+ else
+ ret = afr_selfheal_entry_granular (frame, this, fd,
+ source, _gf_true);
+ }
if (mismatch == _gf_true)
/* undo pending will be skipped */
@@ -616,10 +879,12 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
unsigned char *postop_lock = NULL;
unsigned char *healed_sinks = NULL;
struct afr_reply *locked_replies = NULL;
+ afr_local_t *local = NULL;
afr_private_t *priv = NULL;
gf_boolean_t did_sh = _gf_true;
priv = this->private;
+ local = frame->local;
sources = alloca0 (priv->child_count);
sinks = alloca0 (priv->child_count);
@@ -651,10 +916,16 @@ __afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
did_sh = _gf_false;
goto unlock;
}
+
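+ /* While the entry locks are still held, record whether this
+ * heal can be served off the granular entry-changes index or
+ * needs a full crawl; afr_selfheal_entry_do() acts on this
+ * flag.
+ */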
+ local->need_full_crawl = afr_need_full_heal (this,
+ locked_replies,
+ source,
+ healed_sinks,
+ AFR_ENTRY_TRANSACTION);
}
unlock:
afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- data_lock);
+ data_lock, NULL);
if (ret < 0)
goto out;
@@ -695,7 +966,7 @@ unlock:
}
postop_unlock:
afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- postop_lock);
+ postop_lock, NULL);
out:
if (did_sh)
afr_log_selfheal (fd->inode->gfid, this, ret, "entry", source,
@@ -796,10 +1067,12 @@ afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode)
}
if (!granular_locks)
afr_selfheal_unentrylk (frame, this, inode, this->name,
- LONG_FILENAME, long_name_locked);
+ LONG_FILENAME, long_name_locked,
+ NULL);
}
unlock:
- afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on);
+ afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL,
+ locked_on, NULL);
if (fd)
fd_unref (fd);