summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Pranith K <pranithk@gluster.com> 2011-06-17 07:20:08 +0000
committer: Anand Avati <avati@gluster.com> 2011-07-12 05:37:45 -0700
commit: 1a82b4539b69390dfb1a158c420385c7ad5d999f (patch)
tree: fc1145204199e48ff7f1eb3a52be700eabb4d8bf
parent: 9866f23b9b0ceb8be876600be9832987b8646540 (diff)
cluster/afr: Handle lookups when self-heal is off
Signed-off-by: Pranith Kumar K <pranithk@gluster.com>
Signed-off-by: Anand Avati <avati@gluster.com>
BUG: 2586 (read child is set without checking the xattr)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2586
-rw-r--r-- xlators/cluster/afr/src/afr-common.c 777
-rw-r--r-- xlators/cluster/afr/src/afr-dir-read.c 14
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-common.c 441
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-common.h 21
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-data.c 186
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-entry.c 20
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal-metadata.c 21
-rw-r--r-- xlators/cluster/afr/src/afr-self-heal.h 13
-rw-r--r-- xlators/cluster/afr/src/afr.h 16
9 files changed, 996 insertions, 513 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 3ddfa6dff..e5046cb69 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -329,6 +329,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
if (sh->linkname)
GF_FREE ((char *)sh->linkname);
+ if (sh->child_success)
+ GF_FREE (sh->child_success);
loc_wipe (&sh->parent_loc);
}
@@ -418,6 +420,18 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
if (local->cont.lookup.inode) {
inode_unref (local->cont.lookup.inode);
}
+
+ if (local->cont.lookup.postparents)
+ GF_FREE (local->cont.lookup.postparents);
+
+ if (local->cont.lookup.bufs)
+ GF_FREE (local->cont.lookup.bufs);
+
+ if (local->cont.lookup.child_success)
+ GF_FREE (local->cont.lookup.child_success);
+
+ if (local->cont.lookup.sources)
+ GF_FREE (local->cont.lookup.sources);
}
{ /* getxattr */
@@ -510,6 +524,22 @@ afr_up_children_count (int child_count, unsigned char *child_up)
return ret;
}
+gf_boolean_t
+afr_is_fresh_lookup (loc_t *loc, xlator_t *this)
+{
+ uint64_t ctx = 0;
+ int32_t ret = 0;
+
+ GF_ASSERT (loc);
+ GF_ASSERT (this);
+ GF_ASSERT (loc->inode);
+
+ ret = inode_ctx_get (loc->inode, this, &ctx);
+ if (0 == ret)
+ return _gf_false;
+ return _gf_true;
+}
+
void
afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
{
@@ -534,68 +564,96 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this)
}
AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
+ local->cont.lookup.inode, &local->cont.lookup.buf,
local->cont.lookup.xattr,
&local->cont.lookup.postparent);
return 0;
}
+void
+afr_lookup_build_response_params (afr_local_t *local, xlator_t *this)
+{
+ int32_t read_child = -1;
+ struct iatt *buf = NULL;
+ struct iatt *postparent = NULL;
+ dict_t **xattr = NULL;
-static void
-afr_lookup_collect_xattr (afr_local_t *local, xlator_t *this,
- int child_index, dict_t *xattr)
+ GF_ASSERT (local);
+ GF_ASSERT (local->cont.lookup.read_child >= 0);
+
+ buf = &local->cont.lookup.buf;
+ postparent = &local->cont.lookup.postparent;
+ xattr = &local->cont.lookup.xattr;
+
+ read_child = local->cont.lookup.read_child;
+ *xattr = dict_ref (local->cont.lookup.xattrs[read_child]);
+ *buf = local->cont.lookup.bufs[read_child];
+ *postparent = local->cont.lookup.postparents[read_child];
+
+ if (IA_INVAL == local->cont.lookup.inode->ia_type) {
+ /* fix for RT #602 */
+ local->cont.lookup.inode->ia_type = buf->ia_type;
+ }
+}
+
+
+ static void
+afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this,
+ int child_index, dict_t *xattr)
{
uint32_t inodelk_count = 0;
uint32_t entrylk_count = 0;
- int ret = 0;
+ int ret = -1;
+
+ GF_ASSERT (local);
+ GF_ASSERT (this);
+ GF_ASSERT (xattr);
+ GF_ASSERT (child_index >= 0);
+
+ ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT,
+ &inodelk_count);
+ if (ret == 0)
+ local->inodelk_count += inodelk_count;
- if (afr_sh_has_metadata_pending (xattr, child_index, this)) {
+ ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT,
+ &entrylk_count);
+ if (ret == 0)
+ local->entrylk_count += entrylk_count;
+}
+
+static void
+afr_lookup_detect_self_heal_by_xattr (afr_local_t *local, xlator_t *this,
+ dict_t *xattr)
+{
+ GF_ASSERT (local);
+ GF_ASSERT (this);
+ GF_ASSERT (xattr);
+
+ if (afr_sh_has_metadata_pending (xattr, this)) {
local->self_heal.need_metadata_self_heal = _gf_true;
gf_log(this->name, GF_LOG_DEBUG,
"metadata self-heal is pending for %s.",
local->loc.path);
}
- if (afr_sh_has_entry_pending (xattr, child_index, this)) {
+ if (afr_sh_has_entry_pending (xattr, this)) {
local->self_heal.need_entry_self_heal = _gf_true;
gf_log(this->name, GF_LOG_DEBUG,
"entry self-heal is pending for %s.", local->loc.path);
}
- if (afr_sh_has_data_pending (xattr, child_index, this)) {
+ if (afr_sh_has_data_pending (xattr, this)) {
local->self_heal.need_data_self_heal = _gf_true;
gf_log(this->name, GF_LOG_DEBUG,
"data self-heal is pending for %s.", local->loc.path);
}
-
- ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT,
- &inodelk_count);
- if (ret == 0)
- local->inodelk_count += inodelk_count;
-
- ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT,
- &entrylk_count);
- if (ret == 0)
- local->entrylk_count += entrylk_count;
}
-
static void
-afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local,
+afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this,
struct iatt *buf, struct iatt *lookup_buf)
{
- if (FILETYPE_DIFFERS (buf, lookup_buf)) {
- /* mismatching filetypes with same name
- */
-
- gf_log (this->name, GF_LOG_INFO,
- "filetype differs for %s ", local->loc.path);
-
- local->govinda_gOvinda = 1;
- }
-
if (PERMISSION_DIFFERS (buf, lookup_buf)) {
/* mismatching permissions */
gf_log (this->name, GF_LOG_INFO,
@@ -624,105 +682,298 @@ afr_lookup_self_heal_check (xlator_t *this, afr_local_t *local,
}
}
-
static void
-afr_lookup_done (call_frame_t *frame, xlator_t *this, struct iatt *lookup_buf)
+afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this)
{
- int unwind = 1;
- int source = -1;
- int up_count = 0;
- char sh_type_str[256] = {0,};
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- local = frame->local;
-
- up_count = afr_up_children_count (priv->child_count, priv->child_up);
- if (up_count == 1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Only 1 child up - do not attempt to detect self heal");
-
- goto unwind;
- }
+ GF_ASSERT (local);
+ GF_ASSERT (this);
- if (local->success_count && local->enoent_count) {
+ if ((local->success_count > 0) && (local->enoent_count > 0)) {
local->self_heal.need_metadata_self_heal = _gf_true;
local->self_heal.need_data_self_heal = _gf_true;
local->self_heal.need_entry_self_heal = _gf_true;
gf_log(this->name, GF_LOG_INFO,
"entries are missing in lookup of %s.",
local->loc.path);
+ //If all self-heals are needed no need to check for other rules
+ goto out;
}
- if (local->success_count) {
- /* check for split-brain case in previous lookup */
- if (afr_is_split_brain (this, local->cont.lookup.inode)) {
+ if (local->success_count > 0) {
+ if (afr_is_split_brain (this, local->cont.lookup.inode) &&
+ IA_ISREG (local->cont.lookup.inode->ia_type)) {
local->self_heal.need_data_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_WARNING,
- "split brain detected during lookup of %s.",
- local->loc.path);
+ gf_log (this->name, GF_LOG_WARNING,
+ "split brain detected during lookup of %s.",
+ local->loc.path);
}
}
- if ((local->self_heal.need_metadata_self_heal
- || local->self_heal.need_data_self_heal
- || local->self_heal.need_entry_self_heal)
- && ((!local->cont.lookup.is_revalidate)
- || (local->op_ret != -1))) {
+out:
+ return;
+}
- if (local->inodelk_count || local->entrylk_count) {
+gf_boolean_t
+afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv)
+{
+ GF_ASSERT (sh);
+ GF_ASSERT (priv);
- /* Someone else is doing self-heal on this file.
- So just make a best effort to set the read-subvolume
- and return */
+ return ((priv->data_self_heal && sh->need_data_self_heal)
+ || (priv->metadata_self_heal && sh->need_metadata_self_heal)
+ || (priv->entry_self_heal && sh->need_entry_self_heal));
+}
- if (IA_ISREG (local->cont.lookup.inode->ia_type)) {
- source = afr_self_heal_get_source (this, local, local->cont.lookup.xattrs);
+gf_boolean_t
+afr_is_self_heal_enabled (afr_private_t *priv)
+{
+ GF_ASSERT (priv);
- if (source >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- source);
- }
- }
- goto unwind;
+ return (priv->data_self_heal || priv->metadata_self_heal
+ || priv->entry_self_heal);
+}
+
+int
+afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,
+ int32_t *read_child)
+{
+ int32_t source = -1;
+ ia_type_t ia_type = 0;
+ int ret = -1;
+ afr_transaction_type type = AFR_METADATA_TRANSACTION;
+ dict_t **xattrs = NULL;
+ int32_t *child_success = NULL;
+ struct iatt *bufs = NULL;
+
+ GF_ASSERT (local);
+ GF_ASSERT (this);
+
+ bufs = local->cont.lookup.bufs;
+ child_success = local->cont.lookup.child_success;
+ ia_type = local->cont.lookup.bufs[child_success[0]].ia_type;
+ if (IA_ISDIR (ia_type)) {
+ type = AFR_ENTRY_TRANSACTION;
+ } else if (IA_ISREG (ia_type)) {
+ type = AFR_DATA_TRANSACTION;
+ }
+ xattrs = local->cont.lookup.xattrs;
+ source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs,
+ type);
+ if (source < 0)
+ goto out;
+
+ *read_child = source;
+ ret = 0;
+out:
+ return ret;
+}
+
+static inline gf_boolean_t
+afr_is_self_heal_running (afr_local_t *local)
+{
+ GF_ASSERT (local);
+ return ((local->inodelk_count > 0) || (local->entrylk_count > 0));
+}
+
+static void
+afr_launch_self_heal (call_frame_t *frame, xlator_t *this,
+ gf_boolean_t is_background, ia_type_t ia_type,
+ int (*unwind) (call_frame_t *frame, xlator_t *this))
+{
+ afr_local_t *local = NULL;
+ char sh_type_str[256] = {0,};
+
+ GF_ASSERT (frame);
+ GF_ASSERT (this);
+
+ local = frame->local;
+ local->self_heal.background = is_background;
+ local->self_heal.type = ia_type;
+ local->self_heal.unwind = unwind;
+
+ afr_self_heal_type_str_get (&local->self_heal,
+ sh_type_str,
+ sizeof (sh_type_str));
+
+ gf_log (this->name, GF_LOG_INFO,
+ "background %s self-heal triggered. path: %s",
+ sh_type_str, local->loc.path);
+
+ afr_self_heal (frame, this);
+}
+
+static void
+afr_lookup_detect_self_heal (afr_local_t *local, xlator_t *this)
+{
+ int i = 0;
+ struct iatt *bufs = NULL;
+ dict_t **xattr = NULL;
+ afr_private_t *priv = NULL;
+ int32_t child1 = -1;
+ int32_t child2 = -1;
+
+ afr_detect_self_heal_by_lookup_status (local, this);
+
+ bufs = local->cont.lookup.bufs;
+ for (i = 1; i < local->success_count; i++) {
+ child1 = local->cont.lookup.child_success[i-1];
+ child2 = local->cont.lookup.child_success[i];;
+ afr_detect_self_heal_by_iatt (local, this,
+ &bufs[child1], &bufs[child2]);
+ }
+
+ xattr = local->cont.lookup.xattrs;
+ priv = this->private;
+ for (i = 0; i < local->success_count; i++) {
+ child1 = local->cont.lookup.child_success[i];;
+ afr_lookup_detect_self_heal_by_xattr (local, this,
+ xattr[child1]);
+ }
+}
+
+static void
+afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this,
+ gf_boolean_t *sh_launched)
+{
+ size_t up_count = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ GF_ASSERT (sh_launched);
+ *sh_launched = _gf_false;
+ priv = this->private;
+ local = frame->local;
+
+ up_count = afr_up_children_count (priv->child_count, local->child_up);
+ if (up_count == 1) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Only 1 child up - do not attempt to detect self heal");
+ goto out;
+ }
+
+ if (_gf_false == afr_is_self_heal_enabled (priv)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Self heal is not enabled");
+ goto out;
+ }
+
+ afr_lookup_detect_self_heal (local, this);
+ if (afr_can_self_heal_proceed (&local->self_heal, priv)) {
+ if (afr_is_self_heal_running (local)) {
+ goto out;
}
- if (!local->cont.lookup.inode->ia_type) {
- /* fix for RT #602 */
- local->cont.lookup.inode->ia_type =
- lookup_buf->ia_type;
+ afr_launch_self_heal (frame, this, _gf_true,
+ local->cont.lookup.buf.ia_type,
+ afr_self_heal_lookup_unwind);
+ *sh_launched = _gf_true;
+ }
+out:
+ return;
+}
+
+static gf_boolean_t
+afr_lookup_split_brain (afr_local_t *local, xlator_t *this)
+{
+ int i = 0;
+ gf_boolean_t symptom = _gf_false;
+ struct iatt *bufs = NULL;
+ int32_t *child_success = NULL;
+ struct iatt *child1 = NULL;
+ struct iatt *child2 = NULL;
+ const char *path = NULL;
+
+ bufs = local->cont.lookup.bufs;
+ child_success = local->cont.lookup.child_success;
+ for (i = 1; i < local->success_count; i++) {
+ child1 = &bufs[child_success[i-1]];
+ child2 = &bufs[child_success[i]];
+ /*
+ * TODO: gfid self-heal
+ * if (uuid_compare (child1->ia_gfid, child2->ia_gfid)) {
+ * gf_log (this->name, GF_LOG_WARNING, "%s: gfid differs"
+ * " on subvolumes (%d, %d)", local->loc.path,
+ * child_success[i-1], child_success[i]);
+ * symptom = _gf_true;
+ * }
+ */
+
+ if (FILETYPE_DIFFERS (child1, child2)) {
+ path = local->loc.path;
+ gf_log (this->name, GF_LOG_WARNING, "%s: filetype "
+ "differs on subvolumes (%d, %d)", path,
+ child_success[i-1], child_success[i]);
+ symptom = _gf_true;
+ local->govinda_gOvinda = 1;
}
+ if (symptom)
+ break;
+ }
+ return symptom;
+}
- local->self_heal.background = _gf_true;
- local->self_heal.type = local->cont.lookup.buf.ia_type;
- local->self_heal.unwind = afr_self_heal_lookup_unwind;
+static int
+afr_lookup_set_read_child (afr_local_t *local, xlator_t *this, int32_t read_child)
+{
+ GF_ASSERT (read_child >= 0);
- unwind = 0;
+ afr_set_read_child (this, local->cont.lookup.inode, read_child);
+ local->cont.lookup.read_child = read_child;
- afr_self_heal_type_str_get(&local->self_heal,
- sh_type_str,
- sizeof(sh_type_str));
+ return 0;
+}
- gf_log (this->name, GF_LOG_INFO,
- "background %s self-heal triggered. path: %s",
- sh_type_str, local->loc.path);
+static void
+afr_lookup_done (call_frame_t *frame, xlator_t *this)
+{
+ int unwind = 1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ gf_boolean_t sh_launched = _gf_false;
+ int32_t read_child = -1;
+
+ priv = this->private;
+ local = frame->local;
- afr_self_heal (frame, this);
+ if (local->op_ret < 0)
+ goto unwind;
+
+ if (_gf_true == afr_lookup_split_brain (local, this)) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto unwind;
}
-unwind:
- if (unwind) {
- AFR_STACK_UNWIND (lookup, frame, local->op_ret,
- local->op_errno,
- local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
+ ret = afr_lookup_select_read_child (local, this, &read_child);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto unwind;
}
-}
+ ret = afr_lookup_set_read_child (local, this, read_child);
+ if (ret)
+ goto unwind;
+
+ afr_lookup_build_response_params (local, this);
+ if (afr_is_fresh_lookup (&local->loc, this)) {
+ afr_update_loc_gfids (&local->loc, &local->cont.lookup.buf,
+ &local->cont.lookup.postparent);
+ }
+
+ afr_lookup_perform_self_heal_if_needed (frame, this, &sh_launched);
+ if (sh_launched)
+ unwind = 0;
+ unwind:
+ if (unwind) {
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret,
+ local->op_errno, local->cont.lookup.inode,
+ &local->cont.lookup.buf,
+ local->cont.lookup.xattr,
+ &local->cont.lookup.postparent);
+ }
+}
/*
* During a lookup, some errors are more "important" than
@@ -749,236 +1000,170 @@ __error_more_important (int32_t old_errno, int32_t new_errno)
return ret;
}
-
-int
-afr_fresh_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+static void
+afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- struct iatt * lookup_buf = NULL;
- int call_count = -1;
- int child_index = -1;
- int first_up_child = -1;
-
- child_index = (long) cookie;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- local = frame->local;
-
- lookup_buf = &local->cont.lookup.buf;
-
- if (op_ret == -1) {
- if (op_errno == ENOENT)
- local->enoent_count++;
-
- if (__error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
-
- if (local->op_errno == ESTALE) {
- local->op_ret = -1;
- }
-
- goto unlock;
- }
-
- afr_lookup_collect_xattr (local, this, child_index, xattr);
+ GF_ASSERT (local);
+ if (op_errno == ENOENT)
+ local->enoent_count++;
- first_up_child = afr_first_up_child (priv);
-
- if (local->success_count == 0) {
- if (local->op_errno != ESTALE)
- local->op_ret = op_ret;
-
- local->cont.lookup.inode = inode_ref (inode);
- local->cont.lookup.xattr = dict_ref (xattr);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparent = *postparent;
-
- if (priv->first_lookup && inode->ino == 1) {
- gf_log (this->name, GF_LOG_INFO,
- "added root inode");
- priv->root_inode = inode_ref (inode);
- priv->first_lookup = 0;
- }
-
- *lookup_buf = *buf;
-
- uuid_copy (local->loc.gfid, buf->ia_gfid);
- uuid_copy (local->loc.pargfid,
- postparent->ia_gfid);
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- priv->read_child);
- } else {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- child_index);
- }
+ if (__error_more_important (local->op_errno, op_errno))
+ local->op_errno = op_errno;
- } else {
- afr_lookup_self_heal_check (this, local, buf, lookup_buf);
+ if (local->op_errno == ESTALE) {
+ local->op_ret = -1;
+ }
+}
- if (child_index == local->read_child_index) {
- /*
- lookup has succeeded on the read child.
- So use its inode number
- */
- if (local->cont.lookup.xattr)
- dict_unref (local->cont.lookup.xattr);
+static void
+afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this,
+ inode_t *inode)
+{
+ afr_private_t *priv = NULL;
+ GF_ASSERT (inode);
- local->cont.lookup.xattr = dict_ref (xattr);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparent = *postparent;
+ if (inode->ino != 1)
+ goto out;
+ if (!afr_is_fresh_lookup (&local->loc, this))
+ goto out;
+ priv = this->private;
+ if ((priv->first_lookup)) {
+ gf_log (this->name, GF_LOG_INFO, "added root inode");
+ priv->root_inode = inode_ref (inode);
+ priv->first_lookup = 0;
+ }
+out:
+ return;
+}
- *lookup_buf = *buf;
+static void
+afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr,
+ struct iatt *buf, struct iatt *postparent)
+{
+ GF_ASSERT (child_index >= 0);
+ local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
+ local->cont.lookup.postparents[child_index] = *postparent;
+ local->cont.lookup.bufs[child_index] = *buf;
+}
- uuid_copy (local->loc.gfid, buf->ia_gfid);
- uuid_copy (local->loc.pargfid,
- postparent->ia_gfid);
- }
+static void
+afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this,
+ inode_t *inode, struct iatt *buf)
+{
+ local->cont.lookup.inode = inode_ref (inode);
+ local->cont.lookup.buf = *buf;
+ afr_set_root_inode_on_first_lookup (local, this, inode);
+}
+static void
+afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ if (local->success_count == 0) {
+ if (local->op_errno != ESTALE) {
+ local->op_ret = op_ret;
+ local->op_errno = 0;
}
-
- local->success_count++;
- }
-unlock:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_lookup_done (frame, this, lookup_buf);
+ afr_lookup_handle_first_success (local, this, inode, buf);
}
+ afr_lookup_update_lk_counts (local, this,
+ child_index, xattr);
- return 0;
+ afr_lookup_cache_args (local, child_index, xattr,
+ buf, postparent);
+ local->cont.lookup.child_success[local->success_count] = child_index;
+ local->success_count++;
}
-
int
-afr_revalidate_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+afr_lookup_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *buf, dict_t *xattr,
+ struct iatt *postparent)
{
afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- struct iatt * lookup_buf = NULL;
int call_count = -1;
int child_index = -1;
- int first_up_child = -1;
- child_index = (long) cookie;
- priv = this->private;
+ child_index = (long) cookie;
LOCK (&frame->lock);
{
local = frame->local;
- lookup_buf = &local->cont.lookup.buf;
-
if (op_ret == -1) {
- if (op_errno == ENOENT)
- local->enoent_count++;
-
- if (__error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
-
- if (local->op_errno == ESTALE) {
- local->op_ret = -1;
- }
-
+ afr_lookup_handle_error (local, op_ret, op_errno);
goto unlock;
}
+ afr_lookup_handle_success (local, this, child_index, op_ret,
+ op_errno, inode, buf, xattr,
+ postparent);
- afr_lookup_collect_xattr (local, this, child_index, xattr);
-
- first_up_child = afr_first_up_child (priv);
-
- /* in case of revalidate, we need to send stat of the
- * child whose stat was sent during the first lookup.
- * (so that time stamp does not vary with revalidate.
- * in case it is down, stat of the fist success will
- * be replied */
-
- /* inode number should be preserved across revalidates */
-
- if (local->success_count == 0) {
- if (local->op_errno != ESTALE)
- local->op_ret = op_ret;
-
- local->cont.lookup.inode = inode_ref (inode);
- local->cont.lookup.xattr = dict_ref (xattr);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparent = *postparent;
-
- *lookup_buf = *buf;
-
- if (priv->read_child >= 0) {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- priv->read_child);
- } else {
- afr_set_read_child (this,
- local->cont.lookup.inode,
- child_index);
- }
-
- } else {
- afr_lookup_self_heal_check (this, local, buf, lookup_buf);
-
- if (child_index == local->read_child_index) {
+ }
+unlock:
+ UNLOCK (&frame->lock);
- /*
- lookup has succeeded on the read child.
- So use its inode number
- */
+ call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ afr_lookup_done (frame, this);
+ }
- if (local->cont.lookup.xattr)
- dict_unref (local->cont.lookup.xattr);
+ return 0;
+}
- local->cont.lookup.xattr = dict_ref (xattr);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparent = *postparent;
+int
+afr_lookup_cont_init (afr_local_t *local, unsigned int child_count)
+{
+ int ret = -ENOMEM;
+ int32_t *child_success = NULL;
+ struct iatt *iatts = NULL;
+ int i = 0;
- *lookup_buf = *buf;
- }
+ GF_ASSERT (local);
+ local->cont.lookup.xattrs = GF_CALLOC (child_count,
+ sizeof (*local->cont.lookup.xattr),
+ gf_afr_mt_dict_t);
+ if (NULL == local->cont.lookup.xattrs)
+ goto out;
- }
+ iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt);
+ if (NULL == iatts)
+ goto out;
+ local->cont.lookup.postparents = iatts;
- local->success_count++;
- }
-unlock:
- UNLOCK (&frame->lock);
+ iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt);
+ if (NULL == iatts)
+ goto out;
+ local->cont.lookup.bufs = iatts;
- call_count = afr_frame_return (frame);
+ child_success = GF_CALLOC (child_count, sizeof (*child_success),
+ gf_afr_mt_char);
+ if (NULL == child_success)
+ goto out;
+ for (i = 0; i < child_count; i++)
+ child_success[i] = -1;
- if (call_count == 0) {
- afr_lookup_done (frame, this, lookup_buf);
- }
+ local->cont.lookup.child_success = child_success;
- return 0;
+ local->cont.lookup.read_child = -1;
+ ret = 0;
+out:
+ return ret;
}
-
int
afr_lookup (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *xattr_req)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int i = 0;
- fop_lookup_cbk_t callback = NULL;
- int call_count = 0;
- uint64_t ctx = 0;
- int32_t op_errno = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int i = 0;
+ int call_count = 0;
+ uint64_t ctx = 0;
+ int32_t op_errno = 0;
priv = this->private;
@@ -999,14 +1184,9 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
if (ret == 0) {
/* lookup is a revalidate */
- callback = afr_revalidate_lookup_cbk;
-
- local->cont.lookup.is_revalidate = _gf_true;
local->read_child_index = afr_read_child (this,
loc->inode);
} else {
- callback = afr_fresh_lookup_cbk;
-
LOCK (&priv->read_child_lock);
{
local->read_child_index = (++priv->read_child_rr)
@@ -1019,10 +1199,16 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
local->cont.lookup.parent_ino = loc->parent->ino;
local->child_up = memdup (priv->child_up, priv->child_count);
+ if (NULL == local->child_up) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- local->cont.lookup.xattrs = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.lookup.xattr),
- gf_afr_mt_dict_t);
+ ret = afr_lookup_cont_init (local, priv->child_count);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
local->call_count = afr_up_children_count (priv->child_count,
local->child_up);
@@ -1068,7 +1254,8 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, callback, (void *) (long) i,
+ STACK_WIND_COOKIE (frame, afr_lookup_cbk,
+ (void *) (long) i,
priv->children[i],
priv->children[i]->fops->lookup,
loc, local->xattr_req);
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 16b44a685..1bd2cc963 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -70,15 +70,19 @@ gf_boolean_t
__checksums_differ (uint32_t *checksum, int child_count,
unsigned char *child_up)
{
- int ret = _gf_false;
- int i = 0;
- uint32_t cksum = 0;
-
- cksum = checksum[0];
+ int ret = _gf_false;
+ int i = 0;
+ uint32_t cksum = 0;
+ gf_boolean_t activate_check = _gf_false;
for (i = 0; i < child_count; i++) {
if (!child_up[i])
continue;
+ if (_gf_false == activate_check) {
+ cksum = checksum[i];
+ activate_check = _gf_true;
+ continue;
+ }
if (cksum != checksum[i]) {
ret = _gf_true;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index dfea2cd58..abc9ccb0f 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -70,23 +70,6 @@ afr_sh_source_count (int sources[], int child_count)
return nsource;
}
-
-int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
- int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (child_errno[i] && sources[i]) {
- sources[i] = 0;
- }
- }
-
- return 0;
-}
-
-
void
afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
{
@@ -113,11 +96,46 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
GF_FREE (buf);
}
+void
+afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count)
+{
+ int i = 0;
+ int j = 0;
+
+ GF_ASSERT (pending_matrix);
+
+ for (i = 0; i < child_count; i++) {
+ for (j = 0; j < child_count; j++) {
+ pending_matrix[i][j] = 0;
+ }
+ }
+}
void
-afr_sh_build_pending_matrix (afr_private_t *priv,
- int32_t *pending_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
+afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix,
+ unsigned char *ignorant_subvols,
+ size_t child_count)
+{
+ int i = 0;
+ int j = 0;
+
+ GF_ASSERT (pending_matrix);
+ GF_ASSERT (ignorant_subvols);
+
+ for (i = 0; i < child_count; i++) {
+ if (ignorant_subvols[i]) {
+ for (j = 0; j < child_count; j++) {
+ if (!ignorant_subvols[j])
+ pending_matrix[j][i] += 1;
+ }
+ }
+ }
+}
+
+int
+afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
+ dict_t *xattr[], afr_transaction_type type,
+ size_t child_count)
{
/* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
int32_t pending[3] = {0,};
@@ -130,19 +148,16 @@ afr_sh_build_pending_matrix (afr_private_t *priv,
ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count,
gf_afr_mt_char);
+ if (NULL == ignorant_subvols)
+ goto out;
- /* start clean */
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- pending_matrix[i][j] = 0;
- }
- }
+ afr_init_pending_matrix (pending_matrix, child_count);
for (i = 0; i < child_count; i++) {
pending_raw = NULL;
for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], priv->pending_key[j],
+ ret = dict_get_ptr (xattr[i], pending_key[j],
&pending_raw);
if (ret != 0) {
@@ -163,21 +178,12 @@ afr_sh_build_pending_matrix (afr_private_t *priv,
}
}
- /*
- * Make all non-ignorant subvols point towards the ignorant
- * subvolumes.
- */
-
- for (i = 0; i < child_count; i++) {
- if (ignorant_subvols[i]) {
- for (j = 0; j < child_count; j++) {
- if (!ignorant_subvols[j])
- pending_matrix[j][i] += 1;
- }
- }
- }
-
+ afr_mark_ignorant_subvols_as_pending (pending_matrix,
+ ignorant_subvols,
+ child_count);
GF_FREE (ignorant_subvols);
+out:
+ return ret;
}
@@ -208,7 +214,8 @@ afr_sh_build_pending_matrix (afr_private_t *priv,
typedef enum {
AFR_NODE_INNOCENT,
AFR_NODE_FOOL,
- AFR_NODE_WISE
+ AFR_NODE_WISE,
+ AFR_NODE_INVALID = -1,
} afr_node_type;
typedef struct {
@@ -353,182 +360,276 @@ afr_sh_mark_wisest_as_sources (int sources[],
return nsources;
}
-
-static int
-afr_sh_mark_if_size_differs (afr_self_heal_t *sh, int child_count)
+static void
+afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix,
+ afr_node_character *characters,
+ int32_t child_count)
{
- int32_t ** pending_matrix = NULL;
- int i = 0;
- int j = 0;
- int size_differs = 0;
+ int i = 0;
+ int j = 0;
+ int witness = 0;
- pending_matrix = sh->pending_matrix;
+ GF_ASSERT (witnesses);
+ GF_ASSERT (pending_matrix);
+ GF_ASSERT (characters);
+ GF_ASSERT (child_count > 0);
for (i = 0; i < child_count; i++) {
+ if (characters[i].type != AFR_NODE_FOOL)
+ continue;
+
+ witness = 0;
for (j = 0; j < child_count; j++) {
- if (!sh->buf)
- break;
+ if (i == j)
+ continue;
+ witness += pending_matrix[i][j];
+ }
+ witnesses[i] = witness;
+ }
+}
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[j])
- && (pending_matrix[i][j] == 0)
- && (pending_matrix[j][i] == 0)) {
+static int32_t
+afr_find_biggest_witness_among_fools (int32_t *witnesses,
+ afr_node_character *characters,
+ int32_t child_count)
+{
+ int i = 0;
+ int biggest_witness = -1;
- pending_matrix[i][j] = 1;
- pending_matrix[j][i] = 1;
+ GF_ASSERT (witnesses);
+ GF_ASSERT (characters);
+ GF_ASSERT (child_count > 0);
- size_differs = 1;
- }
- }
- }
+ for (i = 0; i < child_count; i++) {
+ if (characters[i].type != AFR_NODE_FOOL)
+ continue;
- return size_differs;
+ if (biggest_witness < witnesses[i])
+ biggest_witness = witnesses[i];
+ }
+ return biggest_witness;
}
-
-static int
-afr_sh_mark_biggest_fool_as_source (afr_self_heal_t *sh,
+int
+afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,
afr_node_character *characters,
- int child_count)
+ int32_t child_count, int32_t witness)
{
- int i = 0;
- int biggest = 0;
+ int i = 0;
+ int nsources = 0;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_FOOL) {
- biggest = i;
- break;
- }
- }
+ GF_ASSERT (sources);
+ GF_ASSERT (witnesses);
+ GF_ASSERT (characters);
+ GF_ASSERT (child_count > 0);
for (i = 0; i < child_count; i++) {
if (characters[i].type != AFR_NODE_FOOL)
continue;
- if (!sh->buf)
- break;
-
- if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) {
- biggest = i;
+ if (witness == witnesses[i]) {
+ sources[i] = 1;
+ nsources++;
}
}
+ return nsources;
+}
+
+static int
+afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
+ afr_node_character *characters,
+ int child_count)
+{
+ int32_t biggest_witness = 0;
+ int nsources = 0;
+ int32_t *witnesses = NULL;
- sh->sources[biggest] = 1;
+ GF_ASSERT (child_count > 0);
- return 1;
-}
+ witnesses = GF_CALLOC (child_count, sizeof (*witnesses),
+ gf_afr_mt_int32_t);
+ if (NULL == witnesses) {
+ nsources = -1;
+ goto out;
+ }
+ afr_compute_witness_of_fools (witnesses, pending_matrix, characters,
+ child_count);
+ biggest_witness = afr_find_biggest_witness_among_fools (witnesses,
+ characters,
+ child_count);
+ nsources = afr_mark_fool_as_source_by_witness (sources, witnesses,
+ characters, child_count,
+ biggest_witness);
+out:
+ if (witnesses)
+ GF_FREE (witnesses);
+ return nsources;
+}
-static int
-afr_sh_mark_biggest_as_source (afr_self_heal_t *sh, int child_count)
+int
+afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs,
+ int32_t *valid_children, int child_count,
+ uint32_t uid)
{
- int biggest = 0;
- int i = 0;
+ int i = 0;
+ int nsources = 0;
+ int child = 0;
+
+ GF_ASSERT (bufs);
+ GF_ASSERT (valid_children);
+ GF_ASSERT (sources);
+ GF_ASSERT (child_count > 0);
for (i = 0; i < child_count; i++) {
- if (!sh->buf)
- break;
+ if (-1 == valid_children[i])
+ continue;
- if (SIZE_GREATER (&sh->buf[i], &sh->buf[biggest])) {
- biggest = i;
+ child = valid_children[i];
+ if (uid == bufs[child].ia_uid) {
+ sources[child] = 1;
+ nsources++;
}
}
+ return nsources;
+}
- sh->sources[biggest] = 1;
+int
+afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children,
+ int child_count)
+{
+ int i = 0;
+ int smallest = -1;
+ int child = 0;
- return 1;
-}
+ GF_ASSERT (bufs);
+ GF_ASSERT (valid_children);
+ GF_ASSERT (child_count > 0);
+ for (i = 0; i < child_count; i++) {
+ if (-1 == valid_children[i])
+ continue;
+ child = valid_children[i];
+ if ((smallest == -1) ||
+ (bufs[child].ia_uid < bufs[smallest].ia_uid)) {
+ smallest = child;
+ }
+ }
+ return smallest;
+}
static int
-afr_sh_mark_loweia_uid_as_source (afr_self_heal_t *sh, int child_count)
+afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children,
+ int child_count, int32_t *sources) /* Find the lowest ia_uid among the
+ * valid children and mark every valid child carrying that
+ * uid in sources[]. Returns the number of children marked,
+ * or -1 when valid_children lists no child. */
{
- uid_t smallest = 0;
- int i = 0;
+ int nsources = 0;
+ int smallest = 0;
- for (i = 0; i < child_count; i++) {
- if (!sh->buf)
- break;
-
- if (sh->buf[i].ia_uid < sh->buf[smallest].ia_uid) {
- smallest = i;
- }
+ smallest = afr_get_child_with_lowest_uid (bufs, valid_children,
+ child_count);
+ if (smallest < 0) { /* no valid child to take a uid from */
+ nsources = -1;
+ goto out;
}
+ nsources = afr_mark_child_as_source_by_uid (sources, bufs,
+ valid_children, child_count,
+ bufs[smallest].ia_uid);
+out:
+ return nsources;
+}
- sh->sources[smallest] = 1;
+char *
+afr_get_character_str (afr_node_type type)
+{ /* Map a node character type to a printable name for log messages.
+ * The result is a string literal and must not be freed or modified;
+ * any value other than innocent/fool/wise maps to "invalid". */
+ char *character = NULL;
- return 1;
+ switch (type) {
+ case AFR_NODE_INNOCENT:
+ character = "innocent";
+ break;
+ case AFR_NODE_FOOL:
+ character = "fool";
+ break;
+ case AFR_NODE_WISE:
+ character = "wise";
+ break;
+ default:
+ character = "invalid";
+ break;
+ }
+ return character;
}
+afr_node_type
+afr_find_child_character_type (int32_t *pending_row, int32_t child,
+ int32_t child_count, const char *xlator_name)
+{ /* Classify one child from its row of the pending matrix as innocent,
+ * fool or wise, testing the predicates in that order, and log the
+ * result at DEBUG level under xlator_name. */
+ afr_node_type type = AFR_NODE_INVALID;
+
+ GF_ASSERT (pending_row);
+ GF_ASSERT (child_count > 0);
+ GF_ASSERT ((child >= 0) && (child < child_count));
+
+ if (afr_sh_is_innocent (pending_row, child_count))
+ type = AFR_NODE_INNOCENT;
+ else if (afr_sh_is_fool (pending_row, child, child_count))
+ type = AFR_NODE_FOOL;
+ else if (afr_sh_is_wise (pending_row, child, child_count))
+ type = AFR_NODE_WISE;
+ else
+ GF_ASSERT (0); /* no predicate matched; if the assert does not
+ * abort, type stays AFR_NODE_INVALID and is
+ * returned as such */
+
+ gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s",
+ child, afr_get_character_str (type));
+ return type;
+}
int
-afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
- afr_self_heal_type type)
+afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs,
+ int32_t child_count, afr_self_heal_type type,
+ int32_t *valid_children, const char *xlator_name) /* Choose the self-heal
+ * sources for 'type' from pending_matrix. Once the scratch
+ * array is allocated, sources[] is cleared and then filled:
+ * lowest-uid children for a metadata heal where all nodes
+ * are innocent, otherwise the wisest nodes, otherwise the
+ * biggest fools. Returns the number of sources, or -1 on
+ * failure (e.g. allocation failure or conflicting wise
+ * nodes). When the result is 0, every valid child is
+ * marked in sources[] but 0 is still returned. */
{
/* stores the 'characters' (innocent, fool, wise) of the nodes */
- afr_node_character *characters = NULL;
+ afr_node_character *characters = NULL;
int i = 0;
- int32_t ** pending_matrix = NULL;
- int * sources = NULL;
- int size_differs = 0;
- int nsources = 0;
+ int nsources = -1; /* stays -1 if the allocation below fails */
xlator_t *this = NULL;
- afr_private_t *priv = NULL;
characters = GF_CALLOC (sizeof (afr_node_character),
- child_count,
- gf_afr_mt_afr_node_character) ;
+ child_count, gf_afr_mt_afr_node_character);
if (!characters)
goto out;
this = THIS;
- priv = this->private;
- pending_matrix = sh->pending_matrix;
- sources = sh->sources;
/* start clean */
for (i = 0; i < child_count; i++) {
sources[i] = 0;
}
+ nsources = 0;
for (i = 0; i < child_count; i++) {
- if (afr_sh_is_innocent (pending_matrix[i], child_count)) {
- characters[i].type = AFR_NODE_INNOCENT;
-
- } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) {
- characters[i].type = AFR_NODE_FOOL;
-
- } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) {
- characters[i].type = AFR_NODE_WISE;
-
- } else {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Could not determine the state of subvolume %s!"
- " (This message should never appear."
- " Please file a bug report to "
- "<gluster-devel@nongnu.org>.)",
- priv->children[i]->name);
- }
- }
-
- if (type == AFR_SELF_HEAL_DATA) {
- size_differs = afr_sh_mark_if_size_differs (sh, child_count);
+ characters[i].type =
+ afr_find_child_character_type (pending_matrix[i], i,
+ child_count,
+ xlator_name);
+ if (AFR_NODE_INVALID == characters[i].type)
+ gf_log (xlator_name, GF_LOG_WARNING,
+ "child %d had invalid xattrs", i);
}
if ((type == AFR_SELF_HEAL_METADATA)
&& afr_sh_all_nodes_innocent (characters, child_count)) {
- nsources = afr_sh_mark_loweia_uid_as_source (sh, child_count);
+ nsources = afr_sh_mark_lowest_uid_as_source (bufs,
+ valid_children,
+ child_count,
+ sources);
goto out;
}
- if (afr_sh_all_nodes_innocent (characters, child_count)) {
- if (size_differs) {
- nsources = afr_sh_mark_biggest_as_source (sh,
- child_count);
- }
-
- } else if (afr_sh_wise_nodes_exist (characters, child_count)) {
+ if (afr_sh_wise_nodes_exist (characters, child_count)) {
afr_sh_compute_wisdom (pending_matrix, characters, child_count);
if (afr_sh_wise_nodes_conflict (characters, child_count)) {
@@ -536,7 +637,6 @@ afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
gf_log (this->name, GF_LOG_INFO,
"split-brain possible, no source detected");
nsources = -1;
- goto out;
} else {
nsources = afr_sh_mark_wisest_as_sources (sources,
@@ -544,18 +644,26 @@ afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
child_count);
}
} else {
- nsources = afr_sh_mark_biggest_fool_as_source (sh, characters,
- child_count);
+ nsources = afr_mark_biggest_of_fools_as_source (sources,
+ pending_matrix,
+ characters,
+ child_count);
}
out:
+ if (nsources == 0) { /* nothing was picked: mark every valid child */
+ for (i = 0; i < child_count; i++) {
+ if (valid_children[i] != -1)
+ sources[valid_children[i]] = 1;
+ }
+ }
if (characters)
GF_FREE (characters);
+ gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); /* NOTE(review):
+ * when the characters allocation fails, control jumps to
+ * 'out' before 'this = THIS' runs, so 'this' is still NULL
+ * here and this->name dereferences a NULL pointer. Logging
+ * with xlator_name instead would avoid it. */
return nsources;
}
-
void
afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
int32_t *delta_matrix[], int success[],
@@ -643,7 +751,7 @@ afr_sh_delta_to_xattr (afr_private_t *priv,
int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this)
{
/* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
int32_t pending[3] = {0,};
@@ -674,7 +782,7 @@ afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this)
int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_sh_has_data_pending (dict_t *xattr, xlator_t *this)
{
/* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
int32_t pending[3] = {0,};
@@ -705,7 +813,7 @@ afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this)
int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this)
+afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this)
{
/* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
int32_t pending[3] = {0,};
@@ -1653,6 +1761,9 @@ afr_self_heal (call_frame_t *frame, xlator_t *this)
priv->child_count,
gf_afr_mt_int32_t);
}
+ sh->child_success = GF_CALLOC (sizeof (*sh->child_success),
+ priv->child_count, gf_afr_mt_int32_t);
+
FRAME_SU_DO (sh_frame, afr_local_t);
if (local->success_count && local->enoent_count) {
@@ -1686,3 +1797,25 @@ afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
snprintf(str + strlen(str), size - strlen(str), " entry");
}
}
+
+afr_self_heal_type
+afr_self_heal_type_for_transaction (afr_transaction_type type)
+{ /* Translate a transaction type into the matching self-heal type:
+ * data -> data, metadata -> metadata, entry -> entry. Any other
+ * value leaves the result at AFR_SELF_HEAL_INVALID. */
+ afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
+
+ switch (type) {
+ case AFR_DATA_TRANSACTION:
+ sh_type = AFR_SELF_HEAL_DATA;
+ break;
+ case AFR_METADATA_TRANSACTION:
+ sh_type = AFR_SELF_HEAL_METADATA;
+ break;
+ case AFR_ENTRY_TRANSACTION:
+ sh_type = AFR_SELF_HEAL_ENTRY;
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ GF_ASSERT (0); /* rename has no self-heal counterpart; if the
+ * assert does not abort, INVALID is returned */
+ break;
+ }
+ return sh_type;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
index 6431feaff..7f6247455 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ b/xlators/cluster/afr/src/afr-self-heal-common.h
@@ -26,6 +26,7 @@ typedef enum {
AFR_SELF_HEAL_ENTRY,
AFR_SELF_HEAL_METADATA,
AFR_SELF_HEAL_DATA,
+ AFR_SELF_HEAL_INVALID = -1,
} afr_self_heal_type;
int
@@ -37,17 +38,13 @@ afr_sh_sink_count (int sources[], int child_count);
int
afr_sh_source_count (int sources[], int child_count);
-int
-afr_sh_supress_errenous_children (int sources[], int child_errno[],
- int child_count);
-
void
afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
-void
-afr_sh_build_pending_matrix (afr_private_t *priv,
- int32_t *pending_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type);
+int
+afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
+ dict_t *xattr[], afr_transaction_type type,
+ size_t child_count);
void
afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
@@ -55,8 +52,9 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
int child_count, afr_transaction_type type);
int
-afr_sh_mark_sources (afr_self_heal_t *sh, int child_count,
- afr_self_heal_type type);
+afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs,
+ int32_t child_count, afr_self_heal_type type,
+ int32_t *valid_children, const char *xlator_name);
int
afr_sh_delta_to_xattr (afr_private_t *priv,
@@ -70,4 +68,7 @@ void
afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
size_t size);
+afr_self_heal_type
+afr_self_heal_type_for_transaction (afr_transaction_type type);
+
#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index f4cc4275d..3ee1db0e7 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -594,16 +594,15 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
sh = &local->self_heal;
priv = this->private;
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count, AFR_DATA_TRANSACTION);
+ afr_build_pending_matrix (priv->pending_key, sh->pending_matrix,
+ sh->xattr, AFR_DATA_TRANSACTION,
+ priv->child_count);
afr_sh_print_pending_matrix (sh->pending_matrix, this);
- nsources = afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_DATA);
-
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
+ nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf,
+ priv->child_count, AFR_SELF_HEAL_DATA,
+ sh->child_success, this->name);
if (nsources == 0) {
gf_log (this->name, GF_LOG_TRACE,
@@ -692,39 +691,165 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
return 0;
}
+gf_boolean_t
+afr_is_fresh_read_child (int32_t *sources, int32_t child_count,
+ int32_t read_child)
+{ /* Return _gf_true only when read_child is a valid index
+ * (0 <= read_child < child_count) and sources[read_child] is
+ * non-zero. A negative read_child simply yields _gf_false. */
+ gf_boolean_t is_fresh_child = _gf_false;
+
+ GF_ASSERT (read_child < child_count);
-int
-afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr)
+ if ((read_child >= 0) && (read_child < child_count) && /* bounds are re-checked before indexing sources[] */
+ sources[read_child]) {
+ is_fresh_child = _gf_true;
+ }
+ return is_fresh_child;
+}
+
+static int
+afr_select_read_child_from_policy (int32_t *sources, int32_t child_count,
+ int32_t prev_read_child,
+ int32_t config_read_child,
+ int32_t *valid_children) /* Pick a read child that is marked in
+ * sources[]. Candidates are tried in this order: the
+ * previous read child, the configured read child, then
+ * the entries of valid_children in array order. Returns
+ * the chosen child index, or -1 if none qualifies. */
{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
- int i = 0;
+ int32_t read_child = -1;
+ int i = 0;
- sh = &local->self_heal;
- priv = this->private;
+ GF_ASSERT (sources);
- sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count,
- gf_afr_mt_int32_t);
- for (i = 0; i < priv->child_count; i++) {
- sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t),
- priv->child_count,
- gf_afr_mt_int32_t);
+ read_child = prev_read_child;
+ if (_gf_true == afr_is_fresh_read_child (sources, child_count,
+ read_child))
+ goto out;
+
+ read_child = config_read_child;
+ if (_gf_true == afr_is_fresh_read_child (sources, child_count,
+ read_child))
+ goto out;
+
+ for (i = 0; i < child_count; i++) {
+ read_child = valid_children[i];
+ if (read_child < 0) /* first negative entry ends the scan; presumably
+ * valid entries are packed at the front of the
+ * array -- TODO confirm for the lookup path */
+ break;
+ if (_gf_true == afr_is_fresh_read_child (sources, child_count,
+ read_child))
+ goto out;
}
+ read_child = -1; /* no candidate was a source */
- sh->sources = GF_CALLOC (priv->child_count, sizeof (*sh->sources),
- gf_afr_mt_int32_t);
+out:
+ return read_child;
+}
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, xattr,
- priv->child_count, AFR_DATA_TRANSACTION);
+static void
+afr_destroy_pending_matrix (int32_t **pending_matrix, int32_t child_count)
+{ /* Free every non-NULL row of pending_matrix (child_count rows) and
+ * then the row array itself. A NULL pending_matrix is accepted and
+ * nothing is freed in that case. */
+ int i = 0;
+ GF_ASSERT (child_count > 0);
+ if (pending_matrix) {
+ for (i = 0; i < child_count; i++) {
+ if (pending_matrix[i])
+ GF_FREE (pending_matrix[i]);
+ }
+ GF_FREE (pending_matrix);
+ }
+}
- (void)afr_sh_mark_sources (sh, priv->child_count, AFR_SELF_HEAL_DATA);
+static int32_t**
+afr_create_pending_matrix (int32_t child_count)
+{ /* Allocate a child_count x child_count matrix of int32_t as an array
+ * of separately allocated rows. Returns the matrix, or NULL if any
+ * allocation fails; rows allocated before a failure are released
+ * first. The caller frees a returned matrix with
+ * afr_destroy_pending_matrix (). */
+ gf_boolean_t cleanup = _gf_false;
+ int32_t **pending_matrix = NULL;
+ int i = 0;
- source = afr_sh_select_source (sh->sources, priv->child_count);
+ GF_ASSERT (child_count > 0);
- return source;
+ pending_matrix = GF_CALLOC (sizeof (*pending_matrix), child_count,
+ gf_afr_mt_int32_t);
+ if (NULL == pending_matrix)
+ goto out;
+ for (i = 0; i < child_count; i++) {
+ pending_matrix[i] = GF_CALLOC (sizeof (**pending_matrix),
+ child_count,
+ gf_afr_mt_int32_t);
+ if (NULL == pending_matrix[i]) {
+ cleanup = _gf_true; /* partial matrix: tear it down at 'out' */
+ goto out;
+ }
+ }
+out:
+ if (_gf_true == cleanup) {
+ afr_destroy_pending_matrix (pending_matrix, child_count); /* presumably relies on GF_CALLOC
+ * zero-filling the row array so that
+ * unallocated rows read as NULL -- confirm */
+ pending_matrix = NULL;
+ }
+ return pending_matrix;
}
+int
+afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
+ dict_t **xattr,
+ afr_transaction_type txn_type)
+{ /* Choose a read child for a lookup: build a temporary pending matrix
+ * from the per-child xattrs for txn_type, mark the sources, then
+ * pick among them by policy (previous read child, configured read
+ * child, then children that answered). Returns the chosen child
+ * index, or -1 on any failure or when no child qualifies. When
+ * source marking succeeds, the sources array is handed over to
+ * local->cont.lookup.sources; otherwise it is freed here. */
+ afr_private_t *priv = NULL;
+ int read_child = -1;
+ int ret = -1;
+ afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
+ int32_t **pending_matrix = NULL;
+ int32_t *sources = NULL;
+ int32_t *valid_children = NULL;
+ struct iatt *bufs = NULL;
+ int32_t nsources = 0;
+ int32_t prev_read_child = -1;
+ int32_t config_read_child = -1;
+ afr_self_heal_t *sh = NULL;
+
+ priv = this->private;
+ bufs = local->cont.lookup.bufs;
+ valid_children = local->cont.lookup.child_success;
+ sh = &local->self_heal; /* NOTE(review): sh is assigned but never used in this function */
+
+ pending_matrix = afr_create_pending_matrix (priv->child_count);
+ if (NULL == pending_matrix)
+ goto out;
+
+ sources = GF_CALLOC (sizeof (*sources), priv->child_count,
+ gf_afr_mt_int32_t);
+ if (NULL == sources)
+ goto out;
+
+ afr_build_pending_matrix (priv->pending_key, pending_matrix, /* NOTE(review): the int return value is
+ * ignored here -- confirm whether a
+ * failure needs handling */
+ xattr, txn_type, priv->child_count);
+
+ sh_type = afr_self_heal_type_for_transaction (txn_type);
+ if (AFR_SELF_HEAL_INVALID == sh_type)
+ goto out;
+
+ nsources = afr_mark_sources (sources, pending_matrix, bufs,
+ priv->child_count, sh_type,
+ valid_children, this->name);
+ if (nsources < 0) {
+ ret = -1;
+ goto out;
+ }
+
+ prev_read_child = local->read_child_index;
+ config_read_child = priv->read_child;
+ read_child = afr_select_read_child_from_policy (sources,
+ priv->child_count,
+ prev_read_child,
+ config_read_child,
+ valid_children);
+ ret = 0;
+ local->cont.lookup.sources = sources; /* ownership moves to local; NOTE(review): an
+ * earlier pointer stored here would be
+ * overwritten without being freed -- confirm
+ * this runs once per lookup */
+out:
+ afr_destroy_pending_matrix (pending_matrix, priv->child_count);
+ if (-1 == ret) { /* failure: sources was not handed over, release it */
+ if (sources)
+ GF_FREE (sources);
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", read_child);
+ return read_child;
+}
int
afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
@@ -750,6 +875,8 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
priv->children[child_index]->name);
sh->buf[child_index] = *buf;
+ sh->child_success[sh->success_count] = child_index;
+ sh->success_count++;
}
}
UNLOCK (&frame->lock);
@@ -782,6 +909,9 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)
local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++)
+ sh->child_success[i] = -1;
+ sh->success_count = 0;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk,
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index ca738098a..0425644b3 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -2157,13 +2157,15 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
goto heal;
}
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count, AFR_ENTRY_TRANSACTION);
+ afr_build_pending_matrix (priv->pending_key, sh->pending_matrix,
+ sh->xattr, AFR_ENTRY_TRANSACTION,
+ priv->child_count);
afr_sh_print_pending_matrix (sh->pending_matrix, this);
- nsources = afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_ENTRY);
+ nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf,
+ priv->child_count, AFR_SELF_HEAL_ENTRY,
+ sh->child_success, this->name);
if (nsources == 0) {
gf_log (this->name, GF_LOG_TRACE,
@@ -2174,9 +2176,6 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this)
return 0;
}
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
-
source = afr_sh_select_source (sh->sources, priv->child_count);
sh->source = source;
@@ -2209,6 +2208,8 @@ afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie,
if (op_ret != -1) {
sh->xattr[child_index] = dict_ref (xattr);
sh->buf[child_index] = *buf;
+ sh->child_success[sh->success_count] = child_index;
+ sh->success_count++;
}
}
UNLOCK (&frame->lock);
@@ -2233,9 +2234,11 @@ afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)
int ret = 0;
int call_count = 0;
int i = 0;
+ afr_self_heal_t *sh = NULL;
priv = this->private;
local = frame->local;
+ sh = &local->self_heal;
call_count = afr_up_children_count (priv->child_count,
local->child_up);
@@ -2255,6 +2258,9 @@ afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this)
}
}
+ for (i = 0; i < priv->child_count; i++)
+ sh->child_success[i] = -1;
+ sh->success_count = 0;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame,
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 7ad1ce69a..fe1db60e2 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -475,17 +475,15 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this)
sh = &local->self_heal;
priv = this->private;
- afr_sh_build_pending_matrix (priv, sh->pending_matrix, sh->xattr,
- priv->child_count,
- AFR_METADATA_TRANSACTION);
+ afr_build_pending_matrix (priv->pending_key, sh->pending_matrix,
+ sh->xattr, AFR_METADATA_TRANSACTION,
+ priv->child_count);
afr_sh_print_pending_matrix (sh->pending_matrix, this);
- nsources = afr_sh_mark_sources (sh, priv->child_count,
- AFR_SELF_HEAL_METADATA);
-
- afr_sh_supress_errenous_children (sh->sources, sh->child_errno,
- priv->child_count);
+ nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf,
+ priv->child_count, AFR_SELF_HEAL_METADATA,
+ sh->child_success, this->name);
if (nsources == 0) {
gf_log (this->name, GF_LOG_TRACE,
@@ -584,6 +582,8 @@ afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
sh->buf[child_index] = *buf;
if (xattr)
sh->xattr[child_index] = dict_ref (xattr);
+ sh->child_success[sh->success_count] = child_index;
+ sh->success_count++;
} else {
gf_log (this->name, GF_LOG_INFO,
"path %s on subvolume %s => -1 (%s)",
@@ -614,9 +614,11 @@ afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)
int call_count = 0;
dict_t *xattr_req = NULL;
int ret = 0;
+ afr_self_heal_t *sh = NULL;
local = frame->local;
priv = this->private;
+ sh = &local->self_heal;
call_count = afr_up_children_count (priv->child_count,
local->child_up);
@@ -635,6 +637,9 @@ afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this)
}
}
+ for (i = 0; i < priv->child_count; i++)
+ sh->child_success[i] = -1;
+ sh->success_count = 0;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
gf_log (this->name, GF_LOG_TRACE,
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index b10ae3fc0..976dae475 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -30,11 +30,11 @@
#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size)
int
-afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this);
int
-afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this);
int
-afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this);
+afr_sh_has_data_pending (dict_t *xattr, xlator_t *this);
int
afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
@@ -51,4 +51,11 @@ afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr);
int
afr_self_heal (call_frame_t *frame, xlator_t *this);
+gf_boolean_t
+afr_is_fresh_read_child (int32_t *sources, int32_t child_count,
+ int32_t read_child);
+int
+afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
+ dict_t **xattr,
+ afr_transaction_type txn_type);
#endif /* __AFR_SELF_HEAL_H__ */
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 3d21a2692..973e8b58f 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -137,6 +137,10 @@ typedef struct {
/* array of xattr's, one for each child */
dict_t **xattr;
+ /* indices of the children whose calls succeeded, stored in the order
+ the responses arrived; unused slots are set to -1 */
+ int32_t *child_success;
+ int success_count;
/* array of errno's, one for each child */
int *child_errno;
@@ -340,14 +344,17 @@ typedef struct _afr_local {
struct {
inode_t *inode;
struct iatt buf;
- struct iatt read_child_buf;
struct iatt postparent;
ino_t ino;
uint64_t gen;
ino_t parent_ino;
- dict_t *xattr;
dict_t **xattrs;
- gf_boolean_t is_revalidate;
+ dict_t *xattr;
+ struct iatt *postparents;
+ struct iatt *bufs;
+ int32_t read_child;
+ int32_t *child_success;//in the order of response
+ int32_t *sources;
} lookup;
struct {
@@ -735,6 +742,9 @@ afr_build_parent_loc (loc_t *parent, loc_t *child);
int
afr_up_children_count (int child_count, unsigned char *child_up);
+gf_boolean_t
+afr_is_fresh_lookup (loc_t *loc, xlator_t *this);
+
void
afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent);