cluster/ec: Improve detection of new heals

When EC successfully healed a directory it assumed that maybe other entries inside that directory could have been created, which could require additional heal cycles. For this reason, when the heal happened as part of one index heal iteration, it triggered a new iteration. The problem happened when the directory was healthy, so no new entries were added, but its index entry was not removed for some reason. In this case self-heal started and endless loop healing the same directory continuously, cause high CPU utilization. This patch improves detection of new files added to the heal index so that a new index heal iteration is only triggered if there is new work to do. Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f Fixes: #1354 Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
author: Xavi Hernandez <xhernandez@redhat.com> 2020-07-02 18:08:52 +0200
committer: Rinku Kothiya <rkothiya@redhat.com> 2020-08-19 18:00:31 +0000
commit: 269ece312c9fd890c74c46e79de70efe1720752c (patch)
tree: c5f54753ffc944afbc5dce50d35b18a72f65551b
parent: 19dc7b37fc9f6d6037958e5dd3c0c6a4a993e2af (diff)
6 files changed, 84 insertions, 28 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index ded34b81aa2..9abdcec3f78 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
 int32_t
 ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
                int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
-               uintptr_t bad, dict_t *xdata)
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
 {
     if (op_ret < 0) {
         gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 81f6add5bb0..16ea6cfc811 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -70,6 +70,7 @@ struct ec_name_data {
     char *name;
     inode_t *parent;
     default_args_cbk_t *replies;
+    uint32_t heal_pending;
 };
 
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
@@ -994,6 +995,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
         ret = -ENOTCONN;
         goto out;
     }
+
 out:
     if (xattr)
         dict_unref(xattr);
@@ -1172,6 +1174,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
     dict_t *xdata = NULL;
     char *linkname = NULL;
     ec_config_t config;
+
     /* There should be just one gfid key */
     EC_REPLIES_ALLOC(replies, ec->nodes);
     if (gfid_db->count != 1) {
@@ -1416,6 +1419,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
 
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
                          participants);
+    if (ret >= 0) {
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
+         * file has been created and it will need to be healed. */
+        ret = 1;
+    }
 out:
     cluster_replies_wipe(replies, ec->nodes);
     loc_wipe(&loc);
@@ -1493,18 +1501,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
                        name_on);
 
-    if (ret < 0)
+    if (ret < 0) {
         memset(name_on, 0, ec->nodes);
+    } else {
+        name_data->heal_pending += ret;
+    }
 
     for (i = 0; i < ec->nodes; i++)
         if (name_data->participants[i] && !name_on[i])
             name_data->failed_on[i] = 1;
+
     return 0;
 }
 
 int
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
-              unsigned char *participants)
+              unsigned char *participants, uint32_t *pending)
 {
     int i = 0;
     int j = 0;
@@ -1517,7 +1529,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
     name_data.frame = frame;
     name_data.participants = participants;
     name_data.failed_on = alloca0(ec->nodes);
-    ;
+    name_data.heal_pending = 0;
 
     for (i = 0; i < ec->nodes; i++) {
         if (!participants[i])
@@ -1536,6 +1548,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
             break;
         }
     }
+    *pending += name_data.heal_pending;
+
     loc_wipe(&loc);
     return ret;
 }
@@ -1543,7 +1557,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
 int
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
                 unsigned char *heal_on, unsigned char *sources,
-                unsigned char *healed_sinks)
+                unsigned char *healed_sinks, uint32_t *pending)
 {
     unsigned char *locked_on = NULL;
     unsigned char *output = NULL;
@@ -1588,7 +1602,7 @@ unlock:
         if (sources[i] || healed_sinks[i])
             participants[i] = 1;
     }
-    ret = ec_heal_names(frame, ec, inode, participants);
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
 
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
         goto out;
@@ -1609,7 +1623,8 @@ out:
 
 int
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
-              unsigned char *sources, unsigned char *healed_sinks)
+              unsigned char *sources, unsigned char *healed_sinks,
+              uint32_t *pending)
 {
     unsigned char *locked_on = NULL;
     unsigned char *up_subvols = NULL;
@@ -1640,7 +1655,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
             goto unlock;
         }
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
-                              healed_sinks);
+                              healed_sinks, pending);
     }
 unlock:
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
@@ -1961,14 +1976,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
             if (fop->cbks.heal) {
                 fop->cbks.heal(fop->req_frame, fop->data, fop->xl, 0, 0,
                                (heal->good | heal->bad), heal->good, heal->bad,
-                               NULL);
+                               0, NULL);
             }
 
             return EC_STATE_END;
         case -EC_STATE_REPORT:
             if (fop->cbks.heal) {
                 fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
-                               fop->error, 0, 0, 0, NULL);
+                               fop->error, 0, 0, 0, 0, NULL);
             }
 
             return EC_STATE_END;
@@ -2005,14 +2020,15 @@ out:
     if (fop != NULL) {
         ec_manager(fop, error);
     } else {
-        func(frame, heal, this, -1, error, 0, 0, 0, NULL);
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
     }
 }
 
 int32_t
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
+                   dict_t *xdata)
 {
     ec_heal_t *heal = cookie;
 
@@ -2498,6 +2514,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
     intptr_t mbad = 0;
     intptr_t good = 0;
     intptr_t bad = 0;
+    uint32_t pending = 0;
     ec_fop_data_t *fop = data;
     gf_boolean_t blocking = _gf_false;
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
@@ -2533,7 +2550,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
     if (loc->name && strlen(loc->name)) {
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
                            participants);
-        if (ret == 0) {
+        if (ret >= 0) {
             gf_msg_debug(this->name, 0,
                          "%s: name heal "
                          "successful on %" PRIXPTR,
@@ -2551,7 +2568,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
 
     /* Mount triggers heal only when it detects that it must need heal, shd
      * triggers heals periodically which need not be thorough*/
-    if (ec->shd.iamshd) {
+    if (ec->shd.iamshd && (ret <= 0)) {
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
                         &need_heal);
 
@@ -2561,13 +2578,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
             goto out;
         }
     }
+
     sources = alloca0(ec->nodes);
     healed_sinks = alloca0(ec->nodes);
     if (IA_ISREG(loc->inode->ia_type)) {
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
                            healed_sinks);
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
+                            &pending);
     } else {
         ret = 0;
         memcpy(sources, participants, ec->nodes);
@@ -2597,10 +2616,11 @@ out:
     if (fop->cbks.heal) {
         fop->cbks.heal(fop->req_frame, fop->data, fop->xl, op_ret, op_errno,
                        ec_char_array_to_mask(participants, ec->nodes),
-                       mgood & good, mbad & bad, NULL);
+                       mgood & good, mbad & bad, pending, NULL);
     }
     if (frame)
         STACK_DESTROY(frame->root);
+
     return;
 }
 
@@ -2648,7 +2668,7 @@ ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
 {
     if (fop->cbks.heal) {
         fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
-                       0, NULL);
+                       0, 0, NULL);
     }
     ec_fop_data_release(fop);
 }
@@ -2835,7 +2855,7 @@ fail:
     if (fop)
         ec_fop_data_release(fop);
     if (func)
-        func(frame, data, this, -1, err, 0, 0, 0, NULL);
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
 }
 
 int
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index 956e73c2088..63fe5d34e38 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -160,15 +160,27 @@ int
 ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
                 gf_boolean_t full)
 {
+    dict_t *xdata = NULL;
+    uint32_t count;
     int32_t ret;
 
-    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
-    if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
+    if (!full && (loc->inode->ia_type == IA_IFDIR)) {
         /* If we have just healed a directory, it's possible that
-         * other index entries have appeared to be healed. We put a
-         * mark so that we can check it later and restart a scan
-         * without delay. */
-        healer->rerun = _gf_true;
+         * other index entries have appeared to be healed. */
+        if ((xdata != NULL) &&
+            (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
+            (count > 0)) {
+            /* Force a rerun of the index healer. */
+            gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
+                         count);
+
+            healer->rerun = _gf_true;
+        }
+    }
+
+    if (xdata != NULL) {
+        dict_unref(xdata);
     }
 
     return ret;
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
index a891ccd0952..dad5f4d7018 100644
--- a/xlators/cluster/ec/src/ec-inode-read.c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
@@ -390,7 +390,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
 int32_t
 ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
                      int32_t op_ret, int32_t op_errno, uintptr_t mask,
-                     uintptr_t good, uintptr_t bad, dict_t *xdata)
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
+                     dict_t *xdata)
 {
     fop_getxattr_cbk_t func = cookie;
     ec_t *ec = xl->private;
@@ -398,6 +399,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
     char *str;
     char bin1[65], bin2[65];
 
+    /* We try to return the 'pending' information in xdata, but if this cannot
+     * be set, we will ignore it silently. We prefer to report the success or
+     * failure of the heal itself. */
+    if (xdata == NULL) {
+        xdata = dict_new();
+    } else {
+        dict_ref(xdata);
+    }
+    if (xdata != NULL) {
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
+             * enforces to check the result in this case. However we don't
+             * really care if it succeeded or not. We'll just do the same.
+             *
+             * This empty 'if' avoids the warning, and it will be removed by
+             * the optimizer. */
+        }
+    }
+
     if (op_ret >= 0) {
         dict = dict_new();
         if (dict == NULL) {
@@ -431,11 +451,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
     }
 
 out:
-    func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
 
     if (dict != NULL) {
         dict_unref(dict);
     }
+    if (xdata != NULL) {
+        dict_unref(xdata);
+    }
 
     return 0;
 }
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 7829b8c27b3..c25185f42ed 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -186,10 +186,10 @@ struct _ec_inode {
 
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
-                                  dict_t *);
+                                  uint32_t, dict_t *);
 typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
                                    int32_t, uintptr_t, uintptr_t, uintptr_t,
-                                   dict_t *);
+                                   uint32_t, dict_t *);
 
 union _ec_cbk {
     fop_access_cbk_t access;
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
index 1b210d9adc1..6f6de6d5981 100644
--- a/xlators/cluster/ec/src/ec.h
+++ b/xlators/cluster/ec/src/ec.h
@@ -18,6 +18,7 @@
 #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
 #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
 #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
 #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
 #define EC_STRIPE_CACHE_MAX_SIZE 10
 #define EC_VERSION_SIZE 2
author	Xavi Hernandez <xhernandez@redhat.com>	2020-07-02 18:08:52 +0200
committer	Rinku Kothiya <rkothiya@redhat.com>	2020-08-19 18:00:31 +0000
commit	269ece312c9fd890c74c46e79de70efe1720752c (patch)
tree	c5f54753ffc944afbc5dce50d35b18a72f65551b
parent	19dc7b37fc9f6d6037958e5dd3c0c6a4a993e2af (diff)