-rw-r--r--  tests/bugs/ec/bug-1547662.t        | 41
-rw-r--r--  xlators/cluster/ec/src/ec-heal.c   |  9
-rw-r--r--  xlators/cluster/ec/src/ec-heald.c  | 27
-rw-r--r--  xlators/cluster/ec/src/ec-heald.h  |  4
-rw-r--r--  xlators/cluster/ec/src/ec.c        | 97

5 files changed, 132 insertions(+), 46 deletions(-)
diff --git a/tests/bugs/ec/bug-1547662.t b/tests/bugs/ec/bug-1547662.t
new file mode 100644
index 00000000000..5748218587e
--- /dev/null
+++ b/tests/bugs/ec/bug-1547662.t
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# Immediately after replace-brick, trusted.ec.version will be absent, so if it
+# is present we can assume that heal was started on root
+function root_heal_attempted {
+        if [ -z $(get_hex_xattr trusted.ec.version $1) ]; then
+                echo "N"
+        else
+                echo "Y"
+        fi
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST ${CLI} volume create ${V0} disperse 6 redundancy 2 ${H0}:${B0}/${V0}{0..5}
+TEST ${CLI} volume start ${V0}
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+
+TEST mkdir ${M0}/base
+TEST mkdir ${M0}/base/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+
+TEST ${CLI} volume replace-brick ${V0} ${H0}:${B0}/${V0}5 ${H0}:${B0}/${V0}6 commit force
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "Y" glustershd_up_status
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count_shd ${V0} 0
+EXPECT_WITHIN ${HEAL_TIMEOUT} "Y" root_heal_attempted ${B0}/${V0}6
+EXPECT_WITHIN ${HEAL_TIMEOUT} "^0$" get_pending_heal_count ${V0}
+EXPECT "^127$" echo $(find ${B0}/${V0}6/base -type d | wc -l)
+
+cleanup;
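Note (reviewer aside, not part of the patch): the final EXPECT counts 127 directories because each nested mkdir level doubles the directory count, so base/ plus six levels form a complete binary tree: 1 + 2 + 4 + ... + 64 = 2^7 - 1 = 127. A standalone check of that arithmetic:

#include <assert.h>

int main(void)
{
        int total = 0;
        int at_level = 1;
        int level;

        for (level = 0; level <= 6; level++) {  /* base/ plus 6 levels */
                total += at_level;
                at_level *= 2;
        }
        assert(total == 127);  /* matches EXPECT "^127$" above */
        return 0;
}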
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index e4f9c0e3539..aeb890927c5 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -25,6 +25,7 @@
 #include "ec-combine.h"
 #include "ec-method.h"
 #include "ec-fops.h"
+#include "ec-heald.h"
 
 #define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })
 #define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })
@@ -2631,6 +2632,14 @@ ec_replace_heal (ec_t *ec, inode_t *inode)
                 gf_msg_debug (ec->xl->name, 0,
                         "Heal failed for replace brick ret = %d", ret);
 
+        /* Once the root inode has been checked, it might have triggered a
+         * self-heal on it after a replace brick command or for some other
+         * reason. It can also happen that the volume already had damaged
+         * files in the index, even if the heal on the root directory failed.
+         * In both cases we need to wake all index healers to continue
+         * healing remaining entries that are marked as dirty. */
+        ec_shd_index_healer_wake(ec);
+
         loc_wipe (&loc);
         return ret;
 }
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index ffb78d5e950..6c1cefda39e 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -184,8 +184,19 @@ ec_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
 int
 ec_shd_selfheal (struct subvol_healer *healer, int child, loc_t *loc)
 {
-        return syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
-                                NULL);
+        int32_t ret;
+
+        ret = syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
+                               NULL);
+        if ((ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+                /* If we have just healed a directory, it's possible that
+                 * other index entries have appeared to be healed. We put a
+                 * mark so that we can check it later and restart a scan
+                 * without delay. */
+                healer->rerun = _gf_true;
+        }
+
+        return ret;
 }
 
@@ -466,11 +477,15 @@ ec_shd_index_healer_spawn (xlator_t *this, int subvol)
 }
 
 void
-ec_selfheal_childup (ec_t *ec, int child)
+ec_shd_index_healer_wake(ec_t *ec)
 {
-        if (!ec->shd.iamshd)
-                return;
-        ec_shd_index_healer_spawn (ec->xl, child);
+        int32_t i;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (((ec->xl_up >> i) & 1) != 0) {
+                        ec_shd_index_healer_spawn(ec->xl, i);
+                }
+        }
 }
 
 int
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
index 4ae02e2df3c..2a8488124c4 100644
--- a/xlators/cluster/ec/src/ec-heald.h
+++ b/xlators/cluster/ec/src/ec-heald.h
@@ -20,6 +20,8 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output);
 int
 ec_selfheal_daemon_init (xlator_t *this);
 
-void ec_selfheal_childup (ec_t *ec, int child);
+
+void
+ec_shd_index_healer_wake(ec_t *ec);
 
 #endif /* __EC_HEALD_H__ */
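Note: the ec_shd_selfheal() change above depends on the healer's rerun flag. A minimal sketch of that pattern, with hypothetical names rather than the real ec-heald structures: the healer thread sleeps until woken, but if rerun was set while it was scanning, it starts the next index scan immediately instead of waiting.

#include <pthread.h>
#include <stdbool.h>

struct healer_sketch {
        pthread_mutex_t mutex;
        pthread_cond_t  cond;
        bool            rerun;
        bool            running;
};

/* Placeholder for one pass over the heal index; healing a directory in
 * this pass may set h->rerun again. */
static void scan_index_once(struct healer_sketch *h)
{
        (void)h;
}

static void *healer_loop(void *arg)
{
        struct healer_sketch *h = arg;

        pthread_mutex_lock(&h->mutex);
        while (h->running) {
                if (!h->rerun) {
                        /* Nothing pending: wait until something like
                         * ec_shd_index_healer_wake() signals us. */
                        pthread_cond_wait(&h->cond, &h->mutex);
                }
                h->rerun = false;
                pthread_mutex_unlock(&h->mutex);

                scan_index_once(h);

                pthread_mutex_lock(&h->mutex);
        }
        pthread_mutex_unlock(&h->mutex);
        return NULL;
}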
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 8fc5e766157..5da72bd5172 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -314,7 +314,7 @@ ec_get_event_from_state (ec_t *ec)
                 /* If ec is up but some subvolumes are yet to notify, give
                  * grace time for other subvols to notify to prevent start of
                  * I/O which may result in self-heals */
-                if (ec->timer && ec->xl_notify_count < ec->nodes)
+                if (ec->xl_notify_count < ec->nodes)
                         return GF_EVENT_MAXVAL;
 
                 return GF_EVENT_CHILD_UP;
@@ -336,8 +336,8 @@ ec_up (xlator_t *this, ec_t *ec)
         }
 
         ec->up = 1;
-        gf_msg (this->name, GF_LOG_INFO, 0,
-                EC_MSG_EC_UP, "Going UP");
+        gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
+
         gf_event (EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
 }
 
@@ -350,8 +350,8 @@ ec_down (xlator_t *this, ec_t *ec)
         }
 
         ec->up = 0;
-        gf_msg (this->name, GF_LOG_INFO, 0,
-                EC_MSG_EC_DOWN, "Going DOWN");
+        gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
+
         gf_event (EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
 }
 
@@ -375,31 +375,38 @@ ec_notify_cbk (void *data)
                 gf_timer_call_cancel (ec->xl->ctx, ec->timer);
                 ec->timer = NULL;
 
+                /* The timeout has expired, so any subvolume that has not
+                 * already reported its state, will be considered to be down.
+                 * We mark as if all bricks had reported. */
+                ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
+                ec->xl_notify_count = ec->nodes;
+
+                /* Since we have marked all subvolumes as notified, it's
+                 * guaranteed that ec_get_event_from_state() will return
+                 * CHILD_UP or CHILD_DOWN, but not MAXVAL. */
                 event = ec_get_event_from_state (ec);
-                /* If event is still MAXVAL then enough subvolumes didn't
-                 * notify, treat it as CHILD_DOWN. */
-                if (event == GF_EVENT_MAXVAL) {
-                        event = GF_EVENT_CHILD_DOWN;
-                        ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
-                        ec->xl_notify_count = ec->nodes;
-                } else if (event == GF_EVENT_CHILD_UP) {
-                        /* Rest of the bricks are still not coming up,
-                         * notify that ec is up. Files/directories will be
-                         * healed as in when they come up. */
+                if (event == GF_EVENT_CHILD_UP) {
+                        /* We are ready to bring the volume up. If there are
+                         * still bricks DOWN, they will be healed when they
+                         * come up. */
                         ec_up (ec->xl, ec);
                 }
 
-                /* CHILD_DOWN should not come here as no grace period is given
-                 * for notifying CHILD_DOWN. */
-
                 propagate = _gf_true;
         }
 unlock:
         UNLOCK(&ec->lock);
 
         if (propagate) {
+                if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) {
+                        /* We have just brought the volume UP, so we trigger
+                         * a self-heal check on the root directory. */
+                        ec_launch_replace_heal (ec);
+                }
+
                 default_notify (ec->xl, event, NULL);
         }
+
 }
 
 void
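Note: the hunk above marks all subvolumes as notified with the mask (1ULL << ec->nodes) - 1ULL. A standalone illustration (not patch code) of why that expression sets exactly the 'nodes' lowest bits, assuming nodes is below 64:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        int      nodes = 6;                /* disperse 6, as in the test */
        uint64_t xl_notify = (1ULL << nodes) - 1ULL;
        int      i;

        assert(xl_notify == 0x3F);         /* binary 111111 */
        for (i = 0; i < nodes; i++)
                assert(((xl_notify >> i) & 1) == 1);
        return 0;
}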
@@ -418,7 +425,7 @@ ec_launch_notify_timer (xlator_t *this, ec_t *ec)
         }
 }
 
-void
+static gf_boolean_t
 ec_handle_up (xlator_t *this, ec_t *ec, int32_t idx)
 {
         if (((ec->xl_up >> idx) & 1) == 0) { /* Duplicate event */
@@ -428,7 +435,11 @@ ec_handle_up (xlator_t *this, ec_t *ec, int32_t idx)
                 }
 
                 ec->xl_up |= 1ULL << idx;
                 ec->xl_up_count++;
+
+                return _gf_true;
         }
+
+        return _gf_false;
 }
 
 void
@@ -466,14 +477,15 @@ ec_pending_fops_completed(ec_t *ec)
 
 int32_t
 ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 {
-        ec_t             *ec        = this->private;
-        int32_t           idx       = 0;
-        int32_t           error     = 0;
-        glusterfs_event_t old_event = GF_EVENT_MAXVAL;
-        dict_t            *input    = NULL;
-        dict_t            *output   = NULL;
-        gf_boolean_t      propagate = _gf_true;
-        int32_t           orig_event = event;
+        ec_t             *ec              = this->private;
+        int32_t           idx             = 0;
+        int32_t           error           = 0;
+        glusterfs_event_t old_event       = GF_EVENT_MAXVAL;
+        dict_t            *input          = NULL;
+        dict_t            *output         = NULL;
+        gf_boolean_t      propagate       = _gf_true;
+        gf_boolean_t      needs_shd_check = _gf_false;
+        int32_t           orig_event      = event;
         struct gf_upcall *up_data   = NULL;
         struct gf_upcall_cache_invalidation *up_ci = NULL;
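Note: ec_handle_up() now reports whether a bit actually flipped. A sketch with simplified types (not the real xlator code) of why that matters: only the first CHILD_UP for a brick returns true, so duplicate notifications cannot re-trigger the self-heal check.

#include <stdbool.h>
#include <stdint.h>

static bool handle_up_sketch(uint64_t *xl_up, int idx)
{
        if (((*xl_up >> idx) & 1) == 0) {
                *xl_up |= 1ULL << idx;
                return true;    /* brick came up for the first time */
        }
        return false;           /* duplicate event: no new heal needed */
}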
@@ -502,8 +514,6 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 
         for (idx = 0; idx < ec->nodes; idx++) {
                 if (ec->xl_list[idx] == data) {
-                        if (event == GF_EVENT_CHILD_UP)
-                                ec_selfheal_childup (ec, idx);
                         break;
                 }
         }
@@ -528,17 +538,27 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
                 old_event = ec_get_event_from_state (ec);
                 if (event == GF_EVENT_CHILD_UP) {
-                        ec_handle_up (this, ec, idx);
+                        /* We need to trigger a selfheal if a brick changes
+                         * to UP state. */
+                        needs_shd_check = ec_handle_up (this, ec, idx);
                 } else if (event == GF_EVENT_CHILD_DOWN) {
                         ec_handle_down (this, ec, idx);
                 }
 
                 event = ec_get_event_from_state (ec);
-                if (event == GF_EVENT_CHILD_UP && !ec->up) {
-                        ec_up (this, ec);
-                } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {
-                        ec_down (this, ec);
+                if (event == GF_EVENT_CHILD_UP) {
+                        if (!ec->up) {
+                                ec_up (this, ec);
+                        }
+                } else {
+                        /* If the volume is not UP, it's irrelevant if one
+                         * brick has come up. We cannot heal anything. */
+                        needs_shd_check = _gf_false;
+
+                        if ((event == GF_EVENT_CHILD_DOWN) && ec->up) {
+                                ec_down (this, ec);
+                        }
                 }
 
                 if (event != GF_EVENT_MAXVAL) {
@@ -557,14 +577,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 unlock:
 
 done:
         if (propagate) {
+                if (needs_shd_check && ec->shd.iamshd) {
+                        ec_launch_replace_heal (ec);
+                }
+
                 error = default_notify (this, event, data);
         }
 
-        if (ec->shd.iamshd &&
-            ec->xl_notify_count == ec->nodes &&
-            event == GF_EVENT_CHILD_UP) {
-                ec_launch_replace_heal (ec);
-        }
 out:
         return error;
 }
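Note: a condensed view (illustrative only, not the patch's exact control flow) of the new gating in ec_notify(): a heal check launches only if a brick just transitioned to UP, the volume state still evaluates to CHILD_UP, and this process is the self-heal daemon.

#include <stdbool.h>

enum ev_sketch { EV_CHILD_UP, EV_CHILD_DOWN, EV_MAXVAL };

static bool should_launch_heal(bool first_up, enum ev_sketch event,
                               bool iamshd)
{
        if (event != EV_CHILD_UP)
                return false;   /* volume unusable: nothing can be healed */
        return first_up && iamshd;
}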
