2 files changed, 52 insertions, 28 deletions
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
index 9d64280eb35..a869735794e 100644
--- a/xlators/cluster/ec/src/ec-data.c
+++ b/xlators/cluster/ec/src/ec-data.c
@@ -104,19 +104,6 @@ void ec_cbk_data_destroy(ec_cbk_data_t * cbk)
     mem_put(cbk);
 }
 
-/* PARENT_DOWN will be notified to children only after these fops are complete
- * when graph switch happens.  We do not want graph switch to be waiting on
- * heal to complete as healing big file/directory could take a while. Which
- * will lead to hang on the mount.
- */
-static gf_boolean_t
-ec_needs_graceful_completion (ec_fop_data_t *fop)
-{
-        if ((fop->id != EC_FOP_HEAL) && (fop->id != EC_FOP_FHEAL))
-                return _gf_true;
-        return _gf_false;
-}
-
 ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
                                      int32_t id, uint32_t flags,
                                      uintptr_t target, int32_t minimum,
@@ -203,13 +190,11 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
         fop->parent = parent;
     }
 
-    if (ec_needs_graceful_completion (fop)) {
-            LOCK(&ec->lock);
+    LOCK(&ec->lock);
 
-            list_add_tail(&fop->pending_list, &ec->pending_fops);
+    list_add_tail(&fop->pending_list, &ec->pending_fops);
 
-            UNLOCK(&ec->lock);
-    }
+    UNLOCK(&ec->lock);
 
     return fop;
 }
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 8d62b01ac8a..6562adf9e24 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -1428,6 +1428,12 @@ ec_name_heal_handler (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
         int                 i          = 0;
         int                 ret        = 0;
 
+        if (ec->shutdown) {
+                gf_msg_debug(this->name, 0, "Cancelling directory heal "
+                                            "because EC is stopping.");
+                return -ENOTCONN;
+        }
+
         memcpy (name_on, name_data->participants, ec->nodes);
         ret = ec_heal_name (name_data->frame, ec, parent->inode,
                             entry->d_name, name_on);
@@ -1449,6 +1455,7 @@ ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
         int j = 0;
         loc_t loc = {0};
         struct ec_name_data name_data = {0};
+        int ret = 0;
 
         loc.inode = inode_ref (inode);
         gf_uuid_copy (loc.gfid, inode->gfid);
@@ -1459,18 +1466,23 @@ ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
         for (i = 0; i < ec->nodes; i++) {
                 if (!participants[i])
                         continue;
-                syncop_dir_scan (ec->xl_list[i], &loc,
-                                GF_CLIENT_PID_SELF_HEALD, &name_data,
-                                ec_name_heal_handler);
+                ret = syncop_dir_scan (ec->xl_list[i], &loc,
+                                       GF_CLIENT_PID_SELF_HEALD, &name_data,
+                                       ec_name_heal_handler);
+                if (ret < 0) {
+                        break;
+                }
                 for (j = 0; j < ec->nodes; j++)
                         if (name_data.failed_on[j])
                                 participants[j] = 0;
 
-                if (EC_COUNT (participants, ec->nodes) <= ec->fragments)
-                        return -ENOTCONN;
+                if (EC_COUNT (participants, ec->nodes) <= ec->fragments) {
+                        ret = -ENOTCONN;
+                        break;
+                }
         }
         loc_wipe (&loc);
-        return 0;
+        return ret;
 }
 
 int
@@ -2009,6 +2021,17 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
 
         for (heal->offset = 0; (heal->offset < size) && !heal->done;
                                                    heal->offset += heal->size) {
+                /* To avoid delays, we abort any heal immediately once a
+                 * shutdown request has been received. The healing of this
+                 * file will be restarted by another self-heal daemon (SHD)
+                 * or by any other client that accesses the file. */
+                if (ec->shutdown) {
+                        gf_msg_debug(ec->xl->name, 0, "Cancelling heal because "
+                                                      "EC is stopping.");
+                        ret = -ENOTCONN;
+                        break;
+                }
+
                 gf_msg_debug (ec->xl->name, 0, "%s: sources: %d, sinks: "
                         "%d, offset: %"PRIu64" bsize: %"PRIu64,
                         uuid_utoa (fd->inode->gfid),
@@ -2612,16 +2635,32 @@ ec_handle_healers_done (ec_fop_data_t *fop)
                 return;
 
         LOCK (&ec->lock);
-        {
-                list_del_init (&fop->healer);
+
+        list_del_init (&fop->healer);
+
+        do {
                 ec->healers--;
                 heal_fop = __ec_dequeue_heals (ec);
-        }
+
+                if ((heal_fop != NULL) && ec->shutdown) {
+                        /* This prevents ec_handle_healers_done() from being
+                         * called recursively. That would be problematic if
+                         * the queue is too big. */
+                        list_del_init(&heal_fop->healer);
+
+                        UNLOCK(&ec->lock);
+
+                        ec_fop_set_error(heal_fop, ENOTCONN);
+                        ec_heal_fail(ec, heal_fop);
+
+                        LOCK(&ec->lock);
+                }
+        } while ((heal_fop != NULL) && ec->shutdown);
+
         UNLOCK (&ec->lock);
 
         if (heal_fop)
                 ec_launch_heal (ec, heal_fop);
-
 }
 
 void