-rw-r--r--  tests/bugs/ec/bug-1547662.t        | 41
-rw-r--r--  xlators/cluster/ec/src/ec-heal.c   |  9
-rw-r--r--  xlators/cluster/ec/src/ec-heald.c  | 27
-rw-r--r--  xlators/cluster/ec/src/ec-heald.h  |  4
-rw-r--r--  xlators/cluster/ec/src/ec.c        | 97

5 files changed, 132 insertions(+), 46 deletions(-)
diff --git a/tests/bugs/ec/bug-1547662.t b/tests/bugs/ec/bug-1547662.t
new file mode 100644
index 00000000000..5748218587e
--- /dev/null
+++ b/tests/bugs/ec/bug-1547662.t
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# Immediately after replace-brick, trusted.ec.version will be absent, so if it
+# is present we can assume that heal was started on root
+function root_heal_attempted {
+        if [ -z $(get_hex_xattr trusted.ec.version $1) ]; then
+                echo "N"
+        else
+                echo "Y"
+        fi
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST ${CLI} volume create ${V0} disperse 6 redundancy 2 ${H0}:${B0}/${V0}{0..5}
+TEST ${CLI} volume start ${V0}
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+
+TEST mkdir ${M0}/base
+TEST mkdir ${M0}/base/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}
+
+TEST ${CLI} volume replace-brick ${V0} ${H0}:${B0}/${V0}5 ${H0}:${B0}/${V0}6 commit force
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0
+EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "Y" glustershd_up_status
+EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count_shd ${V0} 0
+EXPECT_WITHIN ${HEAL_TIMEOUT} "Y" root_heal_attempted ${B0}/${V0}6
+EXPECT_WITHIN ${HEAL_TIMEOUT} "^0$" get_pending_heal_count ${V0}
+EXPECT "^127$" echo $(find ${B0}/${V0}6/base -type d | wc -l)
+
+cleanup;
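Note (reviewer aside, not part of the patch): the final EXPECT counts 127 directories because each nested mkdir level doubles the directory count, so base/ plus six levels form a complete binary tree: 1 + 2 + 4 + ... + 64 = 2^7 - 1 = 127. A standalone check of that arithmetic:

#include <assert.h>

int main(void)
{
        int total = 0;
        int at_level = 1;
        int level;

        for (level = 0; level <= 6; level++) {  /* base/ plus 6 levels */
                total += at_level;
                at_level *= 2;
        }
        assert(total == 127);  /* matches EXPECT "^127$" above */
        return 0;
}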
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index e4f9c0e3539..aeb890927c5 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -25,6 +25,7 @@
 #include "ec-combine.h"
 #include "ec-method.h"
 #include "ec-fops.h"
+#include "ec-heald.h"
 
 #define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; })
 #define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res; })
@@ -2631,6 +2632,14 @@ ec_replace_heal (ec_t *ec, inode_t *inode)
                 gf_msg_debug (ec->xl->name, 0,
                         "Heal failed for replace brick ret = %d", ret);
 
+        /* Once the root inode has been checked, it might have triggered a
+         * self-heal on it after a replace brick command or for some other
+         * reason. It can also happen that the volume already had damaged
+         * files in the index, even if the heal on the root directory failed.
+         * In both cases we need to wake all index healers to continue
+         * healing remaining entries that are marked as dirty. */
+        ec_shd_index_healer_wake(ec);
+
         loc_wipe (&loc);
         return ret;
 }
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index ffb78d5e950..6c1cefda39e 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -184,8 +184,19 @@ ec_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
 int
 ec_shd_selfheal (struct subvol_healer *healer, int child, loc_t *loc)
 {
-        return syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
-                                NULL);
+        int32_t ret;
+
+        ret = syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL,
+                               NULL);
+        if ((ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
+                /* If we have just healed a directory, it's possible that
+                 * other index entries have appeared to be healed. We put a
+                 * mark so that we can check it later and restart a scan
+                 * without delay. */
+                healer->rerun = _gf_true;
+        }
+
+        return ret;
 }
 
@@ -466,11 +477,15 @@ ec_shd_index_healer_spawn (xlator_t *this, int subvol)
 }
 
 void
-ec_selfheal_childup (ec_t *ec, int child)
+ec_shd_index_healer_wake(ec_t *ec)
 {
-        if (!ec->shd.iamshd)
-                return;
-        ec_shd_index_healer_spawn (ec->xl, child);
+        int32_t i;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (((ec->xl_up >> i) & 1) != 0) {
+                        ec_shd_index_healer_spawn(ec->xl, i);
+                }
+        }
 }
 
 int
diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h
index 4ae02e2df3c..2a8488124c4 100644
--- a/xlators/cluster/ec/src/ec-heald.h
+++ b/xlators/cluster/ec/src/ec-heald.h
@@ -20,6 +20,8 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output);
 int
 ec_selfheal_daemon_init (xlator_t *this);
 
-void ec_selfheal_childup (ec_t *ec, int child);
+
+void
+ec_shd_index_healer_wake(ec_t *ec);
 
 #endif /* __EC_HEALD_H__ */
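Note: the ec_shd_selfheal() change above depends on the healer's rerun flag. A minimal sketch of that pattern, with hypothetical names rather than the real ec-heald structures: the healer thread sleeps until woken, but if rerun was set while it was scanning, it starts the next index scan immediately instead of waiting.

#include <pthread.h>
#include <stdbool.h>

struct healer_sketch {
        pthread_mutex_t mutex;
        pthread_cond_t  cond;
        bool            rerun;
        bool            running;
};

/* Placeholder for one pass over the heal index; healing a directory in
 * this pass may set h->rerun again. */
static void scan_index_once(struct healer_sketch *h)
{
        (void)h;
}

static void *healer_loop(void *arg)
{
        struct healer_sketch *h = arg;

        pthread_mutex_lock(&h->mutex);
        while (h->running) {
                if (!h->rerun) {
                        /* Nothing pending: wait until something like
                         * ec_shd_index_healer_wake() signals us. */
                        pthread_cond_wait(&h->cond, &h->mutex);
                }
                h->rerun = false;
                pthread_mutex_unlock(&h->mutex);

                scan_index_once(h);

                pthread_mutex_lock(&h->mutex);
        }
        pthread_mutex_unlock(&h->mutex);
        return NULL;
}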
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 8fc5e766157..5da72bd5172 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -314,7 +314,7 @@ ec_get_event_from_state (ec_t *ec)
                 /* If ec is up but some subvolumes are yet to notify, give
                  * grace time for other subvols to notify to prevent start of
                  * I/O which may result in self-heals */
-                if (ec->timer && ec->xl_notify_count < ec->nodes)
+                if (ec->xl_notify_count < ec->nodes)
                         return GF_EVENT_MAXVAL;
 
                 return GF_EVENT_CHILD_UP;
@@ -336,8 +336,8 @@ ec_up (xlator_t *this, ec_t *ec)
         }
 
         ec->up = 1;
-        gf_msg (this->name, GF_LOG_INFO, 0,
-                EC_MSG_EC_UP, "Going UP");
+        gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP");
+
         gf_event (EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name);
 }
 
@@ -350,8 +350,8 @@ ec_down (xlator_t *this, ec_t *ec)
         }
 
         ec->up = 0;
-        gf_msg (this->name, GF_LOG_INFO, 0,
-                EC_MSG_EC_DOWN, "Going DOWN");
+        gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN");
+
         gf_event (EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name);
 }
 
@@ -375,31 +375,38 @@ ec_notify_cbk (void *data)
                 gf_timer_call_cancel (ec->xl->ctx, ec->timer);
                 ec->timer = NULL;
 
+                /* The timeout has expired, so any subvolume that has not
+                 * already reported its state, will be considered to be down.
+                 * We mark as if all bricks had reported. */
+                ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
+                ec->xl_notify_count = ec->nodes;
+
+                /* Since we have marked all subvolumes as notified, it's
+                 * guaranteed that ec_get_event_from_state() will return
+                 * CHILD_UP or CHILD_DOWN, but not MAXVAL. */
                 event = ec_get_event_from_state (ec);
-                /* If event is still MAXVAL then enough subvolumes didn't
-                 * notify, treat it as CHILD_DOWN. */
-                if (event == GF_EVENT_MAXVAL) {
-                        event = GF_EVENT_CHILD_DOWN;
-                        ec->xl_notify = (1ULL << ec->nodes) - 1ULL;
-                        ec->xl_notify_count = ec->nodes;
-                } else if (event == GF_EVENT_CHILD_UP) {
-                        /* Rest of the bricks are still not coming up,
-                         * notify that ec is up. Files/directories will be
-                         * healed as in when they come up. */
+                if (event == GF_EVENT_CHILD_UP) {
+                        /* We are ready to bring the volume up. If there are
+                         * still bricks DOWN, they will be healed when they
+                         * come up. */
                         ec_up (ec->xl, ec);
                 }
 
-                /* CHILD_DOWN should not come here as no grace period is given
-                 * for notifying CHILD_DOWN. */
-
                 propagate = _gf_true;
         }
 unlock:
         UNLOCK(&ec->lock);
 
         if (propagate) {
+                if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) {
+                        /* We have just brought the volume UP, so we trigger
+                         * a self-heal check on the root directory. */
+                        ec_launch_replace_heal (ec);
+                }
+
                 default_notify (ec->xl, event, NULL);
         }
+
 }
 
 void
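Note: the hunk above marks all subvolumes as notified with the mask (1ULL << ec->nodes) - 1ULL. A standalone illustration (not patch code) of why that expression sets exactly the 'nodes' lowest bits, assuming nodes is below 64:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        int      nodes = 6;                /* disperse 6, as in the test */
        uint64_t xl_notify = (1ULL << nodes) - 1ULL;
        int      i;

        assert(xl_notify == 0x3F);         /* binary 111111 */
        for (i = 0; i < nodes; i++)
                assert(((xl_notify >> i) & 1) == 1);
        return 0;
}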
@@ -418,7 +425,7 @@ ec_launch_notify_timer (xlator_t *this, ec_t *ec)
         }
 }
 
-void
+static gf_boolean_t
 ec_handle_up (xlator_t *this, ec_t *ec, int32_t idx)
 {
         if (((ec->xl_up >> idx) & 1) == 0) { /* Duplicate event */
@@ -428,7 +435,11 @@ ec_handle_up (xlator_t *this, ec_t *ec, int32_t idx)
                 }
 
                 ec->xl_up |= 1ULL << idx;
                 ec->xl_up_count++;
+
+                return _gf_true;
         }
+
+        return _gf_false;
 }
 
 void
@@ -466,14 +477,15 @@ ec_pending_fops_completed(ec_t *ec)
 
 int32_t
 ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 {
-        ec_t             *ec        = this->private;
-        int32_t           idx       = 0;
-        int32_t           error     = 0;
-        glusterfs_event_t old_event = GF_EVENT_MAXVAL;
-        dict_t            *input    = NULL;
-        dict_t            *output   = NULL;
-        gf_boolean_t      propagate = _gf_true;
-        int32_t           orig_event = event;
+        ec_t             *ec              = this->private;
+        int32_t           idx             = 0;
+        int32_t           error           = 0;
+        glusterfs_event_t old_event       = GF_EVENT_MAXVAL;
+        dict_t            *input          = NULL;
+        dict_t            *output         = NULL;
+        gf_boolean_t      propagate       = _gf_true;
+        gf_boolean_t      needs_shd_check = _gf_false;
+        int32_t           orig_event      = event;
         struct gf_upcall *up_data   = NULL;
         struct gf_upcall_cache_invalidation *up_ci = NULL;
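Note: ec_handle_up() now reports whether a bit actually flipped. A sketch with simplified types (not the real xlator code) of why that matters: only the first CHILD_UP for a brick returns true, so duplicate notifications cannot re-trigger the self-heal check.

#include <stdbool.h>
#include <stdint.h>

static bool handle_up_sketch(uint64_t *xl_up, int idx)
{
        if (((*xl_up >> idx) & 1) == 0) {
                *xl_up |= 1ULL << idx;
                return true;    /* brick came up for the first time */
        }
        return false;           /* duplicate event: no new heal needed */
}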
@@ -502,8 +514,6 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 
         for (idx = 0; idx < ec->nodes; idx++) {
                 if (ec->xl_list[idx] == data) {
-                        if (event == GF_EVENT_CHILD_UP)
-                                ec_selfheal_childup (ec, idx);
                         break;
                 }
         }
@@ -528,17 +538,27 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
                 old_event = ec_get_event_from_state (ec);
                 if (event == GF_EVENT_CHILD_UP) {
-                        ec_handle_up (this, ec, idx);
+                        /* We need to trigger a selfheal if a brick changes
+                         * to UP state. */
+                        needs_shd_check = ec_handle_up (this, ec, idx);
                 } else if (event == GF_EVENT_CHILD_DOWN) {
                         ec_handle_down (this, ec, idx);
                 }
 
                 event = ec_get_event_from_state (ec);
-                if (event == GF_EVENT_CHILD_UP && !ec->up) {
-                        ec_up (this, ec);
-                } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {
-                        ec_down (this, ec);
+                if (event == GF_EVENT_CHILD_UP) {
+                        if (!ec->up) {
+                                ec_up (this, ec);
+                        }
+                } else {
+                        /* If the volume is not UP, it's irrelevant if one
+                         * brick has come up. We cannot heal anything. */
+                        needs_shd_check = _gf_false;
+
+                        if ((event == GF_EVENT_CHILD_DOWN) && ec->up) {
+                                ec_down (this, ec);
+                        }
                 }
 
                 if (event != GF_EVENT_MAXVAL) {
@@ -557,14 +577,13 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)
 unlock:
 
 done:
         if (propagate) {
+                if (needs_shd_check && ec->shd.iamshd) {
+                        ec_launch_replace_heal (ec);
+                }
+
                 error = default_notify (this, event, data);
         }
 
-        if (ec->shd.iamshd &&
-            ec->xl_notify_count == ec->nodes &&
-            event == GF_EVENT_CHILD_UP) {
-                ec_launch_replace_heal (ec);
-        }
 out:
         return error;
 }
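Note: a condensed view (illustrative only, not the patch's exact control flow) of the new gating in ec_notify(): a heal check launches only if a brick just transitioned to UP, the volume state still evaluates to CHILD_UP, and this process is the self-heal daemon.

#include <stdbool.h>

enum ev_sketch { EV_CHILD_UP, EV_CHILD_DOWN, EV_MAXVAL };

static bool should_launch_heal(bool first_up, enum ev_sketch event,
                               bool iamshd)
{
        if (event != EV_CHILD_UP)
                return false;   /* volume unusable: nothing can be healed */
        return first_up && iamshd;
}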
