diff options
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 188 | 
2 files changed, 138 insertions, 51 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 2979966c569..339eebe1564 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -21,6 +21,7 @@  #define __AFR_SELF_HEAL_COMMON_H__  #define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) +#define AFR_SH_MIN_PARTICIPANTS 2  typedef enum {          AFR_SELF_HEAL_ENTRY, diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 1c687a4e4fa..3df6a0e8705 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -49,6 +49,14 @@  #include "afr-self-heal-common.h"  #include "afr-self-heal-algorithm.h" +int +afr_sh_data_fail (call_frame_t *frame, xlator_t *this); + +static inline gf_boolean_t +afr_sh_data_proceed (unsigned int success_count) +{ +        return (success_count >= AFR_SH_MIN_PARTICIPANTS); +}  extern int  sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); @@ -259,9 +267,14 @@ afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie,          sh = &local->self_heal;          GF_ASSERT (sh->source == child_index); -        if (op_ret != -1) +        if (op_ret != -1) {                  sh->buf[child_index] = *buf; -        afr_sh_data_setattr (frame, this); +                afr_sh_data_setattr (frame, this); +        } else { +                gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " +                        "time-stamps after self-heal", local->loc.path); +                afr_sh_data_fail (frame, this); +        }          return 0;  } @@ -360,12 +373,27 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,          int             call_count = 0;          afr_local_t     *local = NULL;          afr_self_heal_t *sh = NULL; +        afr_private_t   *priv = NULL; +        int32_t         child_index = (long) cookie; + +        priv  = this->private; +        local = frame->local; +        sh    = &local->self_heal; +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " +                        "log failed on %s for subvol %s, reason: %s", +                        local->loc.path, priv->children[child_index]->name, +                        strerror (op_errno)); +                sh->op_failed = 1; +        }          call_count = afr_frame_return (frame);          if (call_count == 0) { -                local = frame->local; -                sh = &local->self_heal; +                if (sh->op_failed) { +                        afr_sh_data_fail (frame, this); +                        goto out; +                }                  if (!IA_ISREG (sh->type)) {                          afr_sh_data_finish (frame, this);                          goto out; @@ -550,36 +578,44 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                        int32_t op_ret, int32_t op_errno, struct iatt *prebuf,                        struct iatt *postbuf, dict_t *xdata)  { -        afr_private_t * priv = NULL; -        afr_local_t * local  = NULL;          int              call_count = 0;          int              child_index = 0; +        afr_private_t    *priv = NULL; +        afr_local_t      *local  = NULL; +        afr_self_heal_t  *sh = NULL; -        priv = this->private; +        priv  = this->private;          local = frame->local; +        sh    = &local->self_heal;          child_index = (long) cookie;          LOCK (&frame->lock);          { -                if (op_ret == -1) -                        gf_log (this->name, GF_LOG_INFO, +                if (op_ret == -1) { +                        gf_log (this->name, GF_LOG_ERROR,                                  "ftruncate of %s on subvolume %s failed (%s)",                                  local->loc.path,                                  priv->children[child_index]->name,                                  strerror (op_errno)); -                else +                        sh->op_failed = 1; +                } else {                          gf_log (this->name, GF_LOG_DEBUG,                                  "ftruncate of %s on subvolume %s completed",                                  local->loc.path,                                  priv->children[child_index]->name); +                }          }          UNLOCK (&frame->lock);          call_count = afr_frame_return (frame); -        if (call_count == 0) -                afr_sh_data_sync_prepare (frame, this); +        if (call_count == 0) { +                if (sh->op_failed) +                        afr_sh_data_fail (frame, this); +                else +                        afr_sh_data_sync_prepare (frame, this); +        }          return 0;  } @@ -849,6 +885,12 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,                          sh->buf[child_index] = *buf;                          sh->success_children[sh->success_count] = child_index;                          sh->success_count++; +                } else { +                        gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " +                                "on %s, reason %s", local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                        sh->child_errno[child_index] = op_errno;                  }          }          UNLOCK (&frame->lock); @@ -859,13 +901,20 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,                  /* Previous versions of glusterfs might have set                   * the pending data xattrs which need to be erased                   */ +                if (!afr_sh_data_proceed (sh->success_count)) { +                        gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " +                                "succeeded on < %d children, aborting " +                                "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, +                                local->loc.path); +                        afr_sh_data_fail (frame, this); +                        goto out; +                }                  if (IA_ISREG (buf->ia_type))                          afr_sh_data_fix (frame, this);                  else                          afr_sh_data_special_file_fix (frame, this); -          } - +out:          return 0;  } @@ -876,33 +925,43 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)          afr_self_heal_t *sh    = NULL;          afr_local_t     *local = NULL;          afr_private_t   *priv  = NULL; -        int call_count = 0; -        int i = 0; +        int             call_count = 0; +        int             i = 0; +        int             child = 0; +        int32_t         *fstat_children = NULL;          priv  = this->private;          local = frame->local;          sh    = &local->self_heal; -        call_count = afr_up_children_count (local->child_up, -                                            priv->child_count); - +        fstat_children = memdup (sh->success_children, +                                 sizeof (*fstat_children) * priv->child_count); +        if (!fstat_children) { +                afr_sh_data_fail (frame, this); +                goto out; +        } +        call_count = sh->success_count;          local->call_count = call_count;          afr_reset_children (sh->success_children, priv->child_count); +        memset (sh->child_errno, 0, +                sizeof (*sh->child_errno) * priv->child_count);          sh->success_count = 0;          for (i = 0; i < priv->child_count; i++) { -                if (local->child_up[i]) { -                        STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, -                                           (void *) (long) i, -                                           priv->children[i], -                                           priv->children[i]->fops->fstat, -                                           sh->healing_fd, NULL); - -                        if (!--call_count) -                                break; -                } +                child = fstat_children[i]; +                if (child == -1) +                        break; +                STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, +                                   (void *) (long) child, +                                   priv->children[child], +                                   priv->children[child]->fops->fstat, +                                   sh->healing_fd, NULL); +                --call_count;          } - +        GF_ASSERT (!call_count); +out: +        if (fstat_children) +                GF_FREE (fstat_children);          return 0;  } @@ -931,6 +990,12 @@ afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie,                          sh->xattr[child_index] = dict_ref (xattr);                          sh->success_children[sh->success_count] = child_index;                          sh->success_count++; +                } else { +                        gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s " +                                "failed on %s, reason %s", local->loc.path, +                                priv->children[child_index]->name, +                                strerror (op_errno)); +                        sh->child_errno[child_index] = op_errno;                  }          }          UNLOCK (&frame->lock); @@ -952,19 +1017,27 @@ afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,          local = frame->local;          sh = &local->self_heal;          call_count = afr_frame_return (frame); -        if (call_count == 0) { -                (void) afr_build_sources (this, sh->xattr, NULL, -                                          sh->pending_matrix, -                                          sh->sources, sh->success_children, -                                          AFR_DATA_TRANSACTION, NULL, _gf_false); -                ret = afr_sh_inode_set_read_ctx (sh, this); -                if (ret) -                        afr_sh_data_fail (frame, this); -                else -                        afr_sh_set_timestamps (frame, this); -        } +        if (call_count) +                goto out; -        return 0; +        if (!afr_sh_data_proceed (sh->success_count)) { +                gf_log (this->name, GF_LOG_ERROR, "%s, inspecting change log " +                        "succeeded on < %d children", local->loc.path, +                        AFR_SH_MIN_PARTICIPANTS); +                afr_sh_data_fail (frame, this); +                goto out; +         } +         (void) afr_build_sources (this, sh->xattr, NULL, +                                   sh->pending_matrix, +                                   sh->sources, sh->success_children, +                                   AFR_DATA_TRANSACTION, NULL, _gf_false); +         ret = afr_sh_inode_set_read_ctx (sh, this); +         if (ret) +                 afr_sh_data_fail (frame, this); +         else +                 afr_sh_set_timestamps (frame, this); +out: +         return 0;  }  int @@ -972,16 +1045,28 @@ afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,                            xlator_t *this, int32_t op_ret, int32_t op_errno,                            dict_t *xattr, dict_t *xdata)  { -        int call_count  = -1; +        int             call_count  = -1; +        afr_local_t     *local  = NULL; +        afr_self_heal_t *sh     = NULL; + +        local = frame->local; +        sh    = &local->self_heal;          afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret,                                               op_errno, xattr);          call_count = afr_frame_return (frame);          if (call_count == 0) { +                if (!afr_sh_data_proceed (sh->success_count)) { +                        gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " +                                "change log succeeded on < %d children", +                                local->loc.path, AFR_SH_MIN_PARTICIPANTS); +                        afr_sh_data_fail (frame, this); +                        goto out; +                }                  afr_sh_data_fstat (frame, this);          } - +out:          return 0;  } @@ -1035,6 +1120,8 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this,          afr_reset_xattr (sh->xattr, priv->child_count);          afr_reset_children (sh->success_children, priv->child_count); +        memset (sh->child_errno, 0, +                sizeof (*sh->child_errno) * priv->child_count);          sh->success_count = 0;          for (i = 0; i < priv->child_count; i++) {                  if (local->child_up[i]) { @@ -1057,8 +1144,7 @@ out:          if (ret) {                  if (zero_pending)                          GF_FREE (zero_pending); -                sh->op_failed = 1; -                afr_sh_data_done (frame, this); +                afr_sh_data_fail (frame, this);          }          return 0; @@ -1240,12 +1326,12 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                                  priv->children[child_index]->name,                                  strerror (op_errno));                          sh->op_failed = 1; +                } else { +                        gf_log (this->name, GF_LOG_TRACE, +                                "open of %s succeeded on child %s", +                                local->loc.path, +                                priv->children[child_index]->name);                  } - -                gf_log (this->name, GF_LOG_TRACE, -                        "open of %s succeeded on child %s", -                        local->loc.path, -                        priv->children[child_index]->name);          }          UNLOCK (&frame->lock);  | 
