diff options
| author | Susant Palai <spalai@redhat.com> | 2016-12-13 14:26:53 +0530 | 
|---|---|---|
| committer | Raghavendra G <rgowdapp@redhat.com> | 2016-12-27 21:54:41 -0800 | 
| commit | 58deefde3c279fd867304118d60970bdfa72cd93 (patch) | |
| tree | d2562581dfb97b4eba7238c44b16e7cb28cc49f0 | |
| parent | e657973372c35de91fd447b74fd0229e60f24f42 (diff) | |
dht/rebalance: reverify lookup failures
race: readdirp has read one entry, and doing a lookup on
that entry, but user might have renamed/removed that entry just
after readdirp but before lookup.
Since remove-brick is a costly opertaion,will ingore any
ENOENT/ESTALE failures and move on.
> Change-Id: I62c7fa93c0b9b7e764065ad1574b97acf51b5996
> BUG: 1408115
> Signed-off-by: Susant Palai <spalai@redhat.com>
> Reviewed-on: http://review.gluster.org/15846
> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Change-Id: I62c7fa93c0b9b7e764065ad1574b97acf51b5996
BUG: 1408414
Reviewed-on: http://review.gluster.org/15846
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
(cherry picked from commit c20febcb1ffaef3fa29563987e7a3b554aea27b3)
Signed-off-by: Susant Palai <spalai@redhat.com>
Reviewed-on: http://review.gluster.org/16278
| -rw-r--r-- | xlators/cluster/dht/src/dht-messages.h | 11 | ||||
| -rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 177 | 
2 files changed, 130 insertions, 58 deletions
diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h index 153f4de0458..30b64eb5711 100644 --- a/xlators/cluster/dht/src/dht-messages.h +++ b/xlators/cluster/dht/src/dht-messages.h @@ -40,7 +40,7 @@   */  #define GLFS_DHT_BASE                   GLFS_MSGID_COMP_DHT -#define GLFS_DHT_NUM_MESSAGES           117 +#define GLFS_DHT_NUM_MESSAGES           118  #define GLFS_MSGID_END          (GLFS_DHT_BASE + GLFS_DHT_NUM_MESSAGES + 1)  /* Messages with message IDs */ @@ -1072,11 +1072,18 @@  #define DHT_MSG_LOCK_INODE_UNREF_FAILED  (GLFS_DHT_BASE + 116)  /* - * @messageid 109116 + * @messageid 109117   * @diagnosis   * @recommendedaction None   */  #define DHT_MSG_ASPRINTF_FAILED         (GLFS_DHT_BASE + 117) +/* + * @messageid 109118 + * @diagnosis + * @recommendedaction None + */ +#define DHT_MSG_DIR_LOOKUP_FAILED          (GLFS_DHT_BASE + 118) +  #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"  #endif /* _DHT_MESSAGES_H_ */ diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 5fd9c86d363..1d145855ed7 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -2362,6 +2362,7 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,          struct dht_container   *tmp_container   = NULL;          xlator_t               *hashed_subvol   = NULL;          xlator_t               *cached_subvol   = NULL; +        int                     fop_errno       = 0;          if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {                  ret = -1; @@ -2385,11 +2386,11 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,                  }                  if (ret < 0) { -                        gf_msg (this->name, GF_LOG_ERROR, 0, +                        gf_msg (this->name, GF_LOG_WARNING, -ret,                                  DHT_MSG_MIGRATE_DATA_FAILED, -                                "%s: Migrate data failed: Readdir returned" -                                " %s. Aborting migrate-data", loc->path, -                                strerror(-ret)); +                                "Readdirp failed. Aborting data migration for " +                                "directory: %s", loc->path); +                        fop_errno = -ret;                          ret = -1;                          goto out;                  } @@ -2521,9 +2522,9 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,                  ret = syncop_lookup (this, &entry_loc, NULL, NULL,                                       NULL, NULL);                  if (ret) { -                        gf_msg (this->name, GF_LOG_ERROR, 0, +                        gf_msg (this->name, GF_LOG_WARNING, -ret,                                  DHT_MSG_MIGRATE_FILE_FAILED, -                                "Migrate file failed:%s lookup failed", +                                "lookup failed for file:%s",                                  entry_loc.path);                          if (-ret != ENOENT && -ret != ESTALE) { @@ -2644,6 +2645,9 @@ out:          if (xattr_rsp)                  dict_unref (xattr_rsp); + + +        errno = fop_errno;          return ret;  } @@ -2669,6 +2673,7 @@ gf_defrag_process_dir (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,          int                      throttle_up       = 0;          struct dir_dfmeta       *dir_dfmeta        = NULL;          int                      should_commit_hash = 1; +        int                      fop_errno         = 0;          gf_log (this->name, GF_LOG_INFO, "migrate data called on %s",                  loc->path); @@ -2691,10 +2696,11 @@ gf_defrag_process_dir (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,          ret = syncop_opendir (this, loc, fd, NULL, NULL);          if (ret) { -                gf_msg (this->name, GF_LOG_ERROR, 0, +                gf_msg (this->name, GF_LOG_WARNING, 0,                          DHT_MSG_MIGRATE_DATA_FAILED,                          "Migrate data failed: Failed to open dir %s",                          loc->path); +                fop_errno = -ret;                  ret = -1;                  goto out;          } @@ -2841,9 +2847,12 @@ gf_defrag_process_dir (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                                                     migrate_data, dir_dfmeta,                                                     xattr_req,                                                     &should_commit_hash); +                          if (ret) { -                                gf_log ("DHT", GF_LOG_INFO, "Found critical " +                                fop_errno = errno; +                                gf_log ("this->name", GF_LOG_WARNING, "Found "                                          "error from gf_defrag_get_entry"); +                                  ret = -1;                                  goto out;                          } @@ -2901,6 +2910,7 @@ out:                  ret = 2;          } +        errno = fop_errno;          return ret;  }  int @@ -3070,7 +3080,6 @@ out:          return ret;  } -  int  gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                    dict_t *fix_layout, dict_t *migrate_data) @@ -3094,14 +3103,33 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                  goto out;          } - -          ret = syncop_lookup (this, loc, &iatt, NULL, NULL, NULL);          if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s", -                        loc->path); -                ret = -1; -                goto out; +                if (strcmp (loc->path, "/") == 0) { +                        gf_msg (this->name, GF_LOG_ERROR, -ret, +                                DHT_MSG_DIR_LOOKUP_FAILED, +                                "lookup failed for:%s", loc->path); + +                        defrag->total_failures++; +                        ret = -1; +                        goto out; +                } + +                if (-ret == ENOENT || -ret == ESTALE) { +                        gf_msg (this->name, GF_LOG_INFO, errno, +                                DHT_MSG_DIR_LOOKUP_FAILED, +                                "Dir:%s renamed or removed. Skipping", +                                loc->path); +                                ret = 0; +                                goto out; +                } else { +                        gf_msg (this->name, GF_LOG_ERROR, -ret, +                                DHT_MSG_DIR_LOOKUP_FAILED, +                                "lookup failed for:%s", loc->path); + +                        defrag->total_failures++; +                        goto out; +                }          }          if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) && @@ -3109,18 +3137,24 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                  ret = gf_defrag_process_dir (this, defrag, loc, migrate_data);                  if (ret && ret != 2) { -                        defrag->total_failures++; +                        if (errno == ENOENT || errno == ESTALE) { +                                ret = 0; +                                goto out; +                        } else { -                        gf_msg (this->name, GF_LOG_ERROR, 0, -                                DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, -                                "gf_defrag_process_dir failed for directory: %s" -                                , loc->path); +                                defrag->total_failures++; -                        if (conf->decommission_in_progress) { -                                goto out; -                        } +                                gf_msg (this->name, GF_LOG_ERROR, 0, +                                        DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, +                                        "gf_defrag_process_dir failed for " +                                        "directory: %s", loc->path); -                        should_commit_hash = 0; +                                if (conf->decommission_in_progress) { +                                        goto out; +                                } + +                                should_commit_hash = 0; +                        }                  } else if (ret == 2) {                          should_commit_hash = 0;                  } @@ -3137,8 +3171,14 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,          ret = syncop_opendir (this, loc, fd, NULL, NULL);          if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", -                        loc->path); +                if (-ret == ENOENT || -ret == ESTALE) { +                        ret = 0; +                        goto out; +                } + +                gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s, " +                        "err:%d", loc->path, -ret); +                  ret = -1;                  goto out;          } @@ -3150,8 +3190,15 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,          {                  if (ret < 0) { -                        gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s" -                                ". Aborting fix-layout",strerror(-ret)); +                        if (-ret == ENOENT || -ret == ESTALE) { +                                ret = 0; +                                goto out; +                        } + +                        gf_msg (this->name, GF_LOG_ERROR, -ret, +                                DHT_MSG_READDIR_ERROR, "readdirp failed for " +                                "path %s. Aborting fix-layout", loc->path); +                          ret = -1;                          goto out;                  } @@ -3243,43 +3290,63 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                          ret = syncop_lookup (this, &entry_loc, &iatt, NULL,                                               NULL, NULL); -                        /*Check whether it is ENOENT or ESTALE*/                          if (ret) { -                                gf_log (this->name, GF_LOG_ERROR, "%s" -                                        " lookup failed with %d", -                                        entry_loc.path, -ret); - -                                if (!conf->decommission_in_progress && -                                    -ret != ENOENT && -ret != ESTALE) { -                                        should_commit_hash = 0; +                                if (-ret == ENOENT || -ret == ESTALE) { +                                        gf_msg (this->name, GF_LOG_INFO, errno, +                                                DHT_MSG_DIR_LOOKUP_FAILED, +                                                "Dir:%s renamed or removed. " +                                                "Skipping", loc->path); +                                                ret = 0; +                                        continue; +                                } else { +                                        gf_msg (this->name, GF_LOG_ERROR, -ret, +                                                DHT_MSG_DIR_LOOKUP_FAILED, +                                                "lookup failed for:%s", +                                                entry_loc.path); +                                        defrag->total_failures++; +                                        if (conf->decommission_in_progress) { +                                                defrag->defrag_status = +                                                GF_DEFRAG_STATUS_FAILED; +                                                ret = -1; +                                                goto out; +                                        } else { +                                                should_commit_hash = 0; +                                                continue; +                                        }                                  } - -                                continue;                          }                          ret = syncop_setxattr (this, &entry_loc, fix_layout,                                                 0, NULL, NULL);                          if (ret) { -                                gf_log (this->name, GF_LOG_ERROR, "Setxattr " -                                        "failed for %s", entry_loc.path); - -                                defrag->total_failures++; - -                                /*Don't go for fix-layout of child subtree if" -                                  fix-layout failed*/ -                                if (conf->decommission_in_progress) { -                                        defrag->defrag_status = -                                        GF_DEFRAG_STATUS_FAILED; - -                                        ret = -1; - -                                        goto out; -                                } else { +                                if (-ret == ENOENT || -ret == ESTALE) { +                                        gf_msg (this->name, GF_LOG_INFO, -ret, +                                                DHT_MSG_LAYOUT_FIX_FAILED, +                                                "Setxattr failed. Dir %s " +                                                "renamed or removed", +                                                entry_loc.path); +                                        ret = 0;                                          continue; +                                } else { + +                                        gf_msg (this->name, GF_LOG_ERROR, -ret, +                                                DHT_MSG_LAYOUT_FIX_FAILED, +                                                "Setxattr failed for %s", +                                                entry_loc.path); + +                                        defrag->total_failures++; + +                                        if (conf->decommission_in_progress) { +                                                defrag->defrag_status = +                                                GF_DEFRAG_STATUS_FAILED; +                                                ret = -1; +                                                goto out; +                                        } else { +                                                continue; +                                        }                                  }                          } -                          /* A return value of 2 means, either process_dir or                           * lookup of a dir failed. Hence, don't commit hash                           * for the current directory*/ @@ -3299,8 +3366,6 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,                                          defrag->defrag_status =                                          GF_DEFRAG_STATUS_FAILED; -                                        ret = -1; -                                          goto out;                                  } else {                                          /* Let's not commit-hash if  | 
