author    | Susant Palai <spalai@redhat.com>    | 2015-06-16 20:35:46 +0530
committer | Raghavendra G <rgowdapp@redhat.com> | 2015-08-23 23:06:31 -0700
commit    | 1626fb105c99ef623be3687f3f48d9247ab9b7c4 (patch)
tree      | cda2fc88f521eaf69041ca7a8d343dad02f7c87c
parent    | a586b30c1bd968d23562406cefbb76b82a0e236c (diff)
dht: block/handle create op falling to decommissioned brick
Problem:
Between the start and the commit phase of a remove-brick operation, the
client layout may not be in sync with the on-disk layout because no
lookup has refreshed it. Hence, a create call may land on the
decommissioned brick.
Solution:
Acquire a lock on the hashed subvolume so that a fix-layout or selfheal
cannot change the layout while it is being read. Even if the layout is
read before the remove-brick fix-layout runs and the file lands on the
decommissioned brick, the file will eventually be migrated to a
non-decommissioned brick as per the new layout.
Change-Id: If84a12ec34f981adb2b9b224e80f535cfe5bf9f2
BUG: 1232378
Signed-off-by: Susant Palai <spalai@redhat.com>
Reviewed-on: http://review.gluster.org/11260
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
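
For readers skimming the patch below, the control flow it adds to dht_create() can be summarized as: if the hashed subvolume is on the decommission list, take a read inodelk on the parent directory, refresh the layout under that lock, and only then pick the target subvolume. The following sketch is a self-contained toy model of that flow; the types and helpers in it are invented for illustration and are not the GlusterFS API.

```c
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    const char *name;
    bool        decommissioned;
} subvol_t;

static subvol_t bricks[] = {
    { "brick-0", true  },   /* being removed via remove-brick */
    { "brick-1", false },
};

/* pretend layout lookup: the stale client layout still hashes the file to
 * the decommissioned brick; the refreshed layout does not */
static subvol_t *layout_search(bool refreshed)
{
    return refreshed ? &bricks[1] : &bricks[0];
}

static void wind_create(subvol_t *sv, const char *path)
{
    /* in the patch this is dht_create_wind_to_avail_subvol(), which also
     * redirects to the least-full subvolume when `sv` is over-full */
    printf("create %s on %s\n", path, sv->name);
}

static void create(const char *path)
{
    subvol_t *sv = layout_search(false);

    if (sv->decommissioned) {
        /* patch: dht_build_parent_loc() + dht_create_lock() take a blocking
         * read inodelk on the parent in the layout-heal domain, then
         * dht_refresh_layout() re-reads the on-disk layout before
         * dht_create_do() re-hashes the file name against it */
        sv = layout_search(true);
    }

    wind_create(sv, path);
}

int main(void)
{
    create("/dir/newfile");
    return 0;
}
```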
-rw-r--r-- | xlators/cluster/dht/src/dht-common.c    | 456
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h    |  11
-rw-r--r-- | xlators/cluster/dht/src/dht-diskusage.c |  25
-rw-r--r-- | xlators/cluster/dht/src/dht-linkfile.c  |   2
-rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c  |  18
5 files changed, 455 insertions(+), 57 deletions(-)
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index dbbb7e59bc0..1195c3bb49d 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -34,7 +34,6 @@ dht_removexattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame); int dht_setxattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame); - int dht_aggregate_quota_xattr (dict_t *dst, char *key, data_t *value) { @@ -3512,7 +3511,6 @@ err: return 0; } - static int dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -5473,9 +5471,6 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int ret = -1; dht_local_t *local = NULL; - if (op_ret == -1) - goto out; - local = frame->local; if (!local) { op_ret = -1; @@ -5483,6 +5478,9 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } + if (op_ret == -1) + goto out; + prev = cookie; if (local->loc.parent) { @@ -5502,18 +5500,34 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_errno = EINVAL; goto out; } + + local->op_errno = op_errno; + if (local->linked == _gf_true) { local->stbuf = *stbuf; dht_linkfile_attr_heal (frame, this); } out: + DHT_STRIP_PHASE1_FLAGS (stbuf); - DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, - postparent, xdata); + + if (local && local->lock.locks) { + /* store op_errno for failure case*/ + local->op_errno = op_errno; + local->refresh_layout_unlock (frame, this, op_ret); + + if (op_ret == 0) { + DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, + inode, stbuf, preparent, postparent, + xdata); + } + } else { + DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, + stbuf, preparent, postparent, xdata); + } return 0; } - int dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -5525,8 +5539,10 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, dht_local_t *local = NULL; xlator_t *cached_subvol = NULL; - if (op_ret == -1) + if (op_ret == -1) { + local->op_errno = op_errno; goto err; + } local = frame->local; cached_subvol = local->cached_subvol; @@ -5538,25 +5554,327 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, return 0; err: - DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL, NULL); + if (local->lock.locks) + local->refresh_layout_unlock (frame, this, -1); + + return 0; +} + +int +dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, + dict_t *params) +{ + dht_local_t *local = NULL; + xlator_t *avail_subvol = NULL; + + local = frame->local; + + if (!dht_is_subvol_filled (this, subvol)) { + gf_msg_debug (this->name, 0, + "creating %s on %s", loc->path, + subvol->name); + + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + loc, flags, mode, umask, fd, params); + + } else { + avail_subvol = dht_free_disk_available_subvol (this, subvol, local); + + if (avail_subvol != subvol) { + local->params = dict_ref (params); + local->flags = flags; + local->mode = mode; + local->umask = umask; + local->cached_subvol = avail_subvol; + local->hashed_subvol = subvol; + + gf_msg_debug (this->name, 0, + "creating %s on %s (link at %s)", loc->path, + avail_subvol->name, subvol->name); + + dht_linkfile_create (frame, dht_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); + + goto out; + } + + 
gf_msg_debug (this->name, 0, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + loc, flags, mode, umask, fd, params); + } +out: return 0; } int +dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child, + int32_t *op_errno) +{ + inode_table_t *table = NULL; + int ret = -1; + + if (!parent || !child) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + if (child->parent) { + parent->inode = inode_ref (child->parent); + if (!parent->inode) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + gf_uuid_copy (parent->gfid, child->pargfid); + + ret = 0; + + goto out; + } else { + if (gf_uuid_is_null (child->pargfid)) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + table = this->itable; + + if (!table) { + if (op_errno) { + *op_errno = EINVAL; + goto out; + } + } + + parent->inode = inode_find (table, child->pargfid); + + if (!parent->inode) { + if (op_errno) { + *op_errno = EINVAL; + goto out; + } + } + + gf_uuid_copy (parent->gfid, child->pargfid); + + ret = 0; + } + +out: + return ret; +} + + +int32_t +dht_create_do (call_frame_t *frame) +{ + dht_local_t *local = NULL; + dht_layout_t *refreshed = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; + + local = frame->local; + + this = THIS; + + conf = this->private; + + GF_VALIDATE_OR_GOTO (this->name, conf, err); + + methods = conf->methods; + + GF_VALIDATE_OR_GOTO (this->name, conf->methods, err); + + /* We don't need parent_loc anymore */ + loc_wipe (&local->loc); + + loc_copy (&local->loc, &local->loc2); + + loc_wipe (&local->loc2); + + refreshed = local->selfheal.refreshed_layout; + + subvol = methods->layout_search (this, refreshed, local->loc.name); + + if (!subvol) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, "no subvolume in " + "layout for path=%s", local->loc.path); + local->op_errno = ENOENT; + goto err; + } + + dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc, + local->flags, local->mode, + local->umask, local->fd, local->params); + return 0; +err: + local->refresh_layout_unlock (frame, this, -1); + + return 0; +} + +int32_t +dht_create_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + DHT_STACK_DESTROY (frame); + return 0; +} + +int32_t +dht_create_finish (call_frame_t *frame, xlator_t *this, int op_ret) +{ + dht_local_t *local = NULL, *lock_local = NULL; + call_frame_t *lock_frame = NULL; + int lock_count = 0; + + local = frame->local; + lock_count = dht_lock_count (local->lock.locks, local->lock.lk_count); + if (lock_count == 0) + goto done; + + lock_frame = copy_frame (frame); + if (lock_frame == NULL) { + goto done; + } + + lock_local = dht_local_init (lock_frame, &local->loc, NULL, + lock_frame->root->op); + if (lock_local == NULL) { + goto done; + } + + lock_local->lock.locks = local->lock.locks; + lock_local->lock.lk_count = local->lock.lk_count; + + local->lock.locks = NULL; + local->lock.lk_count = 0; + + dht_unlock_inodelk (lock_frame, lock_local->lock.locks, + lock_local->lock.lk_count, + dht_create_unlock_cbk); + lock_frame = NULL; + +done: + if (lock_frame != NULL) { + DHT_STACK_DESTROY (lock_frame); + } + + if (op_ret == 0) + return 0; + + DHT_STACK_UNWIND (create, frame, op_ret, local->op_errno, NULL, NULL, + NULL, NULL, NULL, NULL); + return 0; +} + +int32_t +dht_create_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + 
int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + + if (!local) { + goto err; + } + + if (op_ret < 0) { + gf_msg ("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR, + "Create lock failed for file: %s", local->loc2.name); + + local->op_errno = op_errno; + + goto err; + } + + local->refresh_layout_unlock = dht_create_finish; + + local->refresh_layout_done = dht_create_do; + + dht_refresh_layout (frame); + + return 0; +err: + dht_create_finish (frame, this, -1); + return 0; +} + +int32_t +dht_create_lock (call_frame_t *frame, xlator_t *subvol) +{ + dht_local_t *local = NULL; + int count = 1, ret = -1; + dht_lock_t **lk_array = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err); + + local = frame->local; + + lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char); + + if (lk_array == NULL) + goto err; + + lk_array[0] = dht_lock_new (frame->this, subvol, &local->loc, F_RDLCK, + DHT_LAYOUT_HEAL_DOMAIN); + + if (lk_array[0] == NULL) + goto err; + + local->lock.locks = lk_array; + local->lock.lk_count = count; + + ret = dht_blocking_inodelk (frame, lk_array, count, + dht_create_lock_cbk); + + if (ret < 0) { + local->lock.locks = NULL; + local->lock.lk_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + dht_lock_array_free (lk_array, count); + GF_FREE (lk_array); + } + + return -1; +} + +int dht_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *params) { - int op_errno = -1; - xlator_t *subvol = NULL; - dht_local_t *local = NULL; - xlator_t *avail_subvol = NULL; + int op_errno = -1; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + int i = 0; + dht_conf_t *conf = NULL; + int ret = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); + conf = this->private; + dht_get_du_info (frame, this, loc); local = dht_local_init (frame, loc, fd, GF_FOP_CREATE); @@ -5579,48 +5897,90 @@ dht_create (call_frame_t *frame, xlator_t *this, subvol = dht_subvol_get_hashed (this, loc); if (!subvol) { - gf_msg_debug (this->name, 0, - "no subvolume in layout for path=%s", - loc->path); + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_HASHED_SUBVOL_GET_FAILED, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; goto err; } - if (!dht_is_subvol_filled (this, subvol)) { - gf_msg_trace (this->name, 0, - "creating %s on %s", loc->path, - subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, umask, fd, params); - goto done; - } - /* Choose the minimum filled volume, and create the - files there */ - avail_subvol = dht_free_disk_available_subvol (this, subvol, local); - if (avail_subvol != subvol) { - local->params = dict_ref (params); - local->flags = flags; - local->mode = mode; - local->umask = umask; - local->cached_subvol = avail_subvol; - local->hashed_subvol = subvol; - gf_msg_trace (this->name, 0, - "creating %s on %s (link at %s)", loc->path, - avail_subvol->name, subvol->name); - dht_linkfile_create (frame, dht_create_linkfile_create_cbk, - this, avail_subvol, subvol, loc); - goto done; + /* Post remove-brick, the client layout may not be in sync with + * disk layout because of lack of lookup. Hence,a create call + * may fall on the decommissioned brick. Hence, if the + * hashed_subvol is part of decommissioned bricks list, do a + * lookup on parent dir. 
If a fix-layout is already done by the + * remove-brick process, the parent directory layout will be in + * sync with that of the disk. If fix-layout is still ending + * on the parent directory, we can let the file get created on + * the decommissioned brick which will be eventually migrated to + * non-decommissioned brick based on the new layout. + */ + + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] && + conf->decommissioned_bricks[i] == subvol) { + + gf_msg_debug (this->name, 0, "hashed subvol:%s is " + "part of decommission brick list for " + "file: %s", subvol->name, loc->path); + + /* dht_refresh_layout needs directory info in + * local->loc. Hence, storing the parent_loc in + * local->loc and storing the create context in + * local->loc2. We will restore this information + * in dht_creation do */ + + ret = loc_copy (&local->loc2, &local->loc); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_NO_MEMORY, + "loc_copy failed %s", loc->path); + + goto err; + } + + local->params = dict_ref (params); + local->flags = flags; + local->mode = mode; + local->umask = umask; + + loc_wipe (&local->loc); + + ret = dht_build_parent_loc (this, &local->loc, loc, + &op_errno); + + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + DHT_MSG_NO_MEMORY, + "parent loc build failed"); + goto err; + } + + ret = dht_create_lock (frame, subvol); + + if (ret < 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_INODE_LK_ERROR, + "locking parent failed"); + goto err; + } + + goto done; + } + } } - gf_msg_trace (this->name, 0, - "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_create_cbk, - subvol, subvol->fops->create, - loc, flags, mode, umask, fd, params); + + + dht_create_wind_to_avail_subvol (frame, this, subvol, loc, flags, mode, + umask, fd, params); done: return 0; err: + op_errno = (op_errno == -1) ? 
errno : op_errno; DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, NULL); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 16e63f36644..a9bcdbd3141 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -39,6 +39,10 @@ typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie, typedef int (*dht_defrag_cbk_fn_t) (xlator_t *this, xlator_t *dst_node, call_frame_t *frame); +typedef int (*dht_refresh_layout_unlock) (call_frame_t *frame, xlator_t *this, + int op_ret); + +typedef int (*dht_refresh_layout_done_handle) (call_frame_t *frame); struct dht_layout { int spread_cnt; /* layout spread count per directory, @@ -207,6 +211,10 @@ struct dht_local { gf_boolean_t force_mkdir; dht_layout_t *layout, *refreshed_layout; } selfheal; + + dht_refresh_layout_unlock refresh_layout_unlock; + dht_refresh_layout_done_handle refresh_layout_done; + uint32_t uid; uint32_t gid; @@ -505,6 +513,7 @@ typedef struct dht_migrate_info { GF_REF_DECL; } dht_migrate_info_t; + #define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT) #define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0) @@ -1062,4 +1071,6 @@ dht_layout_sort (dht_layout_t *layout); int dht_layout_missing_dirs (dht_layout_t *layout); +int +dht_refresh_layout (call_frame_t *frame); #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 9ebf6bf8732..b85c78a540b 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -334,7 +334,8 @@ out: } static inline -int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, + dht_layout_t *layout) { int ret = -1; int i = 0; @@ -350,6 +351,17 @@ int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) goto out; } } + + /* discard decommissioned subvol */ + if (conf->decommission_subvols_cnt) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i] && + conf->decommissioned_bricks[i] == this) + ret = -1; + goto out; + } + } + ret = 0; out: return ret; @@ -371,8 +383,9 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, conf = this->private; for(i=0; i < conf->subvolume_cnt; i++) { - /* check if subvol has layout errors, before selecting it */ - ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + /* check if subvol has layout errors and also it is not a + * decommissioned brick, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], layout); if (ignore_subvol) continue; @@ -419,8 +432,10 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, conf = this->private; for (i = 0; i < conf->subvolume_cnt; i++) { - /* check if subvol has layout errors, before selecting it */ - ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + /* check if subvol has layout errors and also it is not a + * decommissioned brick, before selecting it*/ + + ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], layout); if (ignore_subvol) continue; diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index 894743621ba..deba2138672 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -115,7 +115,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, int need_unref = 
0; int ret = 0; dht_conf_t *conf = this->private; - char gfid[GF_UUID_BUF_SIZE] = {0}; + char gfid[GF_UUID_BUF_SIZE] = {0}; local = frame->local; local->linkfile.linkfile_cbk = linkfile_cbk; diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index ca0507bda44..42ab822a701 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -209,7 +209,7 @@ unlock: if (is_last_call (this_call_cnt)) { if (local->op_ret == 0) { - dht_refresh_layout_done (frame); + local->refresh_layout_done (frame); } else { goto err; } @@ -219,7 +219,8 @@ unlock: return 0; err: - dht_selfheal_dir_finish (frame, this, -1); + local->refresh_layout_unlock (frame, this, -1); + return 0; } @@ -285,7 +286,7 @@ dht_refresh_layout (call_frame_t *frame) return 0; out: - dht_selfheal_dir_finish (frame, this, -1); + local->refresh_layout_unlock (frame, this, -1); return 0; } @@ -294,10 +295,21 @@ int32_t dht_selfheal_layout_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { + dht_local_t *local = NULL; + + local = frame->local; + + if (!local) { + goto err; + } + if (op_ret < 0) { goto err; } + local->refresh_layout_unlock = dht_selfheal_dir_finish; + local->refresh_layout_done = dht_refresh_layout_done; + dht_refresh_layout (frame); return 0; |
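
A note on the dht-selfheal.c hunks at the end of the patch: dht_refresh_layout() previously hard-coded dht_refresh_layout_done() as its continuation and dht_selfheal_dir_finish() as its error/unlock path. The patch routes both through function pointers on dht_local_t (refresh_layout_done, refresh_layout_unlock) so the create path can reuse the same layout-refresh step with its own callbacks, dht_create_do() and dht_create_finish(). The snippet below is a minimal stand-alone illustration of that wiring, not GlusterFS code; the types and bodies are simplified stand-ins.

```c
#include <stdio.h>

typedef int (*refresh_done_fn)(void);
typedef int (*refresh_unlock_fn)(int op_ret);

struct local {
    refresh_done_fn   refresh_layout_done;
    refresh_unlock_fn refresh_layout_unlock;
};

/* shared step: after the patch, dht_refresh_layout() calls whichever
 * continuation/unlock the caller installed instead of hard-coding them */
static int refresh_layout(struct local *l, int layout_ok)
{
    if (layout_ok)
        return l->refresh_layout_done();
    return l->refresh_layout_unlock(-1);
}

/* create-path stand-ins for dht_create_do() and dht_create_finish() */
static int create_do(void)        { puts("re-hash name, wind create"); return 0; }
static int create_finish(int ret) { printf("unlock parent, op_ret=%d\n", ret); return 0; }

int main(void)
{
    struct local l = {
        .refresh_layout_done   = create_do,     /* selfheal would install
                                                   dht_refresh_layout_done */
        .refresh_layout_unlock = create_finish, /* selfheal would install
                                                   dht_selfheal_dir_finish */
    };
    return refresh_layout(&l, 1);
}
```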