diff options
Diffstat (limited to 'xlators/cluster/dht/src/dht-helper.c')
| -rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 3267 |
1 files changed, 1910 insertions, 1357 deletions
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 64d6f74acfd..3f2fe43d5f3 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -8,1744 +8,2297 @@ cases as published by the Free Software Foundation. */ -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - - -#include "glusterfs.h" -#include "xlator.h" #include "dht-common.h" -#include "dht-helper.h" +#include "dht-lock.h" +#include "glusterfs/compat-errno.h" // for ENODATA on BSD -static inline int -dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +static void +dht_free_fd_ctx(dht_fd_ctx_t *fd_ctx) { - uint64_t tmp_subvol = 0; + GF_FREE(fd_ctx); +} - tmp_subvol = (long)subvol; - return inode_ctx_set1 (inode, this, &tmp_subvol); +int32_t +dht_fd_ctx_destroy(xlator_t *this, fd_t *fd) +{ + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + ret = fd_ctx_del(fd, this, &value); + if (ret) { + goto out; + } + + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value; + if (fd_ctx) { + GF_REF_PUT(fd_ctx); + } +out: + return ret; } -int -dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +static int +__dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst) { - int ret = -1; - uint64_t tmp_subvol = 0; + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int ret = -1; - ret = inode_ctx_get1 (inode, this, &tmp_subvol); - if (tmp_subvol && subvol) - *subvol = (xlator_t *)tmp_subvol; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); - return ret; -} + fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_dht_mt_fd_ctx_t); + + if (!fd_ctx) { + goto out; + } + + fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst; + GF_REF_INIT(fd_ctx, dht_free_fd_ctx); + value = (uint64_t)(uintptr_t)fd_ctx; + + ret = __fd_ctx_set(fd, this, value); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FD_CTX_SET_FAILED, + "fd=0x%p", fd, NULL); + GF_REF_PUT(fd_ctx); + } +out: + return ret; +} int -dht_frame_return (call_frame_t *frame) +dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst) { - dht_local_t *local = NULL; - int this_call_cnt = -1; + dht_fd_ctx_t *fd_ctx = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); + + LOCK(&fd->lock); + { + ret = __fd_ctx_get(fd, this, &value); + if (ret && value) { + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value; + if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) { + /* This could happen due to racing + * check_progress tasks*/ + goto unlock; + } else { + /* This would be a big problem*/ + /* Overwrite and hope for the best*/ + fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst; + UNLOCK(&fd->lock); + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_VALUE, + NULL); - if (!frame) - return -1; + goto out; + } + } + ret = __dht_fd_ctx_set(this, fd, dst); + } +unlock: + UNLOCK(&fd->lock); +out: + return ret; +} - local = frame->local; +static dht_fd_ctx_t * +dht_fd_ctx_get(xlator_t *this, fd_t *fd) +{ + dht_fd_ctx_t *fd_ctx = NULL; + int ret = -1; + uint64_t tmp_val = 0; + + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, fd, out); - LOCK (&frame->lock); - { - this_call_cnt = --local->call_cnt; + LOCK(&fd->lock); + { + ret = __fd_ctx_get(fd, this, &tmp_val); + if ((ret < 0) || (tmp_val == 0)) { + goto unlock; } - UNLOCK (&frame->lock); - return this_call_cnt; + fd_ctx = (dht_fd_ctx_t *)(uintptr_t)tmp_val; + GF_REF_GET(fd_ctx); + } +unlock: + UNLOCK(&fd->lock); + +out: + return fd_ctx; } +gf_boolean_t +dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst) +{ + dht_fd_ctx_t *fd_ctx = NULL; + gf_boolean_t opened = _gf_false; + + fd_ctx = dht_fd_ctx_get(this, fd); + + if (fd_ctx) { + if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) { + opened = _gf_true; + } + GF_REF_PUT(fd_ctx); + } -static uint64_t -dht_bits_for (uint64_t num) + return opened; +} + +void +dht_free_mig_info(void *data) { - uint64_t bits = 0, ctrl = 1; + dht_migrate_info_t *miginfo = NULL; - while (ctrl < num) { - ctrl *= 2; - bits ++; - } + miginfo = data; + GF_FREE(miginfo); - return bits; + return; } -/* - * A slightly "updated" version of the algorithm described in the commit log - * is used here. - * - * The only enhancement is that: - * - * - The number of bits used by the backend filesystem for HUGE d_off which - * is described as 63, and - * - The number of bits used by the d_off presented by the transformation - * upwards which is described as 64, are both made "configurable." - */ +static int +dht_inode_ctx_set_mig_info(xlator_t *this, inode_t *inode, xlator_t *src_subvol, + xlator_t *dst_subvol) +{ + dht_migrate_info_t *miginfo = NULL; + uint64_t value = 0; + int ret = -1; + miginfo = GF_CALLOC(1, sizeof(*miginfo), gf_dht_mt_miginfo_t); + if (miginfo == NULL) + goto out; -#define BACKEND_D_OFF_BITS 63 -#define PRESENT_D_OFF_BITS 63 + miginfo->src_subvol = src_subvol; + miginfo->dst_subvol = dst_subvol; + GF_REF_INIT(miginfo, dht_free_mig_info); -#define ONE 1ULL -#define MASK (~0ULL) -#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) -#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) + value = (uint64_t)(uintptr_t)miginfo; -#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) -#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) + ret = inode_ctx_set1(inode, this, &value); + if (ret < 0) { + GF_REF_PUT(miginfo); + } + +out: + return ret; +} int -dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) +dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode, + xlator_t **src_subvol, xlator_t **dst_subvol) { - dht_conf_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t y = 0; - uint64_t hi_mask = 0; - uint64_t off_mask = 0; - int max_bits = 0; - - if (x == ((uint64_t) -1)) { - y = (uint64_t) -1; - goto out; - } + int ret = -1; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; - conf = this->private; - if (!conf) - goto out; + LOCK(&inode->lock); + { + ret = __inode_ctx_get1(inode, this, &tmp_miginfo); + if ((ret < 0) || (tmp_miginfo == 0)) { + UNLOCK(&inode->lock); + goto out; + } - max = conf->subvolume_cnt; - cnt = dht_subvol_cnt (this, subvol); + miginfo = (dht_migrate_info_t *)(uintptr_t)tmp_miginfo; + GF_REF_GET(miginfo); + } + UNLOCK(&inode->lock); - if (max == 1) { - y = x; - goto out; - } + if (src_subvol) + *src_subvol = miginfo->src_subvol; - max_bits = dht_bits_for (max); + if (dst_subvol) + *dst_subvol = miginfo->dst_subvol; - hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); - - if (x & hi_mask) { - /* HUGE d_off */ - off_mask = MASK << max_bits; - y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; - } else { - /* small d_off */ - y = ((x * max) + cnt); - } + GF_REF_PUT(miginfo); out: - if (y_p) - *y_p = y; + return ret; +} - return 0; +gf_boolean_t +dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol, + xlator_t *dst_subvol) +{ + /* Not set + */ + if (!src_subvol || !dst_subvol) + return _gf_true; + + /* Invalid scenarios: + * The src_subvol does not match the subvol on which the current op was sent + * so the cached subvol has changed between the last mig_info_set and now. + * src_subvol == dst_subvol. The file was migrated without any FOP detecting + * a P2 so the old dst is now the current subvol. + * + * There is still one scenario where the info could be outdated - if + * file has undergone multiple migrations and ends up on the same src_subvol + * on which the mig_info was first set. + */ + if ((current == dst_subvol) || (current != src_subvol)) + return _gf_true; + + return _gf_false; } +/* Used to check if fd fops have the fd opened on the cached subvol + * This is required when: + * 1. an fd is opened on FILE1 on subvol1 + * 2. the file is migrated to subvol2 + * 3. a lookup updates the cached subvol in the inode_ctx to subvol2 + * 4. a write comes on the fd + * The write is sent to subvol2 on an fd which has been opened only on fd1 + * Since the migration phase checks don't kick in, the fop fails with EBADF + * + */ + int -dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, - xlator_t **subvol) +dht_check_and_open_fd_on_subvol_complete(int ret, call_frame_t *frame, + void *data) { - char *new_name = NULL; - char *new_path = NULL; - xlator_list_t *trav = NULL; - char key[1024] = {0,}; - int ret = 0; /* not found */ - - /* Why do other tasks if first required 'char' itself is not there */ - if (!new_loc || !loc || !loc->name || !strchr (loc->name, '@')) - goto out; + glusterfs_fop_t fop = 0; + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + fd_t *fd = NULL; + int op_errno = -1; + + local = frame->local; + this = frame->this; + fop = local->fop; + subvol = local->cached_subvol; + fd = local->fd; + + if (ret) { + op_errno = local->op_errno; + goto handle_err; + } + + switch (fop) { + case GF_FOP_WRITE: + STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol, + subvol->fops->writev, fd, local->rebalance.vector, + local->rebalance.count, local->rebalance.offset, + local->rebalance.flags, local->rebalance.iobref, + local->xattr_req); + break; + + case GF_FOP_FLUSH: + STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd, + local->xattr_req); + break; + + case GF_FOP_FSETATTR: + STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol, + subvol->fops->fsetattr, fd, + &local->rebalance.stbuf, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_ZEROFILL: + STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol, + subvol->fops->zerofill, fd, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + + break; + + case GF_FOP_DISCARD: + STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol, + subvol->fops->discard, local->fd, + local->rebalance.offset, local->rebalance.size, + local->xattr_req); + break; + + case GF_FOP_FALLOCATE: + STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol, + subvol->fops->fallocate, fd, + local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, local->xattr_req); + break; + + case GF_FOP_FTRUNCATE: + STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol, + subvol->fops->ftruncate, fd, + local->rebalance.offset, local->xattr_req); + break; + + case GF_FOP_FSYNC: + STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, + subvol->fops->fsync, local->fd, + local->rebalance.flags, local->xattr_req); + break; + + case GF_FOP_READ: + STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, + local->fd, local->rebalance.size, + local->rebalance.offset, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_FSTAT: + STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol, + subvol->fops->fstat, fd, local->xattr_req); + break; + + case GF_FOP_FSETXATTR: + STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol, + subvol->fops->fsetxattr, local->fd, + local->rebalance.xattr, local->rebalance.flags, + local->xattr_req); + break; + + case GF_FOP_FREMOVEXATTR: + STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol, + subvol->fops->fremovexattr, local->fd, local->key, + local->xattr_req); + + break; + + case GF_FOP_FXATTROP: + STACK_WIND(frame, dht_common_xattrop_cbk, subvol, + subvol->fops->fxattrop, local->fd, + local->rebalance.flags, local->rebalance.xattr, + local->xattr_req); + break; + + case GF_FOP_FGETXATTR: + STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, + local->fd, local->key, NULL); + break; + + case GF_FOP_FINODELK: + STACK_WIND(frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk, + local->key, local->fd, local->rebalance.lock_cmd, + &local->rebalance.flock, local->xattr_req); + break; + default: + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p", + fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s", + subvol->name, NULL); + break; + } + + goto out; + + /* Could not open the fd on the dst. Unwind */ + +handle_err: + + switch (fop) { + case GF_FOP_WRITE: + DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FLUSH: + DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FSETATTR: + DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_ZEROFILL: + DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_DISCARD: + DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FALLOCATE: + DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FTRUNCATE: + DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_FSYNC: + DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); + break; + + case GF_FOP_READ: + DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, + NULL); + break; + + case GF_FOP_FSTAT: + DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FSETXATTR: + DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FREMOVEXATTR: + DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL); + break; + + case GF_FOP_FXATTROP: + DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FGETXATTR: + DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL); + break; + + case GF_FOP_FINODELK: + DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL); + break; + + default: + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p", + fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s", + subvol->name, NULL); + break; + } - trav = this->children; - while (trav) { - snprintf (key, 1024, "*@%s:%s", this->name, trav->xlator->name); - if (fnmatch (key, loc->name, FNM_NOESCAPE) == 0) { - new_name = GF_CALLOC(strlen (loc->name), - sizeof (char), - gf_common_mt_char); - if (!new_name) - goto out; - if (fnmatch (key, loc->path, FNM_NOESCAPE) == 0) { - new_path = GF_CALLOC(strlen (loc->path), - sizeof (char), - gf_common_mt_char); - if (!new_path) - goto out; - strncpy (new_path, loc->path, (strlen (loc->path) - - strlen (key) + 1)); - } - strncpy (new_name, loc->name, (strlen (loc->name) - - strlen (key) + 1)); - - if (new_loc) { - new_loc->path = ((new_path) ? new_path: - gf_strdup (loc->path)); - new_loc->name = new_name; - new_loc->inode = inode_ref (loc->inode); - new_loc->parent = inode_ref (loc->parent); - } - *subvol = trav->xlator; - ret = 1; /* success */ - goto out; - } - trav = trav->next; - } out: - if (!ret) { - /* !success */ - GF_FREE (new_path); - GF_FREE (new_name); - } - return ret; + + return 0; } +/* Check once again if the fd has been opened on the cached subvol. + * If not, open and update the fd_ctx. + */ + int -dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, - uint64_t *x_p) +dht_check_and_open_fd_on_subvol_task(void *data) { - dht_conf_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t x = 0; - xlator_t *subvol = 0; - int max_bits = 0; - uint64_t off_mask = 0; - uint64_t host_mask = 0; - - if (!this->private) - return -1; - - conf = this->private; - max = conf->subvolume_cnt; - - if (max == 1) { - x = y; - cnt = 0; - goto out; - } - - if (y & TOP_BIT) { - /* HUGE d_off */ - max_bits = dht_bits_for (max); - off_mask = (MASK << max_bits); - host_mask = ~(off_mask); - - x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; - - cnt = y & host_mask; - } else { - /* small d_off */ - cnt = y % max; - x = y / max; - } + loc_t loc = { + 0, + }; + int ret = -1; + call_frame_t *frame = NULL; + dht_local_t *local = NULL; + fd_t *fd = NULL; + xlator_t *this = NULL; + xlator_t *subvol = NULL; + + frame = data; + local = frame->local; + this = THIS; + fd = local->fd; + subvol = local->cached_subvol; + + local->fd_checked = _gf_true; + + if (fd_is_anonymous(fd) || dht_fd_open_on_dst(this, fd, subvol)) { + ret = 0; + goto out; + } -out: - subvol = conf->subvolumes[cnt]; + gf_msg_debug(this->name, 0, "Opening fd (%p, flags=0%o) on file %s @ %s", + fd, fd->flags, uuid_utoa(fd->inode->gfid), subvol->name); - if (subvol_p) - *subvol_p = subvol; + loc.inode = inode_ref(fd->inode); + gf_uuid_copy(loc.gfid, fd->inode->gfid); - if (x_p) - *x_p = x; + /* Open this on the dst subvol */ - return 0; -} + SYNCTASK_SETID(0, 0); -char * -dht_lock_asprintf (dht_lock_t *lock) -{ - char *lk_buf = NULL; - char gfid[GF_UUID_BUF_SIZE] = {0, }; + ret = syncop_open(subvol, &loc, (fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + fd, NULL, NULL); - if (lock == NULL) - goto out; + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_OPEN_FD_ON_DST_FAILED, + "fd=%p", fd, "flags=0%o", fd->flags, "gfid=%s", + uuid_utoa(fd->inode->gfid), "name=%s", subvol->name, NULL); + /* This can happen if the cached subvol was updated in the + * inode_ctx and the fd was opened on the new cached suvol + * after this fop was wound on the old cached subvol. + * As we do not close the fd on the old subvol (a leak) + * don't treat ENOENT as an error and allow the phase1/phase2 + * checks to handle it. + */ + + if ((-ret != ENOENT) && (-ret != ESTALE)) { + local->op_errno = -ret; + ret = -1; + } else { + ret = 0; + } - uuid_utoa_r (lock->loc.gfid, gfid); + local->op_errno = -ret; + ret = -1; - gf_asprintf (&lk_buf, "%s:%s", lock->xl->name, gfid); + } else { + dht_fd_ctx_set(this, fd, subvol); + } + SYNCTASK_SETID(frame->root->uid, frame->root->gid); out: - return lk_buf; + loc_wipe(&loc); + + return ret; } -void -dht_log_lk_array (char *name, gf_loglevel_t log_level, dht_lock_t **lk_array, - int count) +int +dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame) { - int i = 0; - char *lk_buf = NULL; + int ret = -1; + dht_local_t *local = NULL; - if ((lk_array == NULL) || (count == 0)) - goto out; + /* + if (dht_fd_open_on_dst (this, fd, subvol)) + goto out; + */ + local = frame->local; - for (i = 0; i < count; i++) { - lk_buf = dht_lock_asprintf (lk_array[i]); - gf_log (name, log_level, "%d. %s", i, lk_buf); - GF_FREE (lk_buf); - } + ret = synctask_new(this->ctx->env, dht_check_and_open_fd_on_subvol_task, + dht_check_and_open_fd_on_subvol_complete, frame, frame); -out: - return; + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SYNCTASK_CREATE_FAILED, + "to-check-and-open fd=%p", local->fd, NULL); + } + + return ret; } -void -dht_lock_stack_destroy (call_frame_t *lock_frame) +int +dht_frame_return(call_frame_t *frame) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + int this_call_cnt = -1; - local = lock_frame->local; + if (!frame) + return -1; - local->lock.locks = NULL; - local->lock.lk_count = 0; + local = frame->local; - DHT_STACK_DESTROY (lock_frame); - return; + LOCK(&frame->lock); + { + this_call_cnt = --local->call_cnt; + } + UNLOCK(&frame->lock); + + return this_call_cnt; } -void -dht_lock_free (dht_lock_t *lock) +/* + * Use this function to specify which subvol you want the file created + * on - this need not be the hashed subvol. + * Format: <filename>@<this->name>:<subvol-name> + * Eg: file-1@vol1-dht:vol1-client-0 + * where vol1 is a pure distribute volume + * will create file-1 on vol1-client-0 + */ + +int +dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc, + xlator_t **subvol) { - if (lock == NULL) + char *new_name = NULL; + char *new_path = NULL; + xlator_list_t *trav = NULL; + char key[1024] = { + 0, + }; + int ret = 0; /* not found */ + int keylen = 0; + int name_len = 0; + int path_len = 0; + + /* Why do other tasks if first required 'char' itself is not there */ + if (!new_loc || !loc || !loc->name || !strchr(loc->name, '@')) { + /* Skip the GF_FREE checks here */ + return ret; + } + + trav = this->children; + while (trav) { + keylen = snprintf(key, sizeof(key), "*@%s:%s", this->name, + trav->xlator->name); + /* Ignore '*' */ + keylen = keylen - 1; + if (fnmatch(key, loc->name, FNM_NOESCAPE) == 0) { + name_len = strlen(loc->name) - keylen; + new_name = GF_MALLOC(name_len + 1, gf_common_mt_char); + if (!new_name) goto out; - - loc_wipe (&lock->loc); - GF_FREE (lock->domain); - mem_put (lock); - + if (fnmatch(key, loc->path, FNM_NOESCAPE) == 0) { + path_len = strlen(loc->path) - keylen; + new_path = GF_MALLOC(path_len + 1, gf_common_mt_char); + if (!new_path) + goto out; + snprintf(new_path, path_len + 1, "%s", loc->path); + } + snprintf(new_name, name_len + 1, "%s", loc->name); + + if (new_loc) { + new_loc->path = ((new_path) ? new_path : gf_strdup(loc->path)); + new_loc->name = new_name; + new_loc->inode = inode_ref(loc->inode); + new_loc->parent = inode_ref(loc->parent); + } + *subvol = trav->xlator; + ret = 1; /* success */ + goto out; + } + trav = trav->next; + } out: - return; + if (!ret) { + /* !success */ + GF_FREE(new_path); + GF_FREE(new_name); + } + return ret; } -void -dht_lock_array_free (dht_lock_t **lk_array, int count) +static xlator_t * +dht_get_subvol_from_id(xlator_t *this, int client_id) { - int i = 0; - dht_lock_t *lock = NULL; + xlator_t *xl = NULL; + dht_conf_t *conf = NULL; + char *sid = NULL; + int32_t ret = -1; - if (lk_array == NULL) - goto out; + conf = this->private; - for (i = 0; i < count; i++) { - lock = lk_array[i]; - lk_array[i] = NULL; - dht_lock_free (lock); - } + ret = gf_asprintf(&sid, "%d", client_id); + if (ret == -1) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_ASPRINTF_FAILED, NULL); + goto out; + } + + if (dict_get_ptr(conf->leaf_to_subvol, sid, (void **)&xl)) + xl = NULL; + + GF_FREE(sid); out: - return; + return xl; } -dht_lock_t * -dht_lock_new (xlator_t *this, xlator_t *xl, loc_t *loc, short type, - const char *domain) +int +dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol_p) { - dht_conf_t *conf = NULL; - dht_lock_t *lock = NULL; + int client_id = 0; + xlator_t *subvol = 0; + dht_conf_t *conf = NULL; - conf = this->private; + if (!this->private) + return -1; - lock = mem_get0 (conf->lock_pool); - if (lock == NULL) - goto out; + conf = this->private; - lock->xl = xl; - lock->type = type; - lock->domain = gf_strdup (domain); - if (lock->domain == NULL) { - dht_lock_free (lock); - lock = NULL; - goto out; - } + client_id = gf_deitransform(this, y); - /* Fill only inode and gfid. - posix and protocol/server give preference to pargfid/basename over - gfid/inode for resolution if all the three parameters of loc_t are - present. I want to avoid the following hypothetical situation: - - 1. rebalance did a lookup on a dentry and got a gfid. - 2. rebalance acquires lock on loc_t which was filled with gfid and - path (pargfid/bname) from step 1. - 3. somebody deleted and recreated the same file - 4. rename on the same path acquires lock on loc_t which now points - to a different inode (and hence gets the lock). - 5. rebalance continues to migrate file (note that not all fops done - by rebalance during migration are inode/gfid based Eg., unlink) - 6. rename continues. - */ - lock->loc.inode = inode_ref (loc->inode); - loc_gfid (loc, lock->loc.gfid); + subvol = dht_get_subvol_from_id(this, client_id); -out: - return lock; + if (!subvol) + subvol = conf->subvolumes[0]; + + if (subvol_p) + *subvol_p = subvol; + + return 0; } -int -dht_local_lock_init (call_frame_t *frame, dht_lock_t **lk_array, - int lk_count, fop_inodelk_cbk_t inodelk_cbk) +void +dht_local_wipe(xlator_t *this, dht_local_t *local) { - int ret = -1; - dht_local_t *local = NULL; + int i = 0; - local = frame->local; + if (!local) + return; - if (local == NULL) { - local = dht_local_init (frame, NULL, NULL, 0); - } + loc_wipe(&local->loc); + loc_wipe(&local->loc2); + loc_wipe(&local->loc2_copy); - if (local == NULL) { - goto out; - } + if (local->xattr) + dict_unref(local->xattr); - local->lock.inodelk_cbk = inodelk_cbk; - local->lock.locks = lk_array; - local->lock.lk_count = lk_count; + if (local->inode) + inode_unref(local->inode); - ret = dht_lock_order_requests (local->lock.locks, - local->lock.lk_count); - if (ret < 0) - goto out; + if (local->layout) { + dht_layout_unref(this, local->layout); + local->layout = NULL; + } - ret = 0; -out: - return ret; -} + loc_wipe(&local->linkfile.loc); -void -dht_local_wipe (xlator_t *this, dht_local_t *local) -{ - if (!local) - return; + if (local->linkfile.xattr) + dict_unref(local->linkfile.xattr); - loc_wipe (&local->loc); - loc_wipe (&local->loc2); + if (local->linkfile.inode) + inode_unref(local->linkfile.inode); - if (local->xattr) - dict_unref (local->xattr); + if (local->fd) { + fd_unref(local->fd); + local->fd = NULL; + } - if (local->inode) - inode_unref (local->inode); + if (local->params) { + dict_unref(local->params); + local->params = NULL; + } - if (local->layout) { - dht_layout_unref (this, local->layout); - local->layout = NULL; - } + if (local->xattr_req) + dict_unref(local->xattr_req); + if (local->mds_xattr) + dict_unref(local->mds_xattr); + if (local->xdata) + dict_unref(local->xdata); - loc_wipe (&local->linkfile.loc); + if (local->selfheal.layout) { + dht_layout_unref(this, local->selfheal.layout); + local->selfheal.layout = NULL; + } - if (local->linkfile.xattr) - dict_unref (local->linkfile.xattr); + if (local->selfheal.refreshed_layout) { + dht_layout_unref(this, local->selfheal.refreshed_layout); + local->selfheal.refreshed_layout = NULL; + } - if (local->linkfile.inode) - inode_unref (local->linkfile.inode); + for (i = 0; i < 2; i++) { + dht_lock_array_free(local->lock[i].ns.parent_layout.locks, + local->lock[i].ns.parent_layout.lk_count); - if (local->fd) { - fd_unref (local->fd); - local->fd = NULL; - } + GF_FREE(local->lock[i].ns.parent_layout.locks); - if (local->params) { - dict_unref (local->params); - local->params = NULL; - } + dht_lock_array_free(local->lock[i].ns.directory_ns.locks, + local->lock[i].ns.directory_ns.lk_count); + GF_FREE(local->lock[i].ns.directory_ns.locks); + } - if (local->xattr_req) - dict_unref (local->xattr_req); + GF_FREE(local->key); - if (local->selfheal.layout) { - dht_layout_unref (this, local->selfheal.layout); - local->selfheal.layout = NULL; - } + if (local->rebalance.xdata) + dict_unref(local->rebalance.xdata); - dht_lock_array_free (local->lock.locks, local->lock.lk_count); - GF_FREE (local->lock.locks); + if (local->rebalance.xattr) + dict_unref(local->rebalance.xattr); - GF_FREE (local->newpath); + if (local->rebalance.dict) + dict_unref(local->rebalance.dict); - GF_FREE (local->key); + GF_FREE(local->rebalance.vector); - GF_FREE (local->rebalance.vector); + if (local->rebalance.iobref) + iobref_unref(local->rebalance.iobref); - if (local->rebalance.iobref) - iobref_unref (local->rebalance.iobref); + if (local->stub) { + call_stub_destroy(local->stub); + local->stub = NULL; + } - mem_put (local); -} + if (local->ret_cache) + GF_FREE(local->ret_cache); + mem_put(local); +} dht_local_t * -dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) +dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop) { - dht_local_t *local = NULL; - inode_t *inode = NULL; - int ret = 0; + dht_local_t *local = NULL; + inode_t *inode = NULL; + int ret = 0; - local = mem_get0 (THIS->local_pool); - if (!local) - goto out; + local = mem_get0(THIS->local_pool); + if (!local) + goto out; - if (loc) { - ret = loc_copy (&local->loc, loc); - if (ret) - goto out; + if (loc) { + ret = loc_copy(&local->loc, loc); + if (ret) + goto out; - inode = loc->inode; - } + inode = loc->inode; + } - if (fd) { - local->fd = fd_ref (fd); - if (!inode) - inode = fd->inode; - } + if (fd) { + local->fd = fd_ref(fd); + if (!inode) + inode = fd->inode; + } - local->op_ret = -1; - local->op_errno = EUCLEAN; - local->fop = fop; + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop = fop; - if (inode) { - local->layout = dht_layout_get (frame->this, inode); - local->cached_subvol = dht_subvol_get_cached (frame->this, - inode); - } + if (inode) { + local->layout = dht_layout_get(frame->this, inode); + local->cached_subvol = dht_subvol_get_cached(frame->this, inode); + } - frame->local = local; + frame->local = local; out: - if (ret) { - if (local) - mem_put (local); - local = NULL; - } - return local; + if (ret) { + if (local) + mem_put(local); + local = NULL; + } + return local; } xlator_t * -dht_first_up_subvol (xlator_t *this) +dht_first_up_subvol(xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_t *child = NULL; - int i = 0; - time_t time = 0; + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; + time_t time = 0; - conf = this->private; - if (!conf) - goto out; + conf = this->private; + if (!conf) + goto out; - LOCK (&conf->subvolume_lock); - { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvol_up_time[i]) { - if (!time) { - time = conf->subvol_up_time[i]; - child = conf->subvolumes[i]; - } else if (time > conf->subvol_up_time[i]) { - time = conf->subvol_up_time[i]; - child = conf->subvolumes[i]; - } - } + LOCK(&conf->subvolume_lock); + { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvol_up_time[i]) { + if (!time) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; + } else if (time > conf->subvol_up_time[i]) { + time = conf->subvol_up_time[i]; + child = conf->subvolumes[i]; } + } } - UNLOCK (&conf->subvolume_lock); + } + UNLOCK(&conf->subvolume_lock); out: - return child; + return child; } xlator_t * -dht_last_up_subvol (xlator_t *this) +dht_last_up_subvol(xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_t *child = NULL; - int i = 0; + dht_conf_t *conf = NULL; + xlator_t *child = NULL; + int i = 0; - conf = this->private; - if (!conf) - goto out; + conf = this->private; + if (!conf) + goto out; - LOCK (&conf->subvolume_lock); - { - for (i = conf->subvolume_cnt-1; i >= 0; i--) { - if (conf->subvolume_status[i]) { - child = conf->subvolumes[i]; - break; - } - } + LOCK(&conf->subvolume_lock); + { + for (i = conf->subvolume_cnt - 1; i >= 0; i--) { + if (conf->subvolume_status[i]) { + child = conf->subvolumes[i]; + break; + } } - UNLOCK (&conf->subvolume_lock); + } + UNLOCK(&conf->subvolume_lock); out: - return child; + return child; } xlator_t * -dht_subvol_get_hashed (xlator_t *this, loc_t *loc) +dht_subvol_get_hashed(xlator_t *this, loc_t *loc) { - dht_layout_t *layout = NULL; - xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_methods_t *methods = NULL; - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, loc, out); - if (__is_root_gfid (loc->gfid)) { - subvol = dht_first_up_subvol (this); - goto out; - } + conf = this->private; + GF_VALIDATE_OR_GOTO(this->name, conf, out); - GF_VALIDATE_OR_GOTO (this->name, loc->parent, out); - GF_VALIDATE_OR_GOTO (this->name, loc->name, out); + methods = &(conf->methods); - layout = dht_layout_get (this, loc->parent); + if (__is_root_gfid(loc->gfid)) { + subvol = dht_first_up_subvol(this); + goto out; + } - if (!layout) { - gf_log (this->name, GF_LOG_DEBUG, - "layout missing path=%s parent=%s", - loc->path, uuid_utoa (loc->parent->gfid)); - goto out; - } + GF_VALIDATE_OR_GOTO(this->name, loc->parent, out); + GF_VALIDATE_OR_GOTO(this->name, loc->name, out); - subvol = dht_layout_search (this, layout, loc->name); + layout = dht_layout_get(this, loc->parent); - if (!subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "could not find subvolume for path=%s", - loc->path); - goto out; - } + if (!layout) { + gf_msg_debug(this->name, 0, "Missing layout. path=%s, parent gfid =%s", + loc->path, uuid_utoa(loc->parent->gfid)); + goto out; + } + + subvol = methods->layout_search(this, layout, loc->name); + + if (!subvol) { + gf_msg_debug(this->name, 0, "No hashed subvolume for path=%s", + loc->path); + goto out; + } out: - if (layout) { - dht_layout_unref (this, layout); - } + if (layout) { + dht_layout_unref(this, layout); + } - return subvol; + return subvol; } - xlator_t * -dht_subvol_get_cached (xlator_t *this, inode_t *inode) +dht_subvol_get_cached(xlator_t *this, inode_t *inode) { - dht_layout_t *layout = NULL; - xlator_t *subvol = NULL; + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; - GF_VALIDATE_OR_GOTO (this->name, this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - layout = dht_layout_get (this, inode); + layout = dht_layout_get(this, inode); - if (!layout) { - goto out; - } + if (!layout) { + goto out; + } - subvol = layout->list[0].xlator; + subvol = layout->list[0].xlator; out: - if (layout) { - dht_layout_unref (this, layout); - } + if (layout) { + dht_layout_unref(this, layout); + } - return subvol; + return subvol; } - xlator_t * -dht_subvol_next (xlator_t *this, xlator_t *prev) +dht_subvol_next(xlator_t *this, xlator_t *prev) { - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *next = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; - conf = this->private; - if (!conf) - goto out; + conf = this->private; + if (!conf) + goto out; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == prev) { - if ((i + 1) < conf->subvolume_cnt) - next = conf->subvolumes[i + 1]; - break; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + if ((i + 1) < conf->subvolume_cnt) + next = conf->subvolumes[i + 1]; + break; } + } out: - return next; + return next; } /* This func wraps around, if prev is actually the last subvol. */ xlator_t * -dht_subvol_next_available (xlator_t *this, xlator_t *prev) +dht_subvol_next_available(xlator_t *this, xlator_t *prev) { - dht_conf_t *conf = NULL; - int i = 0; - xlator_t *next = NULL; - - conf = this->private; - if (!conf) - goto out; - - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == prev) { - /* if prev is last in conf->subvolumes, then wrap - * around. - */ - if ((i + 1) < conf->subvolume_cnt) { - next = conf->subvolumes[i + 1]; - } else { - next = conf->subvolumes[0]; - } - break; - } - } + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } out: - return next; + return next; } int -dht_subvol_cnt (xlator_t *this, xlator_t *subvol) +dht_subvol_cnt(xlator_t *this, xlator_t *subvol) { - int i = 0; - int ret = -1; - dht_conf_t *conf = NULL; + int i = 0; + int ret = -1; + dht_conf_t *conf = NULL; - conf = this->private; - if (!conf) - goto out; + conf = this->private; + if (!conf) + goto out; - for (i = 0; i < conf->subvolume_cnt; i++) { - if (subvol == conf->subvolumes[i]) { - ret = i; - break; - } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (subvol == conf->subvolumes[i]) { + ret = i; + break; } + } out: - return ret; + return ret; } +#define set_if_greater(a, b) \ + do { \ + if ((a) < (b)) \ + (a) = (b); \ + } while (0) -#define set_if_greater(a, b) do { \ - if ((a) < (b)) \ - (a) = (b); \ - } while (0) - - -#define set_if_greater_time(a, an, b, bn) do { \ - if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))){ \ - (a) = (b); \ - (an) = (bn); \ - } \ - } while (0) \ - +#define set_if_greater_time(a, an, b, bn) \ + do { \ + if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) { \ + (a) = (b); \ + (an) = (bn); \ + } \ + } while (0) int -dht_iatt_merge (xlator_t *this, struct iatt *to, - struct iatt *from, xlator_t *subvol) +dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from) { - if (!from || !to) - return 0; - - to->ia_dev = from->ia_dev; - - uuid_copy (to->ia_gfid, from->ia_gfid); - - to->ia_ino = from->ia_ino; - to->ia_prot = from->ia_prot; - to->ia_type = from->ia_type; - to->ia_nlink = from->ia_nlink; - to->ia_rdev = from->ia_rdev; - to->ia_size += from->ia_size; - to->ia_blksize = from->ia_blksize; - to->ia_blocks += from->ia_blocks; - - set_if_greater (to->ia_uid, from->ia_uid); - set_if_greater (to->ia_gid, from->ia_gid); - - set_if_greater_time(to->ia_atime, to->ia_atime_nsec, - from->ia_atime, from->ia_atime_nsec); - set_if_greater_time (to->ia_mtime, to->ia_mtime_nsec, - from->ia_mtime, from->ia_mtime_nsec); - set_if_greater_time (to->ia_ctime, to->ia_ctime_nsec, - from->ia_ctime, from->ia_ctime_nsec); - + if (!from || !to) return 0; + + to->ia_dev = from->ia_dev; + + gf_uuid_copy(to->ia_gfid, from->ia_gfid); + + to->ia_ino = from->ia_ino; + to->ia_prot = from->ia_prot; + to->ia_type = from->ia_type; + to->ia_nlink = from->ia_nlink; + to->ia_rdev = from->ia_rdev; + to->ia_size += from->ia_size; + to->ia_blksize = from->ia_blksize; + to->ia_blocks += from->ia_blocks; + + if (IA_ISDIR(from->ia_type)) { + to->ia_blocks = DHT_DIR_STAT_BLOCKS; + to->ia_size = DHT_DIR_STAT_SIZE; + } + set_if_greater(to->ia_uid, from->ia_uid); + set_if_greater(to->ia_gid, from->ia_gid); + + set_if_greater_time(to->ia_atime, to->ia_atime_nsec, from->ia_atime, + from->ia_atime_nsec); + set_if_greater_time(to->ia_mtime, to->ia_mtime_nsec, from->ia_mtime, + from->ia_mtime_nsec); + set_if_greater_time(to->ia_ctime, to->ia_ctime_nsec, from->ia_ctime, + from->ia_ctime_nsec); + + return 0; } int -dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name) { - if (!child) { - goto err; - } + if (!child) { + goto err; + } - if (strcmp (parent->path, "/") == 0) - gf_asprintf ((char **)&child->path, "/%s", name); - else - gf_asprintf ((char **)&child->path, "%s/%s", parent->path, name); + if (strcmp(parent->path, "/") == 0) + gf_asprintf((char **)&child->path, "/%s", name); + else + gf_asprintf((char **)&child->path, "%s/%s", parent->path, name); - if (!child->path) { - goto err; - } + if (!child->path) { + goto err; + } - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; + child->name = strrchr(child->path, '/'); + if (child->name) + child->name++; - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); + child->parent = inode_ref(parent->inode); + child->inode = inode_new(parent->inode->table); - if (!child->inode) { - goto err; - } + if (!child->inode) { + goto err; + } - return 0; + return 0; err: - loc_wipe (child); - return -1; + if (child) { + loc_wipe(child); + } + return -1; } - - int -dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) +dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf) { - xlator_list_t *subvols = NULL; - int cnt = 0; - - if (!conf) - return -1; - - for (subvols = this->children; subvols; subvols = subvols->next) - cnt++; - - conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *), - gf_dht_mt_xlator_t); - if (!conf->subvolumes) { - return -1; - } - conf->subvolume_cnt = cnt; + xlator_list_t *subvols = NULL; + int cnt = 0; - cnt = 0; - for (subvols = this->children; subvols; subvols = subvols->next) - conf->subvolumes[cnt++] = subvols->xlator; + if (!conf) + return -1; - conf->subvolume_status = GF_CALLOC (cnt, sizeof (char), - gf_dht_mt_char); - if (!conf->subvolume_status) { - return -1; - } + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; - conf->last_event = GF_CALLOC (cnt, sizeof (int), - gf_dht_mt_char); - if (!conf->last_event) { - return -1; - } + conf->local_subvols = GF_CALLOC(cnt, sizeof(xlator_t *), + gf_dht_mt_xlator_t); - conf->subvol_up_time = GF_CALLOC (cnt, sizeof (time_t), - gf_dht_mt_subvol_time); - if (!conf->subvol_up_time) { - return -1; - } + /* FIX FIX : do this dynamically*/ + conf->local_nodeuuids = GF_CALLOC(cnt, sizeof(subvol_nodeuuids_info_t), + gf_dht_nodeuuids_t); - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - return -1; - } + if (!conf->local_subvols || !conf->local_nodeuuids) { + return -1; + } - conf->decommissioned_bricks = GF_CALLOC (cnt, sizeof (xlator_t *), - gf_dht_mt_xlator_t); - if (!conf->decommissioned_bricks) { - return -1; - } + conf->local_subvols_cnt = 0; - return 0; + return 0; } +int +dht_init_subvolumes(xlator_t *this, dht_conf_t *conf) +{ + xlator_list_t *subvols = NULL; + int cnt = 0; + if (!conf) + return -1; + for (subvols = this->children; subvols; subvols = subvols->next) + cnt++; -static int -dht_migration_complete_check_done (int op_ret, call_frame_t *frame, void *data) -{ - dht_local_t *local = NULL; + conf->subvolumes = GF_CALLOC(cnt, sizeof(xlator_t *), gf_dht_mt_xlator_t); + if (!conf->subvolumes) { + return -1; + } + conf->subvolume_cnt = cnt; + /* Doesn't make sense to do any dht layer tasks + if the subvol count is 1. Set it as pass_through */ + if (cnt == 1) + this->pass_through = _gf_true; - local = frame->local; + conf->local_subvols_cnt = 0; - local->rebalance.target_op_fn (THIS, frame, op_ret); + dht_set_subvol_range(this); - return 0; -} + cnt = 0; + for (subvols = this->children; subvols; subvols = subvols->next) + conf->subvolumes[cnt++] = subvols->xlator; + conf->subvolume_status = GF_CALLOC(cnt, sizeof(char), gf_dht_mt_char); + if (!conf->subvolume_status) { + return -1; + } -int -dht_migration_complete_check_task (void *data) -{ - int ret = -1; - xlator_t *src_node = NULL; - xlator_t *dst_node = NULL; - dht_local_t *local = NULL; - dict_t *dict = NULL; - dht_layout_t *layout = NULL; - struct iatt stbuf = {0,}; - xlator_t *this = NULL; - call_frame_t *frame = NULL; - loc_t tmp_loc = {0,}; - char *path = NULL; - dht_conf_t *conf = NULL; - inode_t *inode = NULL; - fd_t *iter_fd = NULL; - uint64_t tmp_subvol = 0; - int open_failed = 0; - int32_t op_errno = EINVAL; - - this = THIS; - frame = data; - local = frame->local; - conf = this->private; - - src_node = local->cached_subvol; - - if (!local->loc.inode && !local->fd) { - local->op_errno = EINVAL; - goto out; - } + conf->last_event = GF_CALLOC(cnt, sizeof(int), gf_dht_mt_char); + if (!conf->last_event) { + return -1; + } - inode = (!local->fd) ? local->loc.inode : local->fd->inode; + conf->subvol_up_time = GF_CALLOC(cnt, sizeof(time_t), + gf_dht_mt_subvol_time); + if (!conf->subvol_up_time) { + return -1; + } - /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr - * as root:root. If a fd is already open, access check wont be done*/ + conf->du_stats = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_du_t), + gf_dht_mt_dht_du_t); + if (!conf->du_stats) { + return -1; + } - if (!local->loc.inode) { - ret = syncop_fgetxattr (src_node, local->fd, &dict, - conf->link_xattr_name); - } else { - SYNCTASK_SETID (0, 0); - ret = syncop_getxattr (src_node, &local->loc, &dict, - conf->link_xattr_name); - SYNCTASK_SETID (frame->root->uid, frame->root->gid); - } + conf->decommissioned_bricks = GF_CALLOC(cnt, sizeof(xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->decommissioned_bricks) { + return -1; + } - if (!ret) - dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); + return 0; +} - if (ret) { - if (!dht_inode_missing(errno) || (!local->loc.inode)) { - local->op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to get the 'linkto' xattr %s", - local->loc.path, strerror (errno)); - goto out; - } +/* + op_ret values : + 0 : Success. + -1 : Failure. + 1 : File is being migrated but not by this DHT layer. +*/ - /* Need to do lookup on hashed subvol, then get the file */ - ret = syncop_lookup (this, &local->loc, NULL, &stbuf, NULL, - NULL); - if (ret) { - local->op_errno = op_errno; - goto out; - } +static int +dht_migration_complete_check_done(int op_ret, call_frame_t *frame, void *data) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; - dst_node = dht_subvol_get_cached (this, local->loc.inode); - } + local = frame->local; - if (!dst_node) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to get the destination node", - local->loc.path); - ret = -1; - local->op_errno = EINVAL; - goto out; - } + if (op_ret != 0) + goto out; - /* lookup on dst */ - if (local->loc.inode) { - ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL, - NULL); + if (local->cached_subvol == NULL) { + local->op_errno = EINVAL; + goto out; + } - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to lookup the file on %s", - local->loc.path, dst_node->name); + subvol = local->cached_subvol; - local->op_errno = errno; - goto out; - } +out: + local->rebalance.target_op_fn(THIS, subvol, frame, op_ret); - if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: gfid different on the target file on %s", - local->loc.path, dst_node->name); - ret = -1; - local->op_errno = EIO; - goto out; - } - } + return 0; +} - /* update inode ctx (the layout) */ - dht_layout_unref (this, local->layout); +int +dht_migration_complete_check_task(void *data) +{ + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL, *linkto_target = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + struct iatt stbuf = { + 0, + }; + xlator_t *this = NULL; + call_frame_t *frame = NULL; + loc_t tmp_loc = { + 0, + }; + char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + fd_t *tmp = NULL; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; + gf_boolean_t skip_open = _gf_false; + int open_failed = 0; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) { + local->op_errno = EINVAL; + goto out; + } + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check won't be done*/ + + if (!local->loc.inode) { + ret = syncop_fgetxattr(src_node, local->fd, &dict, + conf->link_xattr_name, NULL, NULL); + } else { + SYNCTASK_SETID(0, 0); + ret = syncop_getxattr(src_node, &local->loc, &dict, + conf->link_xattr_name, NULL, NULL); + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + } + + /* + * Each DHT xlator layer has its own name for the linkto xattr. + * If the file mode bits indicate the the file is being migrated but + * this layer's linkto xattr is not set, it means that another + * DHT layer is migrating the file. In this case, return 1 so + * the mode bits can be passed on to the higher layer for appropriate + * action. + */ + if (-ret == ENODATA) { + /* This DHT translator is not migrating this file */ + + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + /* This can be a problem if the file was + * migrated by two different layers. Raise + * a warning here. + */ + gf_smsg( + this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL); + + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); + } + ret = 1; + goto out; + } + + if (!ret) + linkto_target = dht_linkfile_subvol(this, NULL, NULL, dict); + + if (local->loc.inode) { + loc_copy(&tmp_loc, &local->loc); + } else { + tmp_loc.inode = inode_ref(inode); + gf_uuid_copy(tmp_loc.gfid, inode->gfid); + } + + ret = syncop_lookup(this, &tmp_loc, &stbuf, 0, 0, 0); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED, + "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", this->name, NULL); + local->op_errno = -ret; + ret = -1; + goto out; + } + + dst_node = dht_subvol_get_cached(this, tmp_loc.inode); + if (linkto_target && dst_node != linkto_target) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_LINKFILE, + "linkto_target_name=%s", linkto_target->name, "dst_name=%s", + dst_node->name, NULL); + } + + if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "dst_name=%s", dst_node->name, NULL); + ret = -1; + local->op_errno = EIO; + goto out; + } + + /* update local. A layout is set in inode-ctx in lookup already */ + + dht_layout_unref(this, local->layout); + + local->layout = dht_layout_get(frame->this, inode); + local->cached_subvol = dst_node; + + ret = 0; + + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); + goto out; + } + + /* perform 'open()' on all the fd's present on the inode */ + if (tmp_loc.path == NULL) { + inode_path(inode, NULL, &path); + if (path) + tmp_loc.path = path; + } + + LOCK(&inode->lock); + + if (list_empty(&inode->fd_list)) + goto unlock; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* It's possible that we are the last user of iter_fd after each + * iteration. In this case the fd_unref() of iter_fd at the end of + * the loop will cause the destruction of the fd. So we need to + * iterate the list safely because iter_fd cannot be trusted. + */ + iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list); + while (&iter_fd->inode_list != (&inode->fd_list)) { + if (fd_is_anonymous(iter_fd) || + (dht_fd_open_on_dst(this, iter_fd, dst_node))) { + if (!tmp) { + iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd), + inode_list); + continue; + } + skip_open = _gf_true; + } + /* We need to release the inode->lock before calling + * syncop_open() to avoid possible deadlocks. However this + * can cause the iter_fd to be released by other threads. + * To avoid this, we take a reference before releasing the + * lock. + */ + fd_ref(iter_fd); - ret = dht_layout_preset (this, dst_node, inode); - if (ret != 0) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: could not set preset layout for subvol %s", - local->loc.path, dst_node->name); - ret = -1; - local->op_errno = EINVAL; - goto out; - } + UNLOCK(&inode->lock); - layout = dht_layout_for_subvol (this, dst_node); - if (!layout) { - gf_log (this->name, GF_LOG_INFO, - "%s: no pre-set layout for subvolume %s", - local->loc.path, dst_node ? dst_node->name : "<nil>"); - ret = -1; - local->op_errno = EINVAL; - goto out; + if (tmp) { + fd_unref(tmp); + tmp = NULL; } + if (skip_open) + goto next; - ret = dht_layout_set (this, inode, layout); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set the new layout", - local->loc.path); - local->op_errno = EINVAL; - goto out; + /* flags for open are stripped down to allow following the + * new location of the file, otherwise we can get EEXIST or + * truncate the file again as rebalance is moving the data */ + ret = syncop_open(dst_node, &tmp_loc, + (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + iter_fd, NULL, NULL); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_OPEN_FD_ON_DST_FAILED, "id=%p", iter_fd, + "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s", + dst_node->name, NULL); + + open_failed = 1; + local->op_errno = -ret; + ret = -1; + } else { + dht_fd_ctx_set(this, iter_fd, dst_node); } - local->cached_subvol = dst_node; - ret = 0; + next: + LOCK(&inode->lock); + skip_open = _gf_false; + tmp = iter_fd; + iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list); + } - /* once we detect the migration complete, the inode-ctx2 is no more - required.. delete the ctx and also, it means, open() already - done on all the fd of inode */ - ret = inode_ctx_reset1 (inode, this, &tmp_subvol); - if (tmp_subvol) - goto out; + SYNCTASK_SETID(frame->root->uid, frame->root->gid); - if (list_empty (&inode->fd_list)) - goto out; + if (open_failed) { + ret = -1; + goto unlock; + } + ret = 0; - /* perform open as root:root. There is window between linkfile - * creation(root:root) and setattr with the correct uid/gid - */ - SYNCTASK_SETID(0, 0); +unlock: + UNLOCK(&inode->lock); + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } - /* perform 'open()' on all the fd's present on the inode */ - tmp_loc.inode = inode; - inode_path (inode, NULL, &path); - if (path) - tmp_loc.path = path; - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - if (fd_is_anonymous (iter_fd)) - continue; - - ret = syncop_open (dst_node, &tmp_loc, - iter_fd->flags, iter_fd); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "failed to open " - "the fd (%p, flags=0%o) on file %s @ %s", - iter_fd, iter_fd->flags, path, dst_node->name); - open_failed = 1; - local->op_errno = -ret; - ret = -1; - } - } - GF_FREE (path); - - SYNCTASK_SETID (frame->root->uid, frame->root->gid); - - if (open_failed) { - ret = -1; - goto out; - } - ret = 0; out: + if (dict) { + dict_unref(dict); + } - return ret; + loc_wipe(&tmp_loc); + + return ret; } int -dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame) +dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame) { - int ret = -1; + int ret = -1; - ret = synctask_new (this->ctx->env, dht_migration_complete_check_task, - dht_migration_complete_check_done, - frame, frame); - return ret; + ret = synctask_new(this->ctx->env, dht_migration_complete_check_task, + dht_migration_complete_check_done, frame, frame); + return ret; } /* During 'in-progress' state, both nodes should have the file */ +/* + op_ret values : + 0 : Success + -1 : Failure. + 1 : File is being migrated but not by this DHT layer. +*/ static int -dht_inprogress_check_done (int op_ret, call_frame_t *sync_frame, void *data) +dht_inprogress_check_done(int op_ret, call_frame_t *frame, void *data) { - dht_local_t *local = NULL; + dht_local_t *local = NULL; + xlator_t *dst_subvol = NULL, *src_subvol = NULL; + inode_t *inode = NULL; - local = sync_frame->local; + local = frame->local; - local->rebalance.target_op_fn (THIS, sync_frame, op_ret); + if (op_ret != 0) + goto out; - return 0; -} + inode = local->loc.inode ? local->loc.inode : local->fd->inode; -static int -dht_rebalance_inprogress_task (void *data) -{ - int ret = -1; - xlator_t *src_node = NULL; - xlator_t *dst_node = NULL; - dht_local_t *local = NULL; - dict_t *dict = NULL; - call_frame_t *frame = NULL; - xlator_t *this = NULL; - char *path = NULL; - struct iatt stbuf = {0,}; - loc_t tmp_loc = {0,}; - dht_conf_t *conf = NULL; - inode_t *inode = NULL; - fd_t *iter_fd = NULL; - int open_failed = 0; - - this = THIS; - frame = data; - local = frame->local; - conf = this->private; - - src_node = local->cached_subvol; - - if (!local->loc.inode && !local->fd) - goto out; - - inode = (!local->fd) ? local->loc.inode : local->fd->inode; - - /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr - * as root:root. If a fd is already open, access check wont be done*/ - if (local->loc.inode) { - SYNCTASK_SETID (0, 0); - ret = syncop_getxattr (src_node, &local->loc, &dict, - conf->link_xattr_name); - SYNCTASK_SETID (frame->root->uid, frame->root->gid); - } else { - ret = syncop_fgetxattr (src_node, local->fd, &dict, - conf->link_xattr_name); + dht_inode_ctx_get_mig_info(THIS, inode, &src_subvol, &dst_subvol); + if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, dst_subvol)) { + dst_subvol = dht_subvol_get_cached(THIS, inode); + if (!dst_subvol) { + local->op_errno = EINVAL; + goto out; } + } - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to get the 'linkto' xattr %s", - local->loc.path, strerror (errno)); - goto out; - } - - dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); - if (!dst_node) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to get the 'linkto' xattr from dict", - local->loc.path); - ret = -1; - goto out; - } - - local->rebalance.target_node = dst_node; - - if (local->loc.inode) { - /* lookup on dst */ - ret = syncop_lookup (dst_node, &local->loc, NULL, - &stbuf, NULL, NULL); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to lookup the file on %s", - local->loc.path, dst_node->name); - goto out; - } - - if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: gfid different on the target file on %s", - local->loc.path, dst_node->name); - ret = -1; - goto out; - } - } - - ret = 0; +out: + local->rebalance.target_op_fn(THIS, dst_subvol, frame, op_ret); - if (list_empty (&inode->fd_list)) - goto done; + return 0; +} - /* perform open as root:root. There is window between linkfile - * creation(root:root) and setattr with the correct uid/gid +static int +dht_rebalance_inprogress_task(void *data) +{ + int ret = -1; + xlator_t *src_node = NULL; + xlator_t *dst_node = NULL; + dht_local_t *local = NULL; + dict_t *dict = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + char *path = NULL; + struct iatt stbuf = { + 0, + }; + loc_t tmp_loc = { + 0, + }; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + fd_t *tmp = NULL; + int open_failed = 0; + uint64_t tmp_miginfo = 0; + dht_migrate_info_t *miginfo = NULL; + gf_boolean_t skip_open = _gf_false; + + this = THIS; + frame = data; + local = frame->local; + conf = this->private; + + src_node = local->cached_subvol; + + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check won't be done*/ + if (local->loc.inode) { + SYNCTASK_SETID(0, 0); + ret = syncop_getxattr(src_node, &local->loc, &dict, + conf->link_xattr_name, NULL, NULL); + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + } else { + ret = syncop_fgetxattr(src_node, local->fd, &dict, + conf->link_xattr_name, NULL, NULL); + } + + /* + * Each DHT xlator layer has its own name for the linkto xattr. + * If the file mode bits indicate the the file is being migrated but + * this layer's linkto xattr is not present, it means that another + * DHT layer is migrating the file. In this case, return 1 so + * the mode bits can be passed on to the higher layer for appropriate + * action. + */ + + if (-ret == ENODATA) { + /* This DHT layer is not migrating this file */ + ret = inode_ctx_reset1(inode, this, &tmp_miginfo); + if (tmp_miginfo) { + /* This can be a problem if the file was + * migrated by two different layers. Raise + * a warning here. + */ + gf_smsg( + this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL); + miginfo = (void *)(uintptr_t)tmp_miginfo; + GF_REF_PUT(miginfo); + } + ret = 1; + goto out; + } + + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_GET_XATTR_FAILED, + "path=%s", local->loc.path, NULL); + ret = -1; + goto out; + } + + dst_node = dht_linkfile_subvol(this, NULL, NULL, dict); + if (!dst_node) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GET_XATTR_FAILED, + "path=%s", local->loc.path, NULL); + ret = -1; + goto out; + } + + local->rebalance.target_node = dst_node; + + if (local->loc.inode) { + loc_copy(&tmp_loc, &local->loc); + } else { + tmp_loc.inode = inode_ref(inode); + gf_uuid_copy(tmp_loc.gfid, inode->gfid); + } + + /* lookup on dst */ + ret = syncop_lookup(dst_node, &tmp_loc, &stbuf, NULL, NULL, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED, + "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", dst_node->name, NULL); + ret = -1; + goto out; + } + + if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s", + tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), + "name=%s", dst_node->name, NULL); + ret = -1; + goto out; + } + ret = 0; + + if (tmp_loc.path == NULL) { + inode_path(inode, NULL, &path); + if (path) + tmp_loc.path = path; + } + + LOCK(&inode->lock); + + if (list_empty(&inode->fd_list)) + goto unlock; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* It's possible that we are the last user of iter_fd after each + * iteration. In this case the fd_unref() of iter_fd at the end of + * the loop will cause the destruction of the fd. So we need to + * iterate the list safely because iter_fd cannot be trusted. + */ + iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list); + while (&iter_fd->inode_list != (&inode->fd_list)) { + /* We need to release the inode->lock before calling + * syncop_open() to avoid possible deadlocks. However this + * can cause the iter_fd to be released by other threads. + * To avoid this, we take a reference before releasing the + * lock. */ - SYNCTASK_SETID (0, 0); - tmp_loc.inode = inode; - inode_path (inode, NULL, &path); - if (path) - tmp_loc.path = path; - - list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { - if (fd_is_anonymous (iter_fd)) - continue; - - ret = syncop_open (dst_node, &tmp_loc, - iter_fd->flags, iter_fd); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, "failed to send open " - "the fd (%p, flags=0%o) on file %s @ %s", - iter_fd, iter_fd->flags, path, dst_node->name); - open_failed = 1; - } + if (fd_is_anonymous(iter_fd) || + (dht_fd_open_on_dst(this, iter_fd, dst_node))) { + if (!tmp) { + iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd), + inode_list); + continue; + } + skip_open = _gf_true; } - GF_FREE (path); - SYNCTASK_SETID (frame->root->uid, frame->root->gid); + /* Yes, this is ugly but there isn't a cleaner way to do this + * the fd_ref is an atomic increment so not too bad. We want to + * reduce the number of inode locks and unlocks. + */ - if (open_failed) { - ret = -1; - goto out; - } + fd_ref(iter_fd); + UNLOCK(&inode->lock); -done: - ret = dht_inode_ctx_set1 (this, inode, dst_node); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set inode-ctx target file at %s", - local->loc.path, dst_node->name); - goto out; + if (tmp) { + fd_unref(tmp); + tmp = NULL; } + if (skip_open) + goto next; - ret = 0; + /* flags for open are stripped down to allow following the + * new location of the file, otherwise we can get EEXIST or + * truncate the file again as rebalance is moving the data */ + ret = syncop_open(dst_node, &tmp_loc, + (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)), + iter_fd, NULL, NULL); + if (ret < 0) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + DHT_MSG_OPEN_FD_ON_DST_FAILED, "fd=%p", iter_fd, + "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s", + dst_node->name, NULL); + ret = -1; + open_failed = 1; + } else { + /* Potential fd leak if this fails here as it will be + reopened at the next Phase1/2 check */ + dht_fd_ctx_set(this, iter_fd, dst_node); + } + + next: + LOCK(&inode->lock); + skip_open = _gf_false; + tmp = iter_fd; + iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list); + } + + SYNCTASK_SETID(frame->root->uid, frame->root->gid); + +unlock: + UNLOCK(&inode->lock); + + if (tmp) { + fd_unref(tmp); + tmp = NULL; + } + if (open_failed) { + ret = -1; + goto out; + } + + ret = dht_inode_ctx_set_mig_info(this, inode, src_node, dst_node); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "path=%s", local->loc.path, "name=%s", dst_node->name, NULL); + goto out; + } + + ret = 0; out: - return ret; -} - -int -dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame) -{ + if (dict) { + dict_unref(dict); + } - int ret = -1; - - ret = synctask_new (this->ctx->env, dht_rebalance_inprogress_task, - dht_inprogress_check_done, - frame, frame); - return ret; + loc_wipe(&tmp_loc); + return ret; } int -dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, - dht_layout_t *layout_int) +dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame) { - dht_inode_ctx_t *ctx = NULL; - int ret = -1; + int ret = -1; - ret = dht_inode_ctx_get (inode, this, &ctx); - if (!ret && ctx) { - ctx->layout = layout_int; - } else { - ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); - if (!ctx) - return ret; - ctx->layout = layout_int; - } - - ret = dht_inode_ctx_set (inode, this, ctx); - - return ret; + ret = synctask_new(this->ctx->env, dht_rebalance_inprogress_task, + dht_inprogress_check_done, frame, frame); + return ret; } int -dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, - int32_t post) +dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) { - dht_inode_ctx_t *ctx = NULL; - dht_stat_time_t *time = 0; - int ret = -1; - - GF_VALIDATE_OR_GOTO (this->name, stat, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); - - ret = dht_inode_ctx_get (inode, this, &ctx); - - if (ret) { - ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); - if (!ctx) - return -1; - } - - time = &ctx->time; - - DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, - stat->ia_mtime, stat->ia_mtime_nsec, inode, post); - DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, - stat->ia_ctime, stat->ia_ctime_nsec, inode, post); - DHT_UPDATE_TIME(time->atime, time->atime_nsec, - stat->ia_atime, stat->ia_atime_nsec, inode, post); - - ret = dht_inode_ctx_set (inode, this, ctx); -out: - return 0; + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get(inode, this, &ctx); + if (!ret && ctx) { + ctx->layout = layout_int; + } else { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return ret; + ctx->layout = layout_int; + } + + ret = dht_inode_ctx_set(inode, this, ctx); + + return ret; } -int -dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +void +dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat) { - int ret = -1; - uint64_t ctx_int = 0; + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); + ret = dht_inode_ctx_get(inode, this, &ctx); - ret = inode_ctx_get (inode, this, &ctx_int); + if (ret) + return; - if (ret) - return ret; + time = &ctx->time; - if (ctx) - *ctx = (dht_inode_ctx_t *) ctx_int; -out: - return ret; -} + time->mtime = stat->ia_mtime; + time->mtime_nsec = stat->ia_mtime_nsec; -int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) -{ - int ret = -1; - uint64_t ctx_int = 0; + time->ctime = stat->ia_ctime; + time->ctime_nsec = stat->ia_ctime_nsec; - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO (this->name, inode, out); - GF_VALIDATE_OR_GOTO (this->name, ctx, out); + time->atime = stat->ia_atime; + time->atime_nsec = stat->ia_atime_nsec; - ctx_int = (long)ctx; - ret = inode_ctx_set (inode, this, &ctx_int); -out: - return ret; + return; } int -dht_subvol_status (dht_conf_t *conf, xlator_t *subvol) +dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t post) { - int i; - - for (i=0 ; i < conf->subvolume_cnt; i++) { - if (conf->subvolumes[i] == subvol) { - return conf->subvolume_status[i]; - } - } - return 0; + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO(this->name, stat, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + + ret = dht_inode_ctx_get(inode, this, &ctx); + + if (ret) { + ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return -1; + } + + time = &ctx->time; + + LOCK(&inode->lock); + { + DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, stat->ia_mtime, + stat->ia_mtime_nsec, post); + DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, stat->ia_ctime, + stat->ia_ctime_nsec, post); + DHT_UPDATE_TIME(time->atime, time->atime_nsec, stat->ia_atime, + stat->ia_atime_nsec, post); + } + UNLOCK(&inode->lock); + + ret = dht_inode_ctx_set(inode, this, ctx); +out: + return 0; } -void -dht_inodelk_done (call_frame_t *lock_frame) +int +dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) { - fop_inodelk_cbk_t inodelk_cbk = NULL; - call_frame_t *main_frame = NULL; - dht_local_t *local = NULL; + int ret = -1; + uint64_t ctx_int = 0; - local = lock_frame->local; - main_frame = local->main_frame; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - local->lock.locks = NULL; - local->lock.lk_count = 0; + ret = inode_ctx_get(inode, this, &ctx_int); - inodelk_cbk = local->lock.inodelk_cbk; - local->lock.inodelk_cbk = NULL; - - inodelk_cbk (main_frame, NULL, main_frame->this, local->lock.op_ret, - local->lock.op_errno, NULL); + if (ret) + return ret; - dht_lock_stack_destroy (lock_frame); - return; + if (ctx) + *ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int; +out: + return ret; } int -dht_inodelk_cleanup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) { - dht_inodelk_done (frame); - return 0; -} + int ret = -1; + uint64_t ctx_int = 0; -inline int32_t -dht_lock_count (dht_lock_t **lk_array, int lk_count) -{ - int i = 0, locked = 0; + GF_VALIDATE_OR_GOTO("dht", this, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, ctx, out); - if ((lk_array == NULL) || (lk_count == 0)) - goto out; - - for (i = 0; i < lk_count; i++) { - if (lk_array[i]->locked) - locked++; - } + ctx_int = (long)ctx; + ret = inode_ctx_set(inode, this, &ctx_int); out: - return locked; + return ret; } -void -dht_inodelk_cleanup (call_frame_t *lock_frame) +int +dht_subvol_status(dht_conf_t *conf, xlator_t *subvol) { - dht_lock_t **lk_array = NULL; - int lk_count = 0, lk_acquired = 0; - dht_local_t *local = NULL; - - local = lock_frame->local; - - lk_array = local->lock.locks; - lk_count = local->lock.lk_count; + int i; - lk_acquired = dht_lock_count (lk_array, lk_count); - if (lk_acquired != 0) { - dht_unlock_inodelk (lock_frame, lk_array, lk_count, - dht_inodelk_cleanup_cbk); - } else { - dht_inodelk_done (lock_frame); + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == subvol) { + return conf->subvolume_status[i]; } - - return; + } + return 0; } -int32_t -dht_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +inode_t * +dht_heal_path(xlator_t *this, char *path, inode_table_t *itable) { - dht_local_t *local = NULL; - int lk_index = 0, call_cnt = 0; - char gfid[GF_UUID_BUF_SIZE] = {0}; - - lk_index = (long) cookie; - - local = frame->local; - if (op_ret < 0) { - uuid_utoa_r (local->lock.locks[lk_index]->loc.gfid, - gfid); - - gf_log (this->name, GF_LOG_WARNING, - "unlocking failed on %s:%s (%s)", - local->lock.locks[lk_index]->xl->name, - gfid, strerror (op_errno)); + int ret = -1; + struct iatt iatt = { + 0, + }; + inode_t *linked_inode = NULL; + loc_t loc = { + 0, + }; + char *bname = NULL; + char *save_ptr = NULL; + static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + char *tmp_path = NULL; + + tmp_path = gf_strdup(path); + if (!tmp_path) { + goto out; + } + + gf_uuid_copy(loc.pargfid, gfid); + loc.parent = inode_ref(itable->root); + + bname = strtok_r(tmp_path, "/", &save_ptr); + + /* sending a lookup on parent directory, + * Eg: if path is like /a/b/c/d/e/f/g/ + * then we will send a lookup on a first and then b,c,d,etc + */ + + while (bname) { + linked_inode = NULL; + loc.inode = inode_grep(itable, loc.parent, bname); + if (loc.inode == NULL) { + loc.inode = inode_new(itable); + if (loc.inode == NULL) { + ret = -ENOMEM; + goto out; + } } else { - local->lock.locks[lk_index]->locked = 0; - } - - call_cnt = dht_frame_return (frame); - if (is_last_call (call_cnt)) { - dht_inodelk_done (frame); - } - - return 0; -} - -call_frame_t * -dht_lock_frame (call_frame_t *parent_frame) -{ - call_frame_t *lock_frame = NULL; - - lock_frame = copy_frame (parent_frame); - if (lock_frame == NULL) + /* + * Inode is already populated in the inode table. + * Which means we already looked up the inode and + * linked with a dentry. So that we will skip + * lookup on this entry, and proceed to next. + */ + linked_inode = loc.inode; + bname = strtok_r(NULL, "/", &save_ptr); + if (!bname) { goto out; - - set_lk_owner_from_ptr (&lock_frame->root->lk_owner, parent_frame->root); - -out: - return lock_frame; -} - -int32_t -dht_unlock_inodelk (call_frame_t *frame, dht_lock_t **lk_array, int lk_count, - fop_inodelk_cbk_t inodelk_cbk) -{ - dht_local_t *local = NULL; - struct gf_flock flock = {0,}; - int ret = -1 , i = 0; - call_frame_t *lock_frame = NULL; - int call_cnt = 0; - - GF_VALIDATE_OR_GOTO ("dht-locks", frame, done); - GF_VALIDATE_OR_GOTO (frame->this->name, lk_array, done); - GF_VALIDATE_OR_GOTO (frame->this->name, inodelk_cbk, done); - - call_cnt = dht_lock_count (lk_array, lk_count); - if (call_cnt == 0) { - ret = 0; - goto done; - } - - lock_frame = dht_lock_frame (frame); - if (lock_frame == NULL) { - gf_log (frame->this->name, GF_LOG_WARNING, - "cannot allocate a frame, not unlocking following " - "locks:"); - - dht_log_lk_array (frame->this->name, GF_LOG_WARNING, lk_array, - lk_count); - goto done; + } + inode_unref(loc.parent); + loc.parent = loc.inode; + gf_uuid_copy(loc.pargfid, loc.inode->gfid); + loc.inode = NULL; + continue; } - ret = dht_local_lock_init (lock_frame, lk_array, lk_count, inodelk_cbk); - if (ret < 0) { - gf_log (frame->this->name, GF_LOG_WARNING, - "storing locks in local failed, not unlocking " - "following locks:"); - - dht_log_lk_array (frame->this->name, GF_LOG_WARNING, lk_array, - lk_count); + loc.name = bname; + ret = loc_path(&loc, bname); - goto done; - } - - local = lock_frame->local; - local->main_frame = frame; - local->call_cnt = call_cnt; - - flock.l_type = F_UNLCK; - - for (i = 0; i < local->lock.lk_count; i++) { - if (!local->lock.locks[i]->locked) - continue; - - STACK_WIND_COOKIE (lock_frame, dht_unlock_inodelk_cbk, - (void *)(long)i, - local->lock.locks[i]->xl, - local->lock.locks[i]->xl->fops->inodelk, - local->lock.locks[i]->domain, - &local->lock.locks[i]->loc, F_SETLK, - &flock, NULL); + ret = syncop_lookup(this, &loc, &iatt, NULL, NULL, NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED, + "path=%s", path, "subvolume=%s", this->name, "bname=%s", + bname, NULL); + goto out; } - return 0; + linked_inode = inode_link(loc.inode, loc.parent, bname, &iatt); + if (!linked_inode) + goto out; -done: - if (lock_frame) - dht_lock_stack_destroy (lock_frame); - - /* no locks acquired, invoke inodelk_cbk */ - if (ret == 0) - inodelk_cbk (frame, NULL, frame->this, 0, 0, NULL); - - return ret; -} - -int32_t -dht_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - dht_local_t *local = NULL; - int lk_index = 0, call_cnt = 0; - char gfid[GF_UUID_BUF_SIZE] = {0}; - - local = frame->local; - lk_index = (long) cookie; - - if (op_ret == -1) { - local->lock.op_ret = -1; - local->lock.op_errno = op_errno; - - if (local && local->lock.locks[lk_index]) { - uuid_utoa_r (local->lock.locks[lk_index]->loc.inode->gfid, - gfid); - - gf_log (this->name, GF_LOG_DEBUG, - "inodelk failed on gfid: %s " - "subvolume: %s (%s)", gfid, - local->lock.locks[lk_index]->xl->name, - strerror (op_errno)); - } - - goto out; - } - - local->lock.locks[lk_index]->locked = _gf_true; + loc_wipe(&loc); + gf_uuid_copy(loc.pargfid, linked_inode->gfid); + loc.inode = NULL; + bname = strtok_r(NULL, "/", &save_ptr); + if (bname) + loc.parent = linked_inode; + } out: - call_cnt = dht_frame_return (frame); - if (is_last_call (call_cnt)) { - if (local->lock.op_ret < 0) { - dht_inodelk_cleanup (frame); - return 0; - } - - dht_inodelk_done (frame); - } + inode_ref(linked_inode); + loc_wipe(&loc); + GF_FREE(tmp_path); - return 0; + return linked_inode; } int -dht_nonblocking_inodelk (call_frame_t *frame, dht_lock_t **lk_array, - int lk_count, fop_inodelk_cbk_t inodelk_cbk) +dht_heal_full_path(void *data) { - struct gf_flock flock = {0,}; - int i = 0, ret = 0; - dht_local_t *local = NULL; - call_frame_t *lock_frame = NULL; - - GF_VALIDATE_OR_GOTO ("dht-locks", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, lk_array, out); - GF_VALIDATE_OR_GOTO (frame->this->name, inodelk_cbk, out); - - lock_frame = dht_lock_frame (frame); - if (lock_frame == NULL) - goto out; - - ret = dht_local_lock_init (lock_frame, lk_array, lk_count, inodelk_cbk); - if (ret < 0) { - goto out; - } - - local = lock_frame->local; - local->main_frame = frame; - - local->call_cnt = lk_count; - - for (i = 0; i < local->lock.lk_count; i++) { - flock.l_type = local->lock.locks[i]->type; - - STACK_WIND_COOKIE (lock_frame, dht_nonblocking_inodelk_cbk, - (void *) (long) i, - local->lock.locks[i]->xl, - local->lock.locks[i]->xl->fops->inodelk, - local->lock.locks[i]->domain, - &local->lock.locks[i]->loc, F_SETLK, - &flock, NULL); + call_frame_t *heal_frame = data; + dht_local_t *local = NULL; + loc_t loc = { + 0, + }; + dict_t *dict = NULL; + char *path = NULL; + int ret = -1; + xlator_t *source = NULL; + xlator_t *this = NULL; + inode_table_t *itable = NULL; + inode_t *inode = NULL; + inode_t *tmp_inode = NULL; + + GF_VALIDATE_OR_GOTO("DHT", heal_frame, out); + + local = heal_frame->local; + this = heal_frame->this; + source = heal_frame->cookie; + heal_frame->cookie = NULL; + gf_uuid_copy(loc.gfid, local->gfid); + + if (local->loc.inode) + loc.inode = inode_ref(local->loc.inode); + else + goto out; + + itable = loc.inode->table; + ret = syncop_getxattr(source, &loc, &dict, GET_ANCESTRY_PATH_KEY, NULL, + NULL); + if (ret) { + gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_HEAL_ABORT, + "subvol=%s", source->name, NULL); + goto out; + } + + ret = dict_get_str(dict, GET_ANCESTRY_PATH_KEY, &path); + if (path) { + inode = dht_heal_path(this, path, itable); + if (inode && inode != local->inode) { + /* + * if inode returned by heal function is different + * from what we passed, which means a racing thread + * already linked a different inode for dentry. + * So we will update our local->inode, so that we can + * retrurn proper inode. + */ + tmp_inode = local->inode; + local->inode = inode; + inode_unref(tmp_inode); + tmp_inode = NULL; + } else { + inode_unref(inode); } - - return 0; + } out: - if (lock_frame) - dht_lock_stack_destroy (lock_frame); - - return -1; + loc_wipe(&loc); + if (dict) + dict_unref(dict); + return 0; } -int32_t -dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data) { - int lk_index = 0; - dht_local_t *local = NULL; - - lk_index = (long) cookie; - - local = frame->local; - - if (op_ret == 0) { - local->lock.locks[lk_index]->locked = _gf_true; - } else { - local->lock.op_ret = -1; - local->lock.op_errno = op_errno; - goto cleanup; - } - - if (lk_index == (local->lock.lk_count - 1)) { - dht_inodelk_done (frame); - } else { - dht_blocking_inodelk_rec (frame, ++lk_index); + call_frame_t *main_frame = NULL; + dht_local_t *local = NULL; + xlator_t *this = NULL; + int ret = -1; + int op_errno = 0; + + local = heal_frame->local; + main_frame = local->main_frame; + local->main_frame = NULL; + this = heal_frame->this; + + dht_set_fixed_dir_stat(&local->postparent); + if (local->need_xattr_heal) { + local->need_xattr_heal = 0; + ret = dht_dir_xattr_heal(this, local, &op_errno); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, op_errno, + DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path, + NULL); } + } - return 0; - -cleanup: - dht_inodelk_cleanup (frame); + DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf, + local->xattr, &local->postparent); - return 0; -} - -void -dht_blocking_inodelk_rec (call_frame_t *frame, int i) -{ - dht_local_t *local = NULL; - struct gf_flock flock = {0,}; - - local = frame->local; - - flock.l_type = local->lock.locks[i]->type; - - STACK_WIND_COOKIE (frame, dht_blocking_inodelk_cbk, - (void *) (long) i, - local->lock.locks[i]->xl, - local->lock.locks[i]->xl->fops->inodelk, - local->lock.locks[i]->domain, - &local->lock.locks[i]->loc, F_SETLKW, &flock, NULL); - - return; + DHT_STACK_DESTROY(heal_frame); + return 0; } +/* This function must be called inside an inode lock */ int -dht_lock_request_cmp (const void *val1, const void *val2) +__dht_lock_subvol_set(inode_t *inode, xlator_t *this, xlator_t *lock_subvol) { - dht_lock_t *lock1 = NULL; - dht_lock_t *lock2 = NULL; - int ret = 0; - - lock1 = *(dht_lock_t **)val1; - lock2 = *(dht_lock_t **)val2; + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + uint64_t value = 0; - GF_VALIDATE_OR_GOTO ("dht-locks", lock1, out); - GF_VALIDATE_OR_GOTO ("dht-locks", lock2, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - ret = strcmp (lock1->xl->name, lock2->xl->name); - - if (ret == 0) { - ret = uuid_compare (lock1->loc.gfid, lock2->loc.gfid); - } + ret = __inode_ctx_get0(inode, this, &value); + if (ret || !value) { + return -1; + } + ctx = (dht_inode_ctx_t *)(uintptr_t)value; + ctx->lock_subvol = lock_subvol; out: - return ret; + return ret; } -int -dht_lock_order_requests (dht_lock_t **locks, int count) +xlator_t * +dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock, + dht_local_t *local) { - int ret = -1; - - if (!locks || !count) - goto out; - - qsort (locks, count, sizeof (*locks), dht_lock_request_cmp); - ret = 0; - + xlator_t *subvol = NULL; + inode_t *inode = NULL; + int32_t ret = -1; + uint64_t value = 0; + xlator_t *cached_subvol = NULL; + dht_inode_ctx_t *ctx = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + GF_VALIDATE_OR_GOTO(this->name, lock, out); + GF_VALIDATE_OR_GOTO(this->name, local, out); + + cached_subvol = local->cached_subvol; + + if (local->loc.inode || local->fd) { + inode = local->loc.inode ? local->loc.inode : local->fd->inode; + } + + if (!inode) + goto out; + + if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) { + /* + * We may get non-linked inode for directories as part + * of the selfheal code path. So checking for IA_INVAL + * type also. This will only happen for directory. + */ + subvol = local->cached_subvol; + goto out; + } + + if (lock->l_type != F_UNLCK) { + /* + * inode purging might happen on NFS between a lk + * and unlk. Due to this lk and unlk might be sent + * to different subvols. + * So during a lock request, taking a ref on inode + * to prevent inode purging. inode unref will happen + * in unlock cbk code path. + */ + inode_ref(inode); + } + + LOCK(&inode->lock); + ret = __inode_ctx_get0(inode, this, &value); + if (!ret && value) { + ctx = (dht_inode_ctx_t *)(uintptr_t)value; + subvol = ctx->lock_subvol; + } + if (!subvol && lock->l_type != F_UNLCK && cached_subvol) { + ret = __dht_lock_subvol_set(inode, this, cached_subvol); + if (ret) { + gf_uuid_unparse(inode->gfid, gfid); + UNLOCK(&inode->lock); + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SET_INODE_CTX_FAILED, + "lock_subvol gfid=%s", gfid, NULL); + goto post_unlock; + } + subvol = cached_subvol; + } + UNLOCK(&inode->lock); +post_unlock: + if (!subvol && inode && lock->l_type != F_UNLCK) { + inode_unref(inode); + } out: - return ret; + return subvol; } int -dht_blocking_inodelk (call_frame_t *frame, dht_lock_t **lk_array, - int lk_count, fop_inodelk_cbk_t inodelk_cbk) +dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret) { - int ret = -1; - call_frame_t *lock_frame = NULL; - dht_local_t *local = NULL; - - GF_VALIDATE_OR_GOTO ("dht-locks", frame, out); - GF_VALIDATE_OR_GOTO (frame->this->name, lk_array, out); - GF_VALIDATE_OR_GOTO (frame->this->name, inodelk_cbk, out); - - lock_frame = dht_lock_frame (frame); - if (lock_frame == NULL) + int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *this = NULL; + char gfid[GF_UUID_BUF_SIZE] = {0}; + + local = frame->local; + this = frame->this; + + if (local->loc.inode || local->fd) { + inode = local->loc.inode ? local->loc.inode : local->fd->inode; + } + if (!inode) { + gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LOCK_INODE_UNREF_FAILED, + NULL); + goto out; + } + + if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) { + ret = 0; + goto out; + } + + switch (local->lock_type) { + case F_RDLCK: + case F_WRLCK: + if (op_ret) { + gf_uuid_unparse(inode->gfid, gfid); + gf_msg_debug(this->name, 0, "lock request failed for gfid %s", + gfid); + inode_unref(inode); goto out; - - ret = dht_local_lock_init (lock_frame, lk_array, lk_count, inodelk_cbk); - if (ret < 0) { + } + break; + + case F_UNLCK: + if (!op_ret) { + inode_unref(inode); + } else { + gf_uuid_unparse(inode->gfid, gfid); + gf_smsg(this->name, GF_LOG_WARNING, 0, + DHT_MSG_LOCK_INODE_UNREF_FAILED, "gfid=%s", gfid, NULL); goto out; - } - - local = lock_frame->local; - local->main_frame = frame; - - dht_blocking_inodelk_rec (lock_frame, 0); - - return 0; + } + default: + break; + } + ret = 0; out: - if (lock_frame) - dht_lock_stack_destroy (lock_frame); + return ret; +} - return -1; +/* Code to update custom extended attributes from src dict to dst dict + */ +void +dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst, + dict_t *src, int *uret, int *uflag) +{ + int ret = -1; + data_t *keyval = NULL; + int luret = -1; + int luflag = -1; + int i = 0; + char **xattrs_to_heal; + + if (!src || !dst) { + gf_smsg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DST_NULL_SET_FAILED, + "path=%s", local->loc.path, NULL); + return; + } + /* Check if any user xattr present in src dict and set + it to dst dict + */ + luret = dict_foreach_fnmatch(src, "user.*", dht_set_user_xattr, dst); + /* Check if any other custom xattr present in src dict + and set it to dst dict, here index start from 1 because + user xattr already checked in previous statement + */ + + xattrs_to_heal = get_xattrs_to_heal(); + + for (i = 1; xattrs_to_heal[i]; i++) { + keyval = dict_get(src, xattrs_to_heal[i]); + if (keyval) { + luflag = 1; + ret = dict_set(dst, xattrs_to_heal[i], keyval); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, ENOMEM, + DHT_MSG_DICT_SET_FAILED, "key=%s", xattrs_to_heal[i], + "path=%s", local->loc.path, NULL); + keyval = NULL; + } + } + if (uret) + (*uret) = luret; + if (uflag) + (*uflag) = luflag; } |
