Diffstat (limited to 'xlators/cluster/afr/src')
21 files changed, 2212 insertions, 756 deletions
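The centerpiece of this series is healing of granted fcntl locks: afr_lk() now routes mandatory-mode requests through a synctask (afr_lk_transaction) that takes an AFR_LK_HEAL_DOM domain lock, winds the lock to the children holding that domain lock, and records it in priv->saved_locks so it can be re-acquired when a brick returns. The routing decision rests entirely on the GF_LOCK_MODE key in xdata, checked by the new afr_is_lock_mode_mandatory() in the first file below. A minimal caller-side sketch, assuming libglusterfs's dict API and the GF_LOCK_MODE/GF_LK_MANDATORY definitions referenced in the hunks; the helper name is hypothetical and not part of this patch:

    /* Hypothetical helper: tag an lk request so AFR treats it as a
     * mandatory lock and takes the lock-healing transaction path. */
    static int
    set_mandatory_lock_mode(dict_t *xdata)
    {
        /* Any other value, or a missing key, keeps afr_lk() on the
         * existing advisory code path. */
        return dict_set_uint32(xdata, GF_LOCK_MODE, GF_LK_MANDATORY);
    }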
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 5e399ee98df..032ab5c8001 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,6 +45,56 @@ afr_quorum_errno(afr_private_t *priv) return ENOTCONN; } +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid) +{ + if (!__is_root_gfid(pargfid)) { + return _gf_false; + } + + if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) { + /*For backward compatibility /.landfill is private*/ + return _gf_true; + } + + if (pid == GF_CLIENT_PID_GSYNCD) { + /*geo-rep needs to create/sync private directory on slave because + * it appears in changelog*/ + return _gf_false; + } + + if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) { + if (strcmp(name, priv->anon_inode_name) == 0) { + /* anonymous-inode dir is private*/ + return _gf_true; + } + } else { + if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) == + 0) { + /* anonymous-inode dir prefix is private for geo-rep to work*/ + return _gf_true; + } + } + + return _gf_false; +} + +void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies) +{ + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + replies[i] = 1; + } else { + replies[i] = 0; + } + } +} + int afr_fav_child_reset_sink_xattrs(void *opaque); @@ -54,6 +104,581 @@ afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque); static void afr_discover_done(call_frame_t *frame, xlator_t *this); +int +afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + local->cont.lk.dom_lock_op_ret[i] = op_ret; + local->cont.lk.dom_lock_op_errno[i] = op_errno; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to acquire %s on %s", + uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM, + priv->children[i]->name); + } else { + local->cont.lk.dom_locked_nodes[i] = 1; + } + + syncbarrier_wake(&local->barrier); + + return 0; +} + +int +afr_dom_lock_acquire(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + int i = 0; + + priv = frame->this->private; + local = frame->local; + local->cont.lk.dom_locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + if (!local->cont.lk.dom_locked_nodes) { + return -ENOMEM; + } + local->cont.lk.dom_lock_op_ret = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_ret) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + local->cont.lk.dom_lock_op_errno = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_errno) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. 
*/ + } + flock.l_type = F_WRLCK; + + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLK, &flock, NULL); + + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) + goto blocking_lock; + + /*If any of the bricks returned EAGAIN, we still need blocking locks.*/ + if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) != + priv->child_count) { + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.dom_lock_op_ret[i] == -1 && + local->cont.lk.dom_lock_op_errno[i] == EAGAIN) + goto blocking_lock; + } + } + + return 0; + +blocking_lock: + afr_dom_lock_release(frame); + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLKW, &flock, NULL); + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) { + afr_dom_lock_release(frame); + return -afr_quorum_errno(priv); + } + + return 0; +} + +int +afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to release %s on %s", local->loc.path, + AFR_LK_HEAL_DOM, priv->children[i]->name); + } + local->cont.lk.dom_locked_nodes[i] = 0; + + syncbarrier_wake(&local->barrier); + + return 0; +} + +void +afr_dom_lock_release(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + priv = frame->this->private; + locked_on = local->cont.lk.dom_locked_nodes; + if (AFR_COUNT(locked_on, priv->child_count) == 0) + return; + flock.l_type = F_UNLCK; + + AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk, + AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL); + + return; +} + +static void +afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info) +{ + if (!info) + return; + if (info->xdata_req) + dict_unref(info->xdata_req); + if (info->fd) + fd_unref(info->fd); + GF_FREE(info->locked_nodes); + GF_FREE(info->child_up_event_gen); + GF_FREE(info->child_down_event_gen); + GF_FREE(info); +} + +static int +afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -ENOMEM; + + info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t); + if (!info) { + goto cleanup; + } + INIT_LIST_HEAD(&info->pos); + info->fd = fd_ref(local->fd); + info->cmd = local->cont.lk.cmd; + info->pid = frame->root->pid; + info->flock = local->cont.lk.user_flock; + info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL); + if (!info->xdata_req) { + goto cleanup; + } + info->lk_owner = frame->root->lk_owner; + info->locked_nodes = GF_MALLOC( + sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char); + if (!info->locked_nodes) { + goto cleanup; + } + memcpy(info->locked_nodes, local->cont.lk.locked_nodes, + sizeof(*info->locked_nodes) * priv->child_count); + info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen), + priv->child_count, gf_afr_mt_int32_t); + if (!info->child_up_event_gen) { + goto cleanup; + } + info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen), + priv->child_count, + gf_afr_mt_int32_t); + if (!info->child_down_event_gen) { + goto cleanup; + } + + 
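/* All per-child bookkeeping for this lock is allocated at this point. What
 * follows publishes info on the fd context (so unlock and fd-cleanup paths
 * can find it) and queues it on priv->saved_locks, the list from which
 * __afr_lock_heal_synctask() requeues locks for healing on the next
 * child-up event. */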
LOCK(&local->fd->lock); + { + fd_ctx = __afr_fd_ctx_get(local->fd, this); + if (fd_ctx) + fd_ctx->lk_heal_info = info; + } + UNLOCK(&local->fd->lock); + if (!fd_ctx) { + goto cleanup; + } + + LOCK(&priv->lock); + { + list_add_tail(&info->pos, &priv->saved_locks); + } + UNLOCK(&priv->lock); + + return 0; +cleanup: + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to add lock to healq", + uuid_utoa(local->fd->inode->gfid)); + if (info) { + afr_lk_heal_info_cleanup(info); + if (fd_ctx) { + LOCK(&local->fd->lock); + { + fd_ctx->lk_heal_info = NULL; + } + UNLOCK(&local->fd->lock); + } + } + return ret; +} + +static int +afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = local->cont.lk.user_flock; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -EINVAL; + + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx || !fd_ctx->lk_heal_info) { + goto out; + } + + info = fd_ctx->lk_heal_info; + if ((info->flock.l_start != flock.l_start) || + (info->flock.l_whence != flock.l_whence) || + (info->flock.l_len != flock.l_len)) { + /*TODO: Compare lkowners too.*/ + goto out; + } + + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + UNLOCK(&priv->lock); + + afr_lk_heal_info_cleanup(info); + fd_ctx->lk_heal_info = NULL; + ret = 0; +out: + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to remove lock from healq", + uuid_utoa(local->fd->inode->gfid)); + return ret; +} + +int +afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed to heal lock on child %d for %s", i, + uuid_utoa(local->fd->inode->gfid)); + } + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); + } else { + local->cont.lk.getlk_rsp[i] = *lock; + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +static gf_boolean_t +afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + afr_local_t *local = frame->local; + struct gf_flock flock = { + 0, + }; + gf_boolean_t ret = _gf_true; + char *wind_on = alloca0(priv->child_count); + unsigned char *success_replies = alloca0(priv->child_count); + local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp), + priv->child_count, gf_afr_mt_gf_lock); + + flock = info->flock; + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + wind_on[i] = 1; + } + + AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock, + info->xdata_req); + + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) == 0) { + ret = _gf_false; + goto out; + } + + for (i = 0; 
i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret != 0) + continue; + if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK) + continue; + /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/ + if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner, + &info->lk_owner)) { + ret = _gf_false; + break; + } + } +out: + afr_local_replies_wipe(local, priv); + GF_FREE(local->cont.lk.getlk_rsp); + local->cont.lk.getlk_rsp = NULL; + return ret; +} + +static void +afr_mark_fd_bad(fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + if (!fd) + return; + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + if (fd_ctx) { + fd_ctx->is_fd_bad = _gf_true; + fd_ctx->lk_heal_info = NULL; + } + } + UNLOCK(&fd->lock); +} + +static void +afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info) +{ + LOCK(&priv->lock); + { + list_del(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + UNLOCK(&priv->lock); +} + +static void +afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + int op_errno = 0; + int32_t *current_event_gen = NULL; + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + char *wind_on = alloca0(priv->child_count); + gf_boolean_t retry = _gf_true; + + frame->root->pid = info->pid; + lk_owner_copy(&frame->root->lk_owner, &info->lk_owner); + + op_errno = -afr_dom_lock_acquire(frame); + if ((op_errno != 0)) { + goto release; + } + + if (!afr_does_lk_owner_match(frame, priv, info)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM, + "Ignoring lock heal for %s since lk-onwers mismatch. " + "Lock possibly pre-empted by another client.", + uuid_utoa(info->fd->inode->gfid)); + goto release; + } + + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + continue; + wind_on[i] = 1; + } + + current_event_gen = alloca(priv->child_count); + memcpy(current_event_gen, info->child_up_event_gen, + priv->child_count * sizeof *current_event_gen); + AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd, + &info->flock, info->xdata_req); + + LOCK(&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) { + continue; + } + + if ((current_event_gen[i] == info->child_up_event_gen[i]) && + (current_event_gen[i] > info->child_down_event_gen[i])) { + info->locked_nodes[i] = 1; + retry = _gf_false; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->saved_locks); + } else { + /*We received subsequent child up/down events while heal was in + * progress; don't mark child as healed. 
Attempt again on the + * new child up*/ + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM, + "Event gen mismatch: skipped healing lock on child %d " + "for %s.", + i, uuid_utoa(info->fd->inode->gfid)); + } + } + } + UNLOCK(&priv->lock); + +release: + afr_dom_lock_release(frame); + if (retry) + afr_add_lock_to_lkhealq(priv, info); + return; +} + +static int +afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + STACK_DESTROY(frame->root); + return 0; +} + +static int +afr_lock_heal(void *opaque) +{ + call_frame_t *frame = (call_frame_t *)opaque; + call_frame_t *iter_frame = NULL; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + struct list_head healq = { + 0, + }; + int ret = 0; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + return ENOMEM; + } + + INIT_LIST_HEAD(&healq); + LOCK(&priv->lock); + { + list_splice_init(&priv->lk_healq, &healq); + } + UNLOCK(&priv->lock); + + list_for_each_entry_safe(info, tmp, &healq, pos) + { + GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) < + priv->child_count)); + ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd); + afr_lock_heal_do(iter_frame, priv, info); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = ENOTCONN; + gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN, + AFR_MSG_LK_HEAL_DOM, + "Aborting processing of lk_healq." + "Healing will be reattempted on next child up for locks " + "that are still in quorum."); + LOCK(&priv->lock); + { + list_add_tail(&healq, &priv->lk_healq); + } + UNLOCK(&priv->lock); + break; + } + } + + AFR_STACK_DESTROY(iter_frame); + return ret; +} + +static int +__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_up_event_gen[child] = priv->event_generation; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) + return -1; + + ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame, + frame); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM, + "Failed to launch lock heal synctask"); + + return ret; +} + +static int +__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child) +{ + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_down_event_gen[child] = priv->event_generation; + if (info->locked_nodes[child] == 1) + info->locked_nodes[child] = 0; + if (!afr_has_quorum(info->locked_nodes, this, NULL)) { + /* Since the lock was lost on quorum no. of nodes, we should + * not attempt to heal it anymore. Some other client could have + * acquired the lock, modified data and released it and this + * client wouldn't know about it if we heal it.*/ + afr_mark_fd_bad(info->fd, this); + list_del(&info->pos); + afr_lk_heal_info_cleanup(info); + /* We're not winding an unlock on the node where the lock is still + * present because when fencing logic switches over to the new + * client (since we marked the fd bad), it should preempt any + * existing lock. 
*/ + } + } + return 0; +} + gf_boolean_t afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) @@ -68,6 +693,19 @@ afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, return _gf_true; } +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata) +{ + int ret = 0; + uint32_t lk_mode = GF_LK_ADVISORY; + + ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode); + if (!ret && lk_mode == GF_LK_MANDATORY) + return _gf_true; + + return _gf_false; +} + call_frame_t * afr_copy_frame(call_frame_t *base) { @@ -282,7 +920,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, metadatamap |= (1 << index); } if (metadatamap_old != metadatamap) { - event = 0; + __afr_inode_need_refresh_set(inode, this); } break; @@ -295,7 +933,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, datamap |= (1 << index); } if (datamap_old != datamap) - event = 0; + __afr_inode_need_refresh_set(inode, this); break; default: @@ -459,34 +1097,6 @@ out: } int -__afr_inode_event_gen_reset_small(inode_t *inode, xlator_t *this) -{ - int ret = -1; - uint16_t datamap = 0; - uint16_t metadatamap = 0; - uint32_t event = 0; - uint64_t val = 0; - afr_inode_ctx_t *ctx = NULL; - - ret = __afr_inode_ctx_get(this, inode, &ctx); - if (ret) - return ret; - - val = ctx->read_subvol; - - metadatamap = (val & 0x000000000000ffff) >> 0; - datamap = (val & 0x00000000ffff0000) >> 16; - event = 0; - - val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | - (((uint64_t)event) << 32); - - ctx->read_subvol = val; - - return ret; -} - -int __afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int *event_p) { @@ -557,22 +1167,6 @@ out: } int -__afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - if (priv->child_count <= 16) - ret = __afr_inode_event_gen_reset_small(inode, this); - else - ret = -1; - - return ret; -} - -int afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int *event_p) { @@ -638,12 +1232,11 @@ afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, return 0; } -int +static int afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, int *spb_choice) { int ret = -1; - GF_VALIDATE_OR_GOTO(this->name, inode, out); LOCK(&inode->lock); @@ -655,6 +1248,40 @@ out: return ret; } +/* + * frame is used to get the favourite policy. Since + * afr_inode_split_brain_choice_get was called with afr_open, it is possible to + * have a frame with out local->replies. So in that case, frame is passed as + * null, hence this function will handle the frame NULL case. 
+ */ +int +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol) +{ + int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO("afr", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out); + + priv = this->private; + + ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol); + if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) { + local = frame->local; + *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (*spb_subvol >= 0) { + ret = 0; + } + } + +out: + return ret; +} int afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, unsigned char *metadata, int event) @@ -721,30 +1348,22 @@ out: return need_refresh; } -static int -afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) +int +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { int ret = -1; afr_inode_ctx_t *ctx = NULL; - GF_VALIDATE_OR_GOTO(this->name, inode, out); - - LOCK(&inode->lock); - { - ret = __afr_inode_ctx_get(this, inode, &ctx); - if (ret) - goto unlock; - + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret == 0) { ctx->need_refresh = _gf_true; } -unlock: - UNLOCK(&inode->lock); -out: + return ret; } int -afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { int ret = -1; @@ -752,7 +1371,7 @@ afr_inode_event_gen_reset(inode_t *inode, xlator_t *this) LOCK(&inode->lock); { - ret = __afr_inode_event_gen_reset(inode, this); + ret = __afr_inode_need_refresh_set(inode, this); } UNLOCK(&inode->lock); out: @@ -1049,6 +1668,8 @@ afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode, ia_type = inode->ia_type; } + if (!xdata) + continue; /* mkdir_cbk sends NULL xdata_rsp. */ afr_accused_fill(this, xdata, data_accused, (ia_type == IA_IFDIR) ? 
AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); @@ -1185,7 +1806,7 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) ret = afr_inode_get_readable(frame, inode, this, local->readable, &event_generation, local->transaction.type); - if (ret == -EIO || (local->is_read_txn && !event_generation)) { + if (ret == -EIO) { /* No readable subvolume even after refresh ==> splitbrain.*/ if (!priv->fav_child_policy) { err = EIO; @@ -1224,18 +1845,6 @@ refresh_done: return 0; } -static void -afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, - unsigned char *replies) -{ - int i = 0; - - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].valid && local->replies[i].op_ret == 0) - replies[i] = 1; - } -} - int afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) { @@ -1318,17 +1927,22 @@ afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (xdata) local->replies[call_child].xdata = dict_ref(xdata); } + if (xdata) { ret = dict_get_int8(xdata, "link-count", &need_heal); - local->replies[call_child].need_heal = need_heal; - } else { - local->replies[call_child].need_heal = need_heal; + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to get link count"); + } } + local->replies[call_child].need_heal = need_heal; call_count = afr_frame_return(frame); if (call_count == 0) { afr_set_need_heal(this, local); ret = afr_inode_refresh_err(frame, this); + if (ret) { + gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed"); + } afr_inode_refresh_done(frame, this, ret); } } @@ -1692,8 +2306,9 @@ afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, * need is a low probability that multiple clients * won't converge on the same subvolume. */ + gf_uuid_copy(gfid_copy, args->gfid); pid = getpid(); - memcpy(gfid_copy, &pid, sizeof(pid)); + *(pid_t *)gfid_copy ^= pid; } child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % priv->child_count; @@ -2044,6 +2659,9 @@ afr_local_cleanup(afr_local_t *local, xlator_t *this) { /* lk */ GF_FREE(local->cont.lk.locked_nodes); + GF_FREE(local->cont.lk.dom_locked_nodes); + GF_FREE(local->cont.lk.dom_lock_op_ret); + GF_FREE(local->cont.lk.dom_lock_op_errno); } { /* create */ @@ -2274,7 +2892,7 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int spb_choice = -1; + int spb_subvol = -1; int child_count = -1; if (*read_subvol != -1) @@ -2284,10 +2902,10 @@ afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, local = frame->local; child_count = priv->child_count; - afr_inode_split_brain_choice_get(local->inode, this, &spb_choice); - if ((spb_choice >= 0) && + afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol); + if ((spb_subvol >= 0) && (AFR_COUNT(success_replies, child_count) == child_count)) { - *read_subvol = spb_choice; + *read_subvol = spb_subvol; } else if (!priv->quorum_count || frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) { *read_subvol = afr_first_up_child(frame, this); @@ -2328,6 +2946,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) 0, }; gf_boolean_t locked_entry = _gf_false; + gf_boolean_t in_flight_create = _gf_false; gf_boolean_t can_interpret = _gf_true; inode_t *parent = NULL; ia_type_t ia_type = IA_INVAL; @@ -2371,17 +2990,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) if (!replies[i].valid) continue; - if (locked_entry && replies[i].op_ret == -1 && - replies[i].op_errno == ENOENT) { - /* Second, check entry is still - 
"underway" in creation */ - local->op_ret = -1; - local->op_errno = ENOENT; - goto error; - } - - if (replies[i].op_ret == -1) + if (replies[i].op_ret == -1) { + if (locked_entry && replies[i].op_errno == ENOENT) { + in_flight_create = _gf_true; + } continue; + } if (read_subvol == -1 || !readable[read_subvol]) { read_subvol = i; @@ -2391,6 +3005,12 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) } } + if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto error; + } + if (read_subvol == -1) goto error; /* We now have a read_subvol, which is readable[] (if there @@ -2449,7 +3069,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this) if (read_subvol == -1) goto cant_interpret; if (ret) { - afr_inode_event_gen_reset(local->inode, this); + afr_inode_need_refresh_set(local->inode, this); dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY); } } else { @@ -2502,7 +3122,7 @@ error: * others in that they must be given higher priority while * returning to the user. * - * The hierarchy is ENODATA > ENOENT > ESTALE > others + * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC others */ int @@ -2514,6 +3134,8 @@ afr_higher_errno(int32_t old_errno, int32_t new_errno) return ENOENT; if (old_errno == ESTALE || new_errno == ESTALE) return ESTALE; + if (old_errno == ENOSPC || new_errno == ENOSPC) + return ENOSPC; return new_errno; } @@ -3005,6 +3627,7 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; afr_local_t *local = NULL; int read_subvol = -1; + int ret = 0; unsigned char *data_readable = NULL; unsigned char *success_replies = NULL; @@ -3026,7 +3649,10 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this) if (!afr_has_quorum(success_replies, this, frame)) goto unwind; - afr_replies_interpret(frame, this, local->inode, NULL); + ret = afr_replies_interpret(frame, this, local->inode, NULL); + if (ret) { + afr_inode_need_refresh_set(local->inode, this); + } read_subvol = afr_read_subvol_decide(local->inode, this, NULL, data_readable); @@ -3078,7 +3704,7 @@ afr_ta_id_file_check(void *opaque) this = opaque; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_false); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate thin-arbiter loc for: %s.", loc.name); @@ -3269,10 +3895,15 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) local->inode = inode_ref(loc->inode); - if (xattr_req) + if (xattr_req) { /* If xattr_req was null, afr_lookup_xattr_req_prepare() will allocate one for us */ - local->xattr_req = dict_ref(xattr_req); + local->xattr_req = dict_copy_with_ref(xattr_req, NULL); + if (!local->xattr_req) { + op_errno = ENOMEM; + goto out; + } + } if (gf_uuid_is_null(loc->inode->gfid)) { afr_discover_do(frame, this, 0); @@ -3282,11 +3913,7 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get(loc->inode, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); - if (afr_is_inode_refresh_reqd(loc->inode, this, event, - local->event_generation)) - afr_inode_refresh(frame, this, loc->inode, NULL, afr_discover_do); - else - afr_discover_do(frame, this, 0); + afr_discover_do(frame, this, 0); return 0; out: @@ -3387,11 +4014,10 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) return 0; } - if (__is_root_gfid(loc->parent->gfid)) { - if (!strcmp(loc->name, 
GF_REPLICATE_TRASH_DIR)) { - op_errno = EPERM; - goto out; - } + if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name, + frame->root->pid)) { + op_errno = EPERM; + goto out; } local = AFR_FRAME_INIT(frame, op_errno); @@ -3427,11 +4053,7 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) afr_read_subvol_get(loc->parent, this, NULL, NULL, &event, AFR_DATA_TRANSACTION, NULL); - if (afr_is_inode_refresh_reqd(loc->inode, this, event, - local->event_generation)) - afr_inode_refresh(frame, this, loc->parent, NULL, afr_lookup_do); - else - afr_lookup_do(frame, this, 0); + afr_lookup_do(frame, this, 0); return 0; out: @@ -3441,8 +4063,18 @@ out: } void -_afr_cleanup_fd_ctx(afr_fd_ctx_t *fd_ctx) +_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx) { + afr_private_t *priv = this->private; + + if (fd_ctx->lk_heal_info) { + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info); + fd_ctx->lk_heal_info = NULL; + } GF_FREE(fd_ctx->opened_on); GF_FREE(fd_ctx); return; @@ -3462,7 +4094,7 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long)ctx; if (fd_ctx) { - _afr_cleanup_fd_ctx(fd_ctx); + _afr_cleanup_fd_ctx(this, fd_ctx); } out: @@ -3555,13 +4187,14 @@ __afr_fd_ctx_set(xlator_t *this, fd_t *fd) } fd_ctx->readdir_subvol = -1; + fd_ctx->lk_heal_info = NULL; ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx); if (ret) gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd); out: if (ret && fd_ctx) - _afr_cleanup_fd_ctx(fd_ctx); + _afr_cleanup_fd_ctx(this, fd_ctx); return ret; } @@ -3585,11 +4218,10 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, } else { local->op_errno = op_errno; } + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno, local->xdata_rsp); @@ -3685,6 +4317,7 @@ afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) call_stub_t *stub = NULL; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -3725,11 +4358,10 @@ afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } else { local->op_errno = op_errno; } + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno, local->xdata_rsp); @@ -4222,9 +4854,9 @@ out: } static int32_t -afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, - loc_t *loc, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) { afr_local_t *local = NULL; int32_t op_errno = ENOMEM; @@ -4236,8 +4868,10 @@ afr_handle_inodelk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, local->op = fop; if (loc) loc_copy(&local->loc, loc); - if (fd) + if (fd && (flock->l_type != F_UNLCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local->fd = fd_ref(fd); + } local->cont.inodelk.volume = gf_strdup(volume); if (!local->cont.inodelk.volume) { @@ -4266,8 +4900,8 @@ int32_t afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, 
dict_t *xdata) { - afr_handle_inodelk(frame, GF_FOP_INODELK, volume, loc, NULL, cmd, flock, - xdata); + afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd, + flock, xdata); return 0; } @@ -4275,15 +4909,16 @@ int32_t afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_handle_inodelk(frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, flock, - xdata); + afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd, + flock, xdata); return 0; } static int -afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, - loc_t *loc, fd_t *fd, const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_local_t *local = NULL; int32_t op_errno = ENOMEM; @@ -4295,8 +4930,10 @@ afr_handle_entrylk(call_frame_t *frame, glusterfs_fop_t fop, const char *volume, local->op = fop; if (loc) loc_copy(&local->loc, loc); - if (fd) + if (fd && (cmd != ENTRYLK_UNLOCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local->fd = fd_ref(fd); + } local->cont.entrylk.cmd = cmd; local->cont.entrylk.in_cmd = cmd; local->cont.entrylk.type = type; @@ -4323,8 +4960,8 @@ afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - afr_handle_entrylk(frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, cmd, - type, xdata); + afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename, + cmd, type, xdata); return 0; } @@ -4333,8 +4970,8 @@ afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - afr_handle_entrylk(frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, cmd, - type, xdata); + afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename, + cmd, type, xdata); return 0; } @@ -4346,10 +4983,10 @@ afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int call_count = 0; struct statvfs *buf = NULL; + local = frame->local; + LOCK(&frame->lock); { - local = frame->local; - if (op_ret != 0) { local->op_errno = op_errno; goto unlock; @@ -4375,10 +5012,9 @@ afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, } } unlock: + call_count = --local->call_count; UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno, &local->cont.statfs.buf, local->xdata_rsp); @@ -4453,9 +5089,10 @@ afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } call_count = afr_frame_return(frame); - if (call_count == 0) + if (call_count == 0) { AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, local->xdata_rsp); + } return 0; } @@ -4554,11 +5191,133 @@ afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, } int +afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque) +{ + return 0; +} + +int +afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = -1; + + local = frame->local; + child_index = (long)cookie; + 
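/* cookie encodes the index of the child (brick) that sent this reply; on
 * success the per-child locked_nodes bitmap and the returned flock are
 * recorded before the synctask barrier is woken. */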
afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int child_index = (long)cookie; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + return 0; +} +int +afr_lk_transaction(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *wind_on = NULL; + int op_errno = 0; + int i = 0; + int ret = 0; + + frame = (call_frame_t *)opaque; + local = frame->local; + this = frame->this; + priv = this->private; + wind_on = alloca0(priv->child_count); + + if (priv->arbiter_count || priv->child_count != 3) { + op_errno = ENOTSUP; + gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Lock healing supported only for replica 3 volumes.", + uuid_utoa(local->fd->inode->gfid)); + goto err; + } + + op_errno = -afr_dom_lock_acquire(frame); // Released during + // AFR_STACK_UNWIND + if (op_errno != 0) { + goto err; + } + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) { + op_errno = afr_final_errno(local, priv); + goto err; + } + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i]) + wind_on[i] = 1; + } + AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + goto unlock; + } else { + if (local->cont.lk.user_flock.l_type == F_UNLCK) + ret = afr_remove_lock_from_saved_locks(local, this); + else + ret = afr_add_lock_to_saved_locks(frame, this); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; + } + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } + + return 0; + +unlock: + local->cont.lk.user_flock.l_type = F_UNLCK; + AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, NULL); +err: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return -1; +} + +int afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; + int ret = 0; int i = 0; int32_t op_errno = ENOMEM; @@ -4569,9 +5328,11 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, goto out; local->op = GF_FOP_LK; - if (!afr_lk_is_unlock(cmd, flock) && - !afr_is_consistent_io_possible(local, priv, &op_errno)) - goto out; + if (!afr_lk_is_unlock(cmd, flock)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + } local->cont.lk.locked_nodes = 
GF_CALLOC( priv->child_count, sizeof(*local->cont.lk.locked_nodes), @@ -4589,6 +5350,16 @@ afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, if (xdata) local->xdata_req = dict_ref(xdata); + if (afr_is_lock_mode_mandatory(xdata)) { + ret = synctask_new(this->ctx->env, afr_lk_transaction, + afr_lk_transaction_cbk, frame, frame); + if (ret) { + op_errno = ENOMEM; + goto out; + } + return 0; + } + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i], priv->children[i]->fops->lk, fd, cmd, flock, local->xdata_req); @@ -4910,6 +5681,8 @@ afr_priv_dump(xlator_t *this) GF_ATOMIC_GET(priv->pending_reads[i])); sprintf(key, "child_latency[%d]", i); gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]); + sprintf(key, "halo_child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->halo_child_up[i]); } gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal); gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); @@ -4922,6 +5695,7 @@ afr_priv_dump(xlator_t *this) priv->background_self_heal_count); gf_proc_dump_write("healers", "%d", priv->healers); gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode); + gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode); if (priv->quorum_count == AFR_QUORUM_AUTO) { gf_proc_dump_write("quorum-type", "auto"); } else if (priv->quorum_count == 0) { @@ -4982,13 +5756,31 @@ __afr_get_up_children_count(afr_private_t *priv) return up_children; } +static int +__get_heard_from_all_status(xlator_t *this) +{ + afr_private_t *priv = this->private; + int i; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + return 0; + } + } + if (priv->thin_arbiter_count && !priv->ta_child_up) { + return 0; + } + return 1; +} + glusterfs_event_t -__afr_transform_event_from_state(afr_private_t *priv) +__afr_transform_event_from_state(xlator_t *this) { int i = 0; int up_children = 0; + afr_private_t *priv = this->private; - if (AFR_COUNT(priv->last_event, priv->child_count) == priv->child_count) + if (__get_heard_from_all_status(this)) /* have_heard_from_all. Let afr_notify() do the propagation. 
*/ return GF_EVENT_MAXVAL; @@ -5030,7 +5822,7 @@ afr_notify_cbk(void *data) goto unlock; } priv->timer = NULL; - event = __afr_transform_event_from_state(priv); + event = __afr_transform_event_from_state(this); if (event != GF_EVENT_MAXVAL) propagate = _gf_true; } @@ -5057,22 +5849,6 @@ __afr_launch_notify_timer(xlator_t *this, afr_private_t *priv) } } -int -__get_heard_from_all_status(xlator_t *this) -{ - afr_private_t *priv = this->private; - int heard_from_all = 1; - int i = 0; - - for (i = 0; i < priv->child_count; i++) { - if (!priv->last_event[i]) { - heard_from_all = 0; - break; - } - } - return heard_from_all; -} - static int find_best_down_child(xlator_t *this) { @@ -5084,7 +5860,7 @@ find_best_down_child(xlator_t *this) priv = this->private; for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i] && priv->child_latency[i] >= 0 && + if (!priv->child_up[i] && priv->child_latency[i] >= 0 && priv->child_latency[i] < best_latency) { best_child = i; best_latency = priv->child_latency[i]; @@ -5156,7 +5932,9 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx, "), " "marking child down.", child_latency_msec, halo_max_latency_msec); - *event = GF_EVENT_CHILD_DOWN; + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_DOWN; + } } } else if (child_latency_msec < halo_max_latency_msec && priv->child_up[idx] == 0) { @@ -5168,7 +5946,9 @@ __afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx, "), " "marking child up.", child_latency_msec, halo_max_latency_msec); - *event = GF_EVENT_CHILD_UP; + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_UP; + } } else { gf_log(child_xlator->name, GF_LOG_INFO, "Not marking child %d up, " @@ -5235,7 +6015,10 @@ __afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, if (child_latency_msec < 0) { /*set to INT64_MAX-1 so that it is found for best_down_child*/ - priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; + priv->halo_child_up[idx] = 1; + if (priv->child_latency[idx] < 0) { + priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; + } } /* @@ -5324,6 +6107,7 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, */ if (child_latency_msec < 0) { priv->child_latency[idx] = child_latency_msec; + priv->halo_child_up[idx] = 0; } priv->child_up[idx] = 0; @@ -5588,6 +6372,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) __afr_handle_child_up_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); + __afr_lock_heal_synctask(this, priv, idx); break; case GF_EVENT_CHILD_DOWN: @@ -5601,6 +6386,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) __afr_handle_child_down_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); + __afr_mark_pending_lk_heal(this, priv, idx); break; case GF_EVENT_CHILD_CONNECTING: @@ -5694,7 +6480,7 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } - local->child_up = GF_CALLOC(priv->child_count, sizeof(*local->child_up), + local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up), gf_afr_mt_char); if (!local->child_up) { if (op_errno) @@ -5886,6 +6672,8 @@ afr_priv_destroy(afr_private_t *priv) if (!priv) goto out; + + GF_FREE(priv->sh_domain); GF_FREE(priv->last_event); child_count = priv->child_count; @@ -5901,7 +6689,9 @@ afr_priv_destroy(afr_private_t *priv) GF_FREE(priv->local); GF_FREE(priv->pending_key); GF_FREE(priv->children); + GF_FREE(priv->anon_inode); 
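/* The anon-inode and halo bookkeeping introduced by this series is freed
 * here alongside the long-standing per-child arrays. */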
GF_FREE(priv->child_up); + GF_FREE(priv->halo_child_up); GF_FREE(priv->child_latency); LOCK_DESTROY(&priv->lock); @@ -5952,21 +6742,6 @@ out: return changelog; } -gf_boolean_t -afr_decide_heal_info(afr_private_t *priv, unsigned char *sources, int source) -{ - int sources_count = 0; - - if (source < 0) - goto out; - - sources_count = AFR_COUNT(sources, priv->child_count); - if (sources_count == priv->child_count) - return _gf_false; -out: - return _gf_true; -} - static dict_t * afr_set_heal_info(char *status) { @@ -6054,8 +6829,8 @@ afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies, static int afr_update_heal_status(xlator_t *this, struct afr_reply *replies, - char *index_vgfid, ia_type_t ia_type, gf_boolean_t *esh, - gf_boolean_t *dsh, gf_boolean_t *msh) + ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh, + gf_boolean_t *msh, unsigned char pending) { int ret = -1; GF_UNUSED int ret1 = 0; @@ -6085,14 +6860,7 @@ afr_update_heal_status(xlator_t *this, struct afr_reply *replies, } } - if (!strcmp(index_vgfid, GF_XATTROP_INDEX_GFID)) { - if (shd_domain_lk_count) { - ret = -EAGAIN; /*For 'possibly-healing'. */ - } else { - ret = 0; /*needs heal. Just set a non -ve value so that it is - assumed as the source index.*/ - } - } else if (!strcmp(index_vgfid, GF_XATTROP_DIRTY_GFID)) { + if (!pending) { if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) || (!io_domain_lk_count)) { /* Needs heal. */ @@ -6101,6 +6869,13 @@ afr_update_heal_status(xlator_t *this, struct afr_reply *replies, /* No heal needed. */ *dsh = *esh = *msh = 0; } + } else { + if (shd_domain_lk_count) { + ret = -EAGAIN; /*For 'possibly-healing'. */ + } else { + ret = 0; /*needs heal. Just set a non -ve value so that it is + assumed as the source index.*/ + } } return ret; } @@ -6108,8 +6883,8 @@ afr_update_heal_status(xlator_t *this, struct afr_reply *replies, /*return EIO, EAGAIN or pending*/ int afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, - inode_t **inode, char *index_vgfid, - gf_boolean_t *entry_selfheal, gf_boolean_t *data_selfheal, + inode_t **inode, gf_boolean_t *entry_selfheal, + gf_boolean_t *data_selfheal, gf_boolean_t *metadata_selfheal, unsigned char *pending) { int ret = -1; @@ -6168,8 +6943,8 @@ afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, goto out; } - ret = afr_update_heal_status(this, replies, index_vgfid, (*inode)->ia_type, - &esh, &dsh, &msh); + ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh, + &msh, *pending); out: *data_selfheal = dsh; *entry_selfheal = esh; @@ -6194,14 +6969,6 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) char *status = NULL; call_frame_t *heal_frame = NULL; afr_local_t *heal_local = NULL; - afr_local_t *local = NULL; - char *index_vgfid = NULL; - - local = frame->local; - if (dict_get_str(local->xdata_req, "index-vgfid", &index_vgfid)) { - ret = -1; - goto out; - } /*Use frame with lk-owner set*/ heal_frame = afr_frame_create(frame->this, &op_errno); @@ -6212,7 +6979,7 @@ afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) heal_local = heal_frame->local; heal_frame->local = frame->local; - ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode, index_vgfid, + ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode, &entry_selfheal, &data_selfheal, &metadata_selfheal, &pending); @@ -6687,7 +7454,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque) ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0, locked_on); { 
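/* ret holds the number of children on which the data-domain inodelk was
 * acquired; the threshold below is now the full child count rather than
 * the old AFR_SH_MIN_PARTICIPANTS minimum. */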
- if (ret < AFR_SH_MIN_PARTICIPANTS) + if (ret < priv->child_count) goto data_unlock; ret = __afr_selfheal_data_prepare( heal_frame, this, inode, locked_on, sources, sinks, @@ -6704,7 +7471,7 @@ afr_fav_child_reset_sink_xattrs(void *opaque) ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, LLONG_MAX - 1, 0, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) + if (ret < priv->child_count) goto mdata_unlock; ret = __afr_selfheal_metadata_prepare( heal_frame, this, inode, locked_on, sources, sinks, @@ -7036,16 +7803,16 @@ afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) return _gf_false; } -gf_boolean_t +static gf_boolean_t afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame) { afr_local_t *local = NULL; - local = frame->local; - if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) return _gf_false; + local = frame->local; + if (local->op != GF_FOP_LOOKUP) /* TODO:If the replica count is being increased on a plain distribute * volume that was never mounted, we need to allow setxattr on '/' with @@ -7062,15 +7829,11 @@ afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame) } gf_boolean_t -afr_lookup_has_quorum(call_frame_t *frame, xlator_t *this, - unsigned char *subvols) +afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count) { - afr_private_t *priv = this->private; - - if (frame && afr_is_add_replica_mount_lookup_on_root(frame)) { - if (AFR_COUNT(subvols, priv->child_count) > 0) - return _gf_true; - } + if (frame && (up_children_count > 0) && + afr_is_add_replica_mount_lookup_on_root(frame)) + return _gf_true; return _gf_false; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index f9fd48893ff..f8bf8340dab 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -53,11 +53,10 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (!local->xdata_rsp && xdata) local->xdata_rsp = dict_ref(xdata); } + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) { afr_handle_replies_quorum(frame, this); AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno, @@ -68,7 +67,8 @@ afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } int -afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -164,8 +164,8 @@ afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) } static void -afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol, - gf_dirent_t *entries, fd_t *fd) +afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) { int ret = -1; gf_dirent_t *entry = NULL; @@ -183,8 +183,8 @@ afr_readdir_transform_entries(gf_dirent_t *subvol_entries, int subvol, list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list) { - if (__is_root_gfid(fd->inode->gfid) && - !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) { + if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name, + frame->root->pid)) { continue; } @@ -228,8 +228,8 @@ afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, } if (op_ret >= 0) - afr_readdir_transform_entries(subvol_entries, (long)cookie, &entries, - local->fd); + afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, + 
&entries, local->fd); AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 82a72fddd67..b7cceb79158 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -119,11 +119,11 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this) continue; if (local->replies[i].op_ret < 0) { if (local->inode) - afr_inode_event_gen_reset(local->inode, this); + afr_inode_need_refresh_set(local->inode, this); if (local->parent) - afr_inode_event_gen_reset(local->parent, this); + afr_inode_need_refresh_set(local->parent, this); if (local->parent2) - afr_inode_event_gen_reset(local->parent2, this); + afr_inode_need_refresh_set(local->parent2, this); continue; } @@ -229,9 +229,9 @@ __afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf, preparent, postparent, preparent2, postparent2, xdata); + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); if (call_count == 0) { __afr_dir_write_finalize(frame, this); @@ -345,6 +345,7 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; int pre_op_count = 0; int failed_count = 0; + unsigned char *success_replies = NULL; local = frame->local; priv = this->private; @@ -360,9 +361,16 @@ afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) failed_count = AFR_COUNT(local->transaction.failed_subvols, priv->child_count); + /* FOP succeeded on all bricks. */ if (pre_op_count == priv->child_count && !failed_count) return; + /* FOP did not suceed on quorum no. of bricks. */ + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + if (!afr_has_quorum(success_replies, this, NULL)) + return; + if (priv->thin_arbiter_count) { /*Mark new entry using ta file*/ local->is_new_entry = _gf_true; diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 9204add5b15..c5521704de2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -302,6 +302,7 @@ afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) afr_local_t *local = NULL; int op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -945,24 +946,13 @@ unlock: goto unwind; } - len = dict_serialized_length(local->dict); - if (len <= 0) { - goto unwind; - } - - lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char); - if (!lockinfo_buf) { + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { local->op_ret = -1; - local->op_errno = ENOMEM; goto unwind; } - op_ret = dict_serialize(local->dict, lockinfo_buf); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - } - op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, (void *)lockinfo_buf, len); if (op_ret < 0) { @@ -1061,24 +1051,13 @@ unlock: goto unwind; } - len = dict_serialized_length(local->dict); - if (len <= 0) { - goto unwind; - } - - lockinfo_buf = GF_CALLOC(1, len, gf_common_mt_char); - if (!lockinfo_buf) { + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { local->op_ret = -1; - local->op_errno = ENOMEM; goto unwind; } - op_ret 
= dict_serialize(local->dict, lockinfo_buf); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - } - op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, (void *)lockinfo_buf, len); if (op_ret < 0) { @@ -1720,6 +1699,7 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, int32_t op_errno = 0; fop_fgetxattr_cbk_t cbk = NULL; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1813,6 +1793,7 @@ afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, afr_local_t *local = NULL; int32_t op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -1888,6 +1869,7 @@ afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, afr_local_t *local = NULL; int32_t op_errno = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index 1627ee2c426..8c982bc7e6f 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -38,5 +38,8 @@ afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata); int +afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata); +int afr_handle_quota_size(call_frame_t *frame, xlator_t *this); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 8e9de3e12a6..1d6e4f3570a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -174,11 +174,10 @@ __afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, { __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf, postbuf, xattr, xdata); + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) { __afr_inode_write_finalize(frame, this); @@ -492,6 +491,7 @@ afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int op_errno = ENOMEM; int ret = -1; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); local = AFR_FRAME_INIT(frame, op_errno); if (!local) goto out; @@ -731,6 +731,7 @@ afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -941,6 +942,7 @@ afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1057,11 +1059,10 @@ afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie, if (ret) goto out; - gf_msg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, - op_ret ? op_errno : 0, afr_get_msg_id(op_type), - "Set of pending xattr %s on" - " %s.", - op_ret ? "failed" : "succeeded", priv->children[i]->name); + gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, + op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s", + priv->children[i]->name, "op_ret=%s", + op_ret ? 
"failed" : "succeeded", NULL); out: syncbarrier_wake(&local->barrier); @@ -1155,9 +1156,8 @@ _afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc, } if (!count) { - gf_msg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS, - "Couldn't acquire lock on" - " any child."); + gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS, + NULL); ret = -EAGAIN; goto unlock; } @@ -1235,8 +1235,8 @@ _afr_handle_empty_brick(void *opaque) loc_copy(&local->loc, &data->loc); - gf_msg(this->name, GF_LOG_INFO, 0, 0, "New brick is : %s", - priv->children[empty_index]->name); + gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s", + priv->children[empty_index]->name, NULL); ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, AFR_METADATA_TRANSACTION, op_type, @@ -1311,9 +1311,8 @@ afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc, */ ret = afr_inode_split_brain_choice_set(loc->inode, this, -1); if (ret) - gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to set" - "split-brain choice to -1"); + gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + NULL); afr_heal_splitbrain_file(frame, this, loc); ret = 0; out: @@ -1336,8 +1335,8 @@ afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len) spb_child_index = afr_get_child_index_from_name(this, spb_child_str); if (spb_child_index < 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, - "Invalid subvol: %s", spb_child_str); + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "subvol=%s", spb_child_str, NULL); } return spb_child_index; } @@ -1359,11 +1358,9 @@ afr_can_set_split_brain_choice(void *opaque) &data->m_spb); if (ret) - gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to determine if %s" - " is in split-brain. " - "Aborting split-brain-choice set.", - uuid_utoa(loc->gfid)); + gf_smsg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s", + uuid_utoa(loc->gfid), NULL); return ret; } @@ -1425,12 +1422,8 @@ afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc, ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice, afr_set_split_brain_choice, NULL, data); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to create" - " synctask. 
Aborting split-brain choice set" - " for %s", - loc->name); + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + "name=%s", loc->name, NULL); ret = 1; op_errno = ENOMEM; goto out; @@ -1505,8 +1498,8 @@ afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc, goto out; if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) { - gf_msg(this->name, GF_LOG_ERROR, EPERM, afr_get_msg_id(op_type), - "'%s' is an internal extended attribute.", op_type); + gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR, + "op_type=%s", op_type, NULL); ret = 1; goto out; } @@ -1532,8 +1525,8 @@ afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc, ret = synctask_new(this->ctx->env, _afr_handle_empty_brick, _afr_handle_empty_brick_cbk, NULL, data); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, afr_get_msg_id(op_type), - "Failed to create synctask."); + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + NULL); ret = 1; op_errno = ENOMEM; afr_brick_args_cleanup(data); @@ -1691,6 +1684,7 @@ afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1899,6 +1893,7 @@ afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -1999,6 +1994,7 @@ afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2108,6 +2104,7 @@ afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2214,6 +2211,7 @@ afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2413,6 +2411,7 @@ afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, int ret = -1; int op_errno = ENOMEM; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2507,7 +2506,9 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, call_frame_t *transaction_frame = NULL; int ret = -1; int32_t op_errno = ENOMEM; + int8_t last_fsync = 0; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); transaction_frame = copy_frame(frame); if (!transaction_frame) goto out; @@ -2516,10 +2517,16 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, if (!local) goto out; - if (xdata) + if (xdata) { local->xdata_req = dict_copy_with_ref(xdata, NULL); - else + if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) { + if (last_fsync) { + local->transaction.disable_delayed_post_op = _gf_true; + } + } + } else { local->xdata_req = dict_new(); + } if (!local->xdata_req) goto out; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 
b0fb00641a0..816065fb57a 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -31,6 +31,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_empty_brick_t, gf_afr_mt_child_latency_t, gf_afr_mt_atomic_t, + gf_afr_mt_lk_heal_info_t, + gf_afr_mt_gf_lock, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index c9c99270e98..e73fd997765 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -23,25 +23,145 @@ * glfs-message-id.h. */ -GLFS_MSGID(AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, - AFR_MSG_QUORUM_OVERRIDE, AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, - AFR_MSG_SUBVOLS_DOWN, AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, - AFR_MSG_OPEN_FAIL, AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, - AFR_MSG_GFID_NULL, AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED, - AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS, - AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED, - AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO, - AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED, - AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, - AFR_MSG_SELF_HEAL_INFO, AFR_MSG_READ_SUBVOL_ERROR, - AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, - AFR_MSG_INVALID_DATA, AFR_MSG_INVALID_ARG, - AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED, - AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED, - AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, - AFR_MSG_NO_CHANGELOG, AFR_MSG_TIMER_CREATE_FAIL, - AFR_MSG_SBRAIN_FAV_CHILD_POLICY, AFR_MSG_INODE_CTX_GET_FAILED, - AFR_MSG_THIN_ARB); +GLFS_MSGID( + AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE, + AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN, + AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL, + AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL, + AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED, + AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS, + AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED, + AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED, + AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO, + AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA, + AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED, + AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED, + AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG, + AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB, + AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED, + AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET, + AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START, + AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT, + AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED, + AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG, + AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED, + AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH, + AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED, + AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA, + SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE, + 
SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED, + AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED, + AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED, + AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED, + AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN, + AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN, + AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP, + AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR, + AFR_MSG_INTERNAL_ATTR); +#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed" +#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed" +#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed" +#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed" +#define AFR_MSG_INVALID_ARG_STR "Invalid argument" +#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on " +#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file" +#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict." +#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed." +#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \ + "Failed to populate loc for thin-arbiter" +#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending" +#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping." +#define AFR_MSG_UNKNOWN_SET_STR "Unknown set" +#define AFR_MSG_NO_XL_ID_STR "xl does not have id" +#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on" +#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on" +#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter." +#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output" +#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time" +#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict" +#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \ + "No majority to resolve gfid split brain" +#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch detected" +#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal" +#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch" +#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \ + "Size policy is not applicable to directories." 
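/* The *_STR defines in this hunk pair a fixed message string with each id
 * declared in the GLFS_MSGID() list above; converted call sites then use
 * gf_smsg() and pass only the variable fields as NULL-terminated
 * "key=value" pairs. A minimal sketch of that calling convention, modelled
 * on the gf_smsg() conversions elsewhere in this patch (the wrapper
 * function below is hypothetical, not part of the change):
 */
static void
afr_log_invalid_subvol_example(xlator_t *this, const char *subvol_name)
{
    /* Logs AFR_MSG_INVALID_SUBVOL_STR ("not a subvolume") together with
     * the structured field subvol=<subvol_name>. */
    gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, "subvol=%s",
            subvol_name, NULL);
}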
+#define AFR_MSG_NO_CHILD_SELECTED_STR \ + "No child selected by favorite-child policy" +#define AFR_MSG_INVALID_CHILD_STR "Invalid child" +#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \ + "selected as authentic to resolve conflicting data" +#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick" +#define SNO_DIFF_IN_MTIME_STR "No difference in mtime" +#define SNO_BIGGER_FILE_STR "No bigger file" +#define SALL_BRICKS_UP_TO_RESOLVE_STR \ + "All the bricks should be up to resolve the gfid split brain" +#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock" +#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed" +#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame" +#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop" +#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed" +#define AFR_MSG_FSYNC_FAILED_STR "fsync failed" +#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met" +#define AFR_MSG_FOP_FAILED_STR "Failing Fop" +#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume" +#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling" +#define AFR_MSG_CHILD_MISCONFIGURED_STR \ + "replicate translator needs more than one subvolume defined" +#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads" +#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count" +#define AFR_MSG_UNABLE_TO_FETCH_STR \ + "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \ + "using client translator names" +#define AFR_MSG_NULL_DEREF_STR "possible NULL deref" +#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key" +#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask" +#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up" +#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \ + "Failed to cancel split-brain choice" +#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \ + "Cannot set replica. File is not in data/metadata split-brain" +#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx" +#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols" +#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child" +#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file" +#define AFR_MSG_TIMER_CREATE_FAIL_STR \ + "Cannot create timer for delayed initialization" +#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online" +#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \ + "All subvolumes are down. Going offline until at least one of them is up" +#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock" +#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume" +#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met" +#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir" +#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid" +#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative merge on the file" +#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain" +#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down" +#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask" +#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \ + "read subvolume in this generation is not up" +#define AFR_MSG_INTERNAL_LKS_FAILED_STR \ + "Unable to work with lk-owner while attempting fop" +#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \ + "subvolume does not support locking. please load features/locks xlator " \ + "on server." 
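/* The new ids in the GLFS_MSGID() list above (AFR_MSG_THIN_ARB_XATTROP_FAILED
 * through AFR_MSG_INTERNAL_ATTR) are appended at the tail because the macro
 * reserves one consecutive integer per name from the AFR component's id
 * range; inserting a name mid-list would renumber every id after it and
 * break MSGID correlation between log files from different releases. A rough
 * model of the expansion (the enum and numeric base below are illustrative
 * assumptions, not quoted from glfs-message-id.h):
 */
enum afr_msgid_model {
    AFR_MSG_QUORUM_FAIL_MODEL = 108001, /* assumed AFR component base + 1 */
    AFR_MSG_QUORUM_MET_MODEL,           /* base + 2 */
    AFR_MSG_QUORUM_OVERRIDE_MODEL,      /* base + 3, and so on down the list */
};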
+#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx" +#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting." +#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child." +#define AFR_MSG_NEW_BRICK_STR "New brick" +#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \ + "Failed to set split-brain choice to -1" +#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \ + "Failed to determine split-brain. Aborting split-brain-choice set" +#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume" +#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr" +#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute" #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 1747a079fa6..64856042b65 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -78,11 +78,10 @@ afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, if (!local->xdata_rsp && xdata) local->xdata_rsp = dict_ref(xdata); } + call_count = --local->call_count; } UNLOCK(&frame->lock); - call_count = afr_frame_return(frame); - if (call_count == 0) { afr_handle_replies_quorum(frame, this); if (local->op_ret == -1) { @@ -138,7 +137,7 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int spb_choice = 0; + int spb_subvol = 0; int event_generation = 0; int ret = 0; int32_t op_errno = 0; @@ -180,9 +179,9 @@ afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ret = afr_inode_get_readable(frame, local->inode, this, NULL, &event_generation, AFR_DATA_TRANSACTION); if ((ret < 0) && - (afr_inode_split_brain_choice_get(local->inode, this, &spb_choice) == - 0) && - spb_choice < 0) { + (afr_split_brain_read_subvol_get(local->inode, this, NULL, + &spb_subvol) == 0) && + spb_subvol < 0) { afr_inode_refresh(frame, this, local->inode, local->inode->gfid, afr_open_continue); } else { @@ -216,11 +215,9 @@ afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, "successfully on subvolume %s", local->loc.path, priv->children[child_index]->name); } else { - gf_msg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno, - AFR_MSG_OPEN_FAIL, - "Failed to open %s on " - "subvolume %s", - local->loc.path, priv->children[child_index]->name); + gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno, + AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s", + priv->children[child_index]->name, NULL); } fd_ctx = local->fd_ctx; diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 772b59f9a2f..6fc2c75145c 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -164,7 +164,7 @@ afr_ta_read_txn(void *opaque) xdata_rsp = NULL; /* It doesn't. So query thin-arbiter to see if it blames any data brick. 
*/ - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate thin-arbiter loc for: %s.", loc.name); @@ -272,7 +272,7 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) int read_subvol = -1; inode_t *inode = NULL; int ret = -1; - int spb_choice = -1; + int spb_subvol = -1; local = frame->local; inode = local->inode; @@ -303,9 +303,9 @@ afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) local->read_attempted[read_subvol] = 1; readfn: if (read_subvol == -1) { - ret = afr_inode_split_brain_choice_get(inode, this, &spb_choice); - if ((ret == 0) && spb_choice >= 0) - read_subvol = spb_choice; + ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol); + if ((ret == 0) && spb_subvol >= 0) + read_subvol = spb_subvol; } if (read_subvol == -1) { diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index fc00793322c..a580a1584cc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -140,7 +140,7 @@ heal: } } out: - if (gfid_idx && (*gfid_idx == -1) && (ret == 0)) { + if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) { ret = -afr_final_errno(local, priv); } loc_wipe(&loc); @@ -1575,7 +1575,6 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, char *accused = NULL; /* Accused others without any self-accusal */ char *pending = NULL; /* Have pending operations on others */ char *self_accused = NULL; /* Accused itself */ - int min_participants = -1; priv = this->private; @@ -1599,12 +1598,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, } } - if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) { - min_participants = priv->child_count; - } else { - min_participants = AFR_SH_MIN_PARTICIPANTS; - } - if (afr_success_count(replies, priv->child_count) < min_participants) { + if (afr_success_count(replies, priv->child_count) < priv->child_count) { /* Treat this just like locks not being acquired */ return -ENOTCONN; } @@ -1773,11 +1767,9 @@ afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, if (xdata) { local->replies[i].xdata = dict_ref(xdata); ret = dict_get_int8(xdata, "link-count", &need_heal); - local->replies[i].need_heal = need_heal; - } else { - local->replies[i].need_heal = need_heal; } + local->replies[i].need_heal = need_heal; syncbarrier_wake(&local->barrier); return 0; @@ -1913,17 +1905,16 @@ int afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, struct afr_reply *replies) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; dict_t *dict = NULL; - priv = frame->this->private; local = frame->local; - if (local && local->xattr_req) + + if (local->xattr_req) dict = local->xattr_req; return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies, - priv->child_up, dict); + local->child_up, dict); } unsigned int @@ -2759,3 +2750,185 @@ afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, out: return source; } + +static int +afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + 
local->replies[i].op_errno = op_errno; + if (op_ret == 0) { + local->op_ret = 0; + local->replies[i].poststat = *buf; + local->replies[i].preparent = *preparent; + local->replies[i].postparent = *postparent; + } + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + unsigned char *mkdir_on = alloca0(priv->child_count); + unsigned char *lookup_on = alloca0(priv->child_count); + loc_t loc = {0}; + int32_t op_errno = 0; + int32_t child_op_errno = 0; + struct iatt iatt = {0}; + dict_t *xdata = NULL; + uuid_t anon_inode_gfid = {0}; + int mkdir_count = 0; + int i = 0; + + /*Try to mkdir everywhere and return success if the dir exists on 'child' + */ + + if (!priv->use_anon_inode) { + op_errno = EINVAL; + goto out; + } + + frame = afr_frame_create(this, &op_errno); + if (op_errno) { + goto out; + } + local = frame->local; + if (!local->child_up[child]) { + /*Other bricks may need mkdir so don't error out yet*/ + child_op_errno = ENOTCONN; + } + gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + if (priv->anon_inode[i]) { + mkdir_on[i] = 0; + } else { + mkdir_on[i] = 1; + mkdir_count++; + } + } + + if (mkdir_count == 0) { + *linked_inode = inode_find(this->itable, anon_inode_gfid); + if (*linked_inode) { + op_errno = 0; + goto out; + } + } + + loc.parent = inode_ref(this->itable->root); + loc.name = priv->anon_inode_name; + loc.inode = inode_new(this->itable); + if (!loc.inode) { + op_errno = ENOMEM; + goto out; + } + + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto out; + } + + op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true); + if (op_errno) { + goto out; + } + + if (mkdir_count == 0) { + memcpy(lookup_on, local->child_up, priv->child_count); + goto lookup; + } + + AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0, + xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!mkdir_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno == EEXIST) { + lookup_on[i] = 1; + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } + + if (AFR_COUNT(lookup_on, priv->child_count) == 0) { + goto link; + } + +lookup: + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xdata); + for (i = 0; i < priv->child_count; i++) { + if (!lookup_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + if (gf_uuid_compare(anon_inode_gfid, + local->replies[i].poststat.ia_gfid) == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else { + if (i == child) + child_op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA, + "%s has gfid: %s", priv->anon_inode_name, + uuid_utoa(local->replies[i].poststat.ia_gfid)); + } + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } +link: + if (!gf_uuid_is_null(iatt.ia_gfid)) { + *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt); + if (*linked_inode) { + op_errno = 0; + inode_lookup(*linked_inode); + } else { + op_errno = ENOMEM; + } + goto out; + } + +out: + if (xdata) + dict_unref(xdata); + 
loc_wipe(&loc); + /*child_op_errno takes precedence*/ + if (child_op_errno == 0) { + child_op_errno = op_errno; + } + + if (child_op_errno && *linked_inode) { + inode_unref(*linked_inode); + *linked_inode = NULL; + } + if (frame) + AFR_STACK_DESTROY(frame); + return -child_op_errno; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index cdff4a57674..37bcc2b3f9e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -225,24 +225,40 @@ __afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd, return ret; } +static gf_boolean_t +afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source, + unsigned char *healed_sinks) +{ + afr_private_t *priv = this->private; + int i = 0; + + if (!locked_on[source]) + return _gf_false; + + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && locked_on[i]) + return _gf_true; + } + + return _gf_false; +} + static int afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, unsigned char *healed_sinks, off_t offset, size_t size, int type, struct afr_reply *replies) { int ret = -1; - int sink_count = 0; afr_private_t *priv = NULL; unsigned char *data_lock = NULL; priv = this->private; - sink_count = AFR_COUNT(healed_sinks, priv->child_count); data_lock = alloca0(priv->child_count); ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size, data_lock); { - if (ret < sink_count) { + if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) { ret = -ENOTCONN; goto unlock; } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 3ce882e8c4a..64893f441e3 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -16,54 +16,170 @@ #include <glusterfs/syncop-utils.h> #include <glusterfs/events.h> -static int -afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, - inode_t *inode, int child, struct afr_reply *replies) +int +afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, + struct afr_reply *replies, + gf_boolean_t *anon_inode) { afr_private_t *priv = NULL; + afr_local_t *local = NULL; xlator_t *subvol = NULL; int ret = 0; + int i = 0; + char g[64] = {0}; + unsigned char *lookup_success = NULL; + call_frame_t *frame = NULL; + loc_t loc2 = { + 0, + }; loc_t loc = { 0, }; - char g[64]; priv = this->private; - subvol = priv->children[child]; + lookup_success = alloca0(priv->child_count); + uuid_utoa_r(replies[child].poststat.ia_gfid, g); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + + if (replies[child].poststat.ia_type == IA_IFDIR) { + /* This directory may have sub-directory hierarchy which may need to + * be preserved for subsequent heals. 
So unconditionally move the + * directory to anonymous-inode directory*/ + *anon_inode = _gf_true; + goto anon_inode; + } + + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid); + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + lookup_success[i] = 1; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + ret = -local->replies[i].op_errno; + } + } + + if (priv->quorum_count) { + if (afr_has_quorum(lookup_success, this, NULL)) { + *anon_inode = _gf_true; + } + } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) { + *anon_inode = _gf_true; + } else if (ret) { + goto out; + } + +anon_inode: + if (!*anon_inode) { + ret = 0; + goto out; + } loc.parent = inode_ref(dir); gf_uuid_copy(loc.pargfid, dir->gfid); loc.name = name; - loc.inode = inode_ref(inode); - if (replies[child].valid && replies[child].op_ret == 0) { - switch (replies[child].poststat.ia_type) { - case IA_IFDIR: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), - name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), - subvol->name); - ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); - break; - default: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), - name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), - subvol->name); - ret = syncop_unlink(subvol, &loc, NULL, NULL); - break; - } + ret = afr_anon_inode_create(this, child, &loc2.parent); + if (ret < 0) + goto out; + + loc2.name = g; + ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s failed", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); + } else { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "Rename to %s dir %s/%s (%s) on %s successful", + priv->anon_inode_name, uuid_utoa(dir->gfid), name, g, + subvol->name); } +out: loc_wipe(&loc); + loc_wipe(&loc2); + if (frame) { + AFR_STACK_DESTROY(frame); + } return ret; } int +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies) +{ + char g[64] = {0}; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = { + 0, + }; + gf_boolean_t anon_inode = _gf_false; + + priv = this->private; + subvol = priv->children[child]; + + if ((!replies[child].valid) || (replies[child].op_ret < 0)) { + /*Nothing to do*/ + ret = 0; + goto out; + } + + if (priv->use_anon_inode) { + ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child, + replies, &anon_inode); + if (ret < 0 || anon_inode) + goto out; + } + + loc.parent = inode_ref(dir); + loc.inode = inode_new(inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + loc.name = name; + switch (replies[child].poststat.ia_type) { + case IA_IFDIR: + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name, + uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL); + break; + default: + gf_msg(this->name, GF_LOG_WARNING, 0, 
AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid), + name, uuid_utoa_r(replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink(subvol, &loc, NULL, NULL); + break; + } + +out: + loc_wipe(&loc); + return ret; +} + +int afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, unsigned char *sources, inode_t *dir, const char *name, inode_t *inode, @@ -76,6 +192,9 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, loc_t srcloc = { 0, }; + loc_t anonloc = { + 0, + }; xlator_t *this = frame->this; afr_private_t *priv = NULL; dict_t *xdata = NULL; @@ -86,15 +205,18 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, 0, }; unsigned char *newentry = NULL; + char iatt_uuid_str[64] = {0}; + char dir_uuid_str[64] = {0}; priv = this->private; iatt = &replies[source].poststat; + uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str); if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) { gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED, "Invalid ia_type (%d) or gfid(%s). source brick=%d, " "pargfid=%s, name=%s", - iatt->ia_type, uuid_utoa(iatt->ia_gfid), source, - uuid_utoa(dir->gfid), name); + iatt->ia_type, iatt_uuid_str, source, + uuid_utoa_r(dir->gfid, dir_uuid_str), name); ret = -EINVAL; goto out; } @@ -119,14 +241,24 @@ afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source, srcloc.inode = inode_ref(inode); gf_uuid_copy(srcloc.gfid, iatt->ia_gfid); - if (iatt->ia_type != IA_IFDIR) - ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); - if (iatt->ia_type == IA_IFDIR || ret == -ENOENT || ret == -ESTALE) { + ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == -ENOENT || ret == -ESTALE) { newentry[dst] = 1; ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies, sources, newentry); if (ret) goto out; + } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) { + // Try rename from hidden directory + ret = afr_anon_inode_create(this, dst, &anonloc.parent); + if (ret < 0) + goto out; + anonloc.inode = inode_ref(inode); + anonloc.name = iatt_uuid_str; + ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL); + if (ret == -ENOENT || ret == -ESTALE) + ret = -1; /*This sets 'mismatch' to true*/ + goto out; } mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type); @@ -165,6 +297,7 @@ out: GF_FREE(linkname); loc_wipe(&loc); loc_wipe(&srcloc); + loc_wipe(&anonloc); return ret; } @@ -577,6 +710,11 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; + if (afr_is_private_directory(priv, fd->inode->gfid, name, + GF_CLIENT_PID_SELF_HEALD)) { + return 0; + } + xattr = dict_new(); if (!xattr) return -ENOMEM; @@ -597,7 +735,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { gf_msg_debug(this->name, 0, "%s: Skipping " "entry self-heal as only %d sub-volumes " @@ -625,7 +763,7 @@ afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd, replies); if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) { - ret = afr_shd_index_purge(subvol, parent_idx_inode, name, + ret = afr_shd_entry_purge(subvol, parent_idx_inode, name, inode->ia_type); /* Why is ret force-set to 0? 
We do not care about * index purge failing for full heal as it is quite @@ -755,10 +893,6 @@ afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd, if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) continue; - if (__is_root_gfid(fd->inode->gfid) && - !strcmp(entry->d_name, GF_REPLICATE_TRASH_DIR)) - continue; - ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name, loc.inode, subvol, local->need_full_crawl); @@ -821,7 +955,7 @@ afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry, /* The name indices under the pgfid index dir are guaranteed * to be regular files. Hence the hardcoding. */ - afr_shd_index_purge(subvol, parent->inode, entry->d_name, IA_IFREG); + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG); ret = 0; goto out; } @@ -991,7 +1125,7 @@ __afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd, ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL, data_lock); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { gf_msg_debug(this->name, 0, "%s: Skipping " "entry self-heal as only %d sub-volumes could " @@ -1115,7 +1249,7 @@ afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode) ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain, NULL, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { gf_msg_debug(this->name, 0, "%s: Skipping " "entry self-heal as only %d sub-volumes could " diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index ecfa791b8cb..03f43bad16e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -190,6 +190,59 @@ out: return ret; } +static int +__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, + struct afr_reply *replies, + unsigned char *sources) +{ + int ret = 0; + int i = 0; + int m_idx = 0; + afr_private_t *priv = NULL; + int raw[AFR_NUM_CHANGE_LOGS] = {0}; + dict_t *xattr = NULL; + + priv = this->private; + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + raw[m_idx] = 1; + + xattr = dict_new(); + if (!xattr) + return -ENOMEM; + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) + continue; + ret = dict_set_static_bin(xattr, priv->pending_key[i], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + ret = -1; + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO, + "Failed to set pending metadata xattr on child %d for %s", i, + uuid_utoa(inode->gfid)); + goto out; + } + } + + afr_replies_wipe(replies, priv->child_count); + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + +out: + if (xattr) + dict_unref(xattr); + return ret; +} + /* * Look for mismatching uid/gid or mode or user xattrs even if * AFR xattrs don't say so, and pick one arbitrarily as winner. 
*/ @@ -210,6 +263,7 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, }; int source = -1; int sources_count = 0; + int ret = 0; priv = this->private; @@ -300,7 +354,13 @@ __afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this, healed_sinks[i] = 1; } } - + if ((sources_count == priv->child_count) && (source > -1) && + (AFR_COUNT(healed_sinks, priv->child_count) != 0)) { + ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode, + replies, sources); + if (ret < 0) + return ret; + } out: afr_mark_active_sinks(this, sources, locked_on, healed_sinks); return source; @@ -421,12 +481,8 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode) if (ret) goto unlock; - /* Restore atime/mtime for files that don't need data heal as - * restoring timestamps happens only as a part of data-heal. - */ - if (!IA_ISREG(locked_replies[source].poststat.ia_type)) - afr_selfheal_restore_time(frame, this, inode, source, healed_sinks, - locked_replies); + afr_selfheal_restore_time(frame, this, inode, source, healed_sinks, + locked_replies); ret = afr_selfheal_undo_pending( frame, this, inode, sources, sinks, healed_sinks, undid_pending, diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c index 36640b5456b..834aac86d48 100644 --- a/xlators/cluster/afr/src/afr-self-heal-name.c +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -98,21 +98,12 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, const char *bname, inode_t *inode, struct afr_reply *replies) { - loc_t loc = { - 0, - }; int i = 0; afr_private_t *priv = NULL; - char g[64]; int ret = 0; priv = this->private; - loc.parent = inode_ref(parent); - gf_uuid_copy(loc.pargfid, pargfid); - loc.name = bname; - loc.inode = inode_ref(inode); - for (i = 0; i < priv->child_count; i++) { if (!replies[i].valid) continue; @@ -120,30 +111,10 @@ __afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid, if (replies[i].op_ret) continue; - switch (replies[i].poststat.ia_type) { - case IA_IFDIR: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging dir %s/%s (%s) on %s", uuid_utoa(pargfid), - bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g), - priv->children[i]->name); - - ret |= syncop_rmdir(priv->children[i], &loc, 1, NULL, NULL); - break; - default: - gf_msg(this->name, GF_LOG_WARNING, 0, - AFR_MSG_EXPUNGING_FILE_OR_DIR, - "expunging file %s/%s (%s) on %s", uuid_utoa(pargfid), - bname, uuid_utoa_r(replies[i].poststat.ia_gfid, g), - priv->children[i]->name); - - ret |= syncop_unlink(priv->children[i], &loc, NULL, NULL); - break; - } + ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i, + replies); } - loc_wipe(&loc); - return ret; } @@ -381,7 +352,7 @@ __afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode, replies, gfid, locked_on, source, sources, is_gfid_absent, &gfid_idx); - if (ret) + if (ret || (gfid_idx < 0)) return ret; ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname, @@ -514,7 +485,7 @@ afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent, ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname, locked_on); { - if (ret < AFR_SH_MIN_PARTICIPANTS) { + if (ret < priv->child_count) { ret = -ENOTCONN; goto unlock; } @@ -560,13 +531,15 @@ afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t 
*this, struct afr_reply *replies = NULL; inode_t *inode = NULL; int first_idx = -1; + afr_local_t *local = NULL; priv = this->private; + local = frame->local; replies = alloca0(sizeof(*replies) * priv->child_count); inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies, - priv->child_up, NULL); + local->child_up, NULL); if (!inode) return -ENOMEM; diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index e0dfd7bdfb2..48e6dbcfb18 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -11,8 +11,6 @@ #ifndef _AFR_SELFHEAL_H #define _AFR_SELFHEAL_H -#define AFR_SH_MIN_PARTICIPANTS 2 - /* Perform fop on all UP subvolumes and wait for all callbacks to return */ #define AFR_ONALL(frame, rfn, fop, args...) \ @@ -20,9 +18,8 @@ afr_local_t *__local = frame->local; \ afr_private_t *__priv = frame->this->private; \ int __i = 0, __count = 0; \ - unsigned char *__child_up = NULL; \ + unsigned char *__child_up = alloca(__priv->child_count); \ \ - __child_up = alloca0(__priv->child_count); \ memcpy(__child_up, __priv->child_up, \ sizeof(*__child_up) * __priv->child_count); \ __count = AFR_COUNT(__child_up, __priv->child_count); \ @@ -48,13 +45,16 @@ afr_local_t *__local = frame->local; \ afr_private_t *__priv = frame->this->private; \ int __i = 0; \ - int __count = AFR_COUNT(list, __priv->child_count); \ + int __count = 0; \ + unsigned char *__list = alloca(__priv->child_count); \ \ + memcpy(__list, list, sizeof(*__list) * __priv->child_count); \ + __count = AFR_COUNT(__list, __priv->child_count); \ __local->barrier.waitfor = __count; \ afr_local_replies_wipe(__local, __priv); \ \ for (__i = 0; __i < __priv->child_count; __i++) { \ - if (!list[__i]) \ + if (!__list[__i]) \ continue; \ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \ __priv->children[__i], \ @@ -83,9 +83,9 @@ #define ALLOC_MATRIX(n, type) \ ({ \ - type **__ptr = NULL; \ int __i; \ - __ptr = alloca0(n * sizeof(type *)); \ + type **__ptr = alloca(n * sizeof(type *)); \ + \ for (__i = 0; __i < n; __i++) \ __ptr[__i] = alloca0(n * sizeof(type)); \ __ptr; \ @@ -369,4 +369,9 @@ gf_boolean_t afr_is_file_empty_on_all_children(afr_private_t *priv, struct afr_reply *replies); +int +afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name, + inode_t *inode, int child, struct afr_reply *replies); +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode); #endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 8b4d2dc9cd1..109fd4b7421 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -94,7 +94,7 @@ __afr_shd_healer_wait(struct subvol_healer *healer) priv = healer->this->private; disabled_loop: - wait_till.tv_sec = time(NULL) + priv->shd.timeout; + wait_till.tv_sec = gf_time() + priv->shd.timeout; while (!healer->rerun) { ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till); @@ -222,7 +222,7 @@ out: } int -afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name, +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, ia_type_t type) { int ret = 0; @@ -371,7 +371,7 @@ afr_shd_sweep_prepare(struct subvol_healer *healer) event->split_brain_count = 0; event->heal_failed_count = 0; - time(&event->start_time); + event->start_time = gf_time(); event->end_time = 0; _mask_cancellation(); } @@ -386,7 +386,7 @@ afr_shd_sweep_done(struct subvol_healer 
*healer) event = &healer->crawl_event; shd = &(((afr_private_t *)healer->this->private)->shd); - time(&event->end_time); + event->end_time = gf_time(); history = gf_memdup(event, sizeof(*event)); event->start_time = 0; @@ -424,7 +424,7 @@ afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, ret = afr_shd_selfheal(healer, healer->subvol, gfid); if (ret == -ENOENT || ret == -ESTALE) - afr_shd_index_purge(subvol, parent->inode, entry->d_name, val); + afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val); if (ret == 2) /* If bricks crashed in pre-op after creating indices/xattrop @@ -843,6 +843,176 @@ out: return need_heal; } +static int +afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent, + void *data) +{ + struct subvol_healer *healer = data; + afr_private_t *priv = healer->this->private; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = 0; + loc_t loc = {0}; + int count = 0; + int i = 0; + int op_errno = 0; + struct iatt *iatt = NULL; + gf_boolean_t multiple_links = _gf_false; + unsigned char *gfid_present = alloca0(priv->child_count); + unsigned char *entry_present = alloca0(priv->child_count); + char *type = "file"; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + local = frame->local; + if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { + gf_msg_debug(healer->this->name, 0, + "Not all bricks are up. Skipping " + "cleanup of %s on %s", + entry->d_name, subvol->name); + ret = 0; + goto out; + } + + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + ret = gf_uuid_parse(entry->d_name, loc.gfid); + if (ret) { + ret = 0; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc, + NULL); + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + gfid_present[i] = 1; + iatt = &local->replies[i].poststat; + if (iatt->ia_type == IA_IFDIR) { + type = "dir"; + } + + if (i == healer->subvol) { + if (local->replies[i].poststat.ia_nlink > 1) { + multiple_links = _gf_true; + } + } + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have complete view. Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + /*Inode is deleted from subvol*/ + if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) { + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type, + priv->anon_inode_name, entry->d_name, subvol->name); + ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name, + iatt->ia_type); + if (ret == -ENOENT || ret == -ESTALE) + ret = 0; + } else if (count > 1) { + loc_wipe(&loc); + loc.parent = inode_ref(parent->inode); + loc.name = entry->d_name; + loc.inode = inode_new(parent->inode->table); + if (!loc.inode) { + ret = -ENOMEM; + goto out; + } + AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, + &loc, NULL); + count = 0; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == 0) { + count++; + entry_present[i] = 1; + iatt = &local->replies[i].poststat; + } else if (local->replies[i].op_errno != ENOENT && + local->replies[i].op_errno != ESTALE) { + /*We don't have complete view. 
Skip the entry*/ + gf_msg_debug(healer->this->name, local->replies[i].op_errno, + "Skipping cleanup of %s on %s", entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + for (i = 0; i < priv->child_count; i++) { + if (gfid_present[i] && !entry_present[i]) { + /*Entry is not anonymous on at least one subvol*/ + gf_msg_debug(healer->this->name, 0, + "Valid entry present on %s " + "Skipping cleanup of %s on %s", + priv->children[i]->name, entry->d_name, + subvol->name); + ret = 0; + goto out; + } + } + + gf_msg(healer->this->name, GF_LOG_WARNING, 0, + AFR_MSG_EXPUNGING_FILE_OR_DIR, + "expunging %s %s/%s on all subvols", type, priv->anon_inode_name, + entry->d_name); + ret = 0; + for (i = 0; i < priv->child_count; i++) { + op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent, + entry->d_name, iatt->ia_type); + if (op_errno != ENOENT && op_errno != ESTALE) { + ret |= -op_errno; + } + } + } + +out: + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return ret; +} + +static void +afr_cleanup_anon_inode_dir(struct subvol_healer *healer) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_private_t *priv = healer->this->private; + loc_t loc = {0}; + + ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode); + if (ret) + goto out; + + frame = afr_frame_create(healer->this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc, + GF_CLIENT_PID_SELF_HEALD, healer, + afr_shd_anon_inode_cleaner, NULL, + priv->shd.max_threads, priv->shd.wait_qlength); +out: + if (frame) + AFR_STACK_DESTROY(frame); + loc_wipe(&loc); + return; +} + void * afr_shd_index_healer(void *data) { @@ -900,6 +1070,10 @@ afr_shd_index_healer(void *data) sleep(1); } while (ret > 0); + if (ret == 0) { + afr_cleanup_anon_inode_dir(healer); + } + if (ret == 0 && pre_crawl_xdata && !healer->crawl_event.heal_failed_count) { afr_shd_ta_check_and_unset_xattrs(this, &loc, healer, @@ -1024,7 +1198,7 @@ afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output, { int ret = 0; uint64_t count = 0; - char key[256] = {0}; + char key[128] = {0}; int keylen = 0; char suffix[64] = {0}; int xl_id = 0; @@ -1149,9 +1323,9 @@ afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path, { int ret = -1; uint64_t count = 0; - char key[256] = {0}; + char key[64] = {0}; int keylen = 0; - char xl_id_child_str[64] = {0}; + char xl_id_child_str[32] = {0}; int xl_id = 0; ret = dict_get_int32(output, this->name, &xl_id); @@ -1374,19 +1548,40 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) int op_ret = 0; uint64_t cnt = 0; +#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \ + dict_str_len) \ + { \ + int ret; \ + \ + ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \ + if (ret) { \ + gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \ + "key=%s", key, "value=%s", dict_str, NULL); \ + } \ + } + priv = this->private; shd = &priv->shd; ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op); - if (ret) + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "key=xl-op", NULL); goto out; + } this_name_len = strlen(this->name); ret = dict_get_int32n(input, this->name, this_name_len, &xl_id); - if (ret) + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "key=%s", this->name, NULL); goto out; + } ret = dict_set_int32n(output, this->name, this_name_len, xl_id); - if (ret) + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, 
AFR_MSG_DICT_SET_FAILED, + "key=%s", this->name, NULL); goto out; + } switch (op) { case GF_SHD_OP_HEAL_INDEX: op_ret = 0; @@ -1396,23 +1591,30 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); if (!priv->child_up[i]) { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SBRICK_NOT_CONNECTED, SLEN(SBRICK_NOT_CONNECTED)); op_ret = -1; } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SLESS_THAN2_BRICKS_in_REP, SLEN(SLESS_THAN2_BRICKS_in_REP)); op_ret = -1; } else if (!afr_shd_is_subvol_local(this, healer->subvol)) { - ret = dict_set_nstrn(output, key, keylen, SBRICK_IS_REMOTE, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_IS_REMOTE, SLEN(SBRICK_IS_REMOTE)); } else { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SSTARTED_SELF_HEAL, SLEN(SSTARTED_SELF_HEAL)); - afr_shd_index_healer_spawn(this, i); + + ret = afr_shd_index_healer_spawn(this, i); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_HEALER_SPAWN_FAILED, NULL); + } } } break; @@ -1424,21 +1626,28 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); if (!priv->child_up[i]) { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SBRICK_NOT_CONNECTED, SLEN(SBRICK_NOT_CONNECTED)); } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SLESS_THAN2_BRICKS_in_REP, SLEN(SLESS_THAN2_BRICKS_in_REP)); } else if (!afr_shd_is_subvol_local(this, healer->subvol)) { - ret = dict_set_nstrn(output, key, keylen, SBRICK_IS_REMOTE, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, + SBRICK_IS_REMOTE, SLEN(SBRICK_IS_REMOTE)); } else { - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SSTARTED_SELF_HEAL, SLEN(SSTARTED_SELF_HEAL)); - afr_shd_full_healer_spawn(this, i); + + ret = afr_shd_full_healer_spawn(this, i); + + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_HEALER_SPAWN_FAILED, NULL); + } op_ret = 0; } } @@ -1446,24 +1655,25 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) case GF_SHD_OP_INDEX_SUMMARY: /* this case has been handled in glfs-heal.c */ break; - case GF_SHD_OP_HEALED_FILES: - case GF_SHD_OP_HEAL_FAILED_FILES: - for (i = 0; i < priv->child_count; i++) { - keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); - ret = dict_set_nstrn(output, key, keylen, SOP_NOT_SUPPORTED, - SLEN(SOP_NOT_SUPPORTED)); - } - break; case GF_SHD_OP_SPLIT_BRAIN_FILES: eh_dump(shd->split_brain, output, afr_add_shd_event); break; case GF_SHD_OP_STATISTICS: for (i = 0; i < priv->child_count; i++) { eh_dump(shd->statistics[i], output, afr_add_crawl_event); - afr_shd_dict_add_crawl_event( + ret = afr_shd_dict_add_crawl_event( this, output, &shd->index_healers[i].crawl_event); - afr_shd_dict_add_crawl_event(this, output, - &shd->full_healers[i].crawl_event); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL); + } + + ret = afr_shd_dict_add_crawl_event( + this, output, &shd->full_healers[i].crawl_event); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL); 
+ } } break; case GF_SHD_OP_STATISTICS_HEAL_COUNT: @@ -1474,7 +1684,7 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) if (!priv->child_up[i]) { keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i); - ret = dict_set_nstrn(output, key, keylen, + AFR_SET_DICT_AND_LOG(this->name, output, key, keylen, SBRICK_NOT_CONNECTED, SLEN(SBRICK_NOT_CONNECTED)); } else { @@ -1483,6 +1693,10 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) if (ret == 0) { ret = dict_set_uint64(output, key, cnt); } + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, -ret, + AFR_MSG_DICT_SET_FAILED, NULL); + } op_ret = 0; } } @@ -1490,11 +1704,13 @@ afr_xl_op(xlator_t *this, dict_t *input, dict_t *output) break; default: - gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, - "Unknown set op %d", op); + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d", + op, NULL); break; } out: dict_deln(output, this->name, this_name_len); return op_ret; + +#undef AFR_SET_DICT_AND_LOG } diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index 19905394540..18db728ea7b 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -14,12 +14,11 @@ #include <pthread.h> typedef struct { - int child; char *path; + int child; } shd_event_t; typedef struct { - int child; uint64_t healed_count; uint64_t split_brain_count; uint64_t heal_failed_count; @@ -31,32 +30,33 @@ typedef struct { crawler is in progress */ time_t end_time; char *crawl_type; + int child; } crawl_event_t; struct subvol_healer { xlator_t *this; - int subvol; - gf_boolean_t local; - gf_boolean_t running; - gf_boolean_t rerun; crawl_event_t crawl_event; pthread_mutex_t mutex; pthread_cond_t cond; pthread_t thread; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; }; typedef struct { - gf_boolean_t iamshd; - gf_boolean_t enabled; - int timeout; struct subvol_healer *index_healers; struct subvol_healer *full_healers; eh_t *split_brain; eh_t **statistics; + int timeout; uint32_t max_threads; uint32_t wait_qlength; uint32_t halo_max_latency_msec; + gf_boolean_t iamshd; + gf_boolean_t enabled; } afr_self_heald_t; int @@ -70,6 +70,6 @@ afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p); int -afr_shd_index_purge(xlator_t *subvol, inode_t *inode, char *name, +afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name, ia_type_t type); #endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 88cb186dedb..a51f79b1f43 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -124,9 +124,9 @@ afr_release_notify_lock_for_ta(void *opaque) this = (xlator_t *)opaque; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate loc for thin-arbiter."); goto out; } @@ -521,42 +521,6 @@ afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this) local->transaction.pre_op_sources[j] = 0; } -gf_boolean_t -afr_has_arbiter_fop_cbk_quorum(call_frame_t *frame) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - xlator_t *this = NULL; - gf_boolean_t fop_failed = _gf_false; - unsigned char *pre_op_sources = NULL; - int i = 0; - - local = frame->local; - this = 
frame->this; - priv = this->private; - pre_op_sources = local->transaction.pre_op_sources; - - /* If the fop failed on the brick, it is not a source. */ - for (i = 0; i < priv->child_count; i++) - if (local->transaction.failed_subvols[i]) - pre_op_sources[i] = 0; - - switch (AFR_COUNT(pre_op_sources, priv->child_count)) { - case 1: - if (pre_op_sources[ARBITER_BRICK_INDEX]) - fop_failed = _gf_true; - break; - case 0: - fop_failed = _gf_true; - break; - } - - if (fop_failed) - return _gf_false; - - return _gf_true; -} - void afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this) { @@ -873,7 +837,7 @@ afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame) priv = this->private; up_children_count = AFR_COUNT(subvols, priv->child_count); - if (afr_lookup_has_quorum(frame, this, subvols)) + if (afr_lookup_has_quorum(frame, up_children_count)) return _gf_true; if (priv->quorum_count == AFR_QUORUM_AUTO) { @@ -971,12 +935,8 @@ afr_need_dirty_marking(call_frame_t *frame, xlator_t *this) priv->child_count) return _gf_false; - if (priv->arbiter_count) { - if (!afr_has_arbiter_fop_cbk_quorum(frame)) - need_dirty = _gf_true; - } else if (!afr_has_fop_cbk_quorum(frame)) { + if (!afr_has_fop_cbk_quorum(frame)) need_dirty = _gf_true; - } return need_dirty; } @@ -1026,12 +986,8 @@ afr_handle_quorum(call_frame_t *frame, xlator_t *this) * no split-brain with the fix. The problem is eliminated completely. */ - if (priv->arbiter_count) { - if (afr_has_arbiter_fop_cbk_quorum(frame)) - return; - } else if (afr_has_fop_cbk_quorum(frame)) { + if (afr_has_fop_cbk_quorum(frame)) return; - } if (afr_need_dirty_marking(frame, this)) goto set_response; @@ -1073,7 +1029,7 @@ set_response: } int -afr_fill_ta_loc(xlator_t *this, loc_t *loc) +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop) { afr_private_t *priv = NULL; @@ -1081,6 +1037,11 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc) loc->parent = inode_ref(priv->root_inode); gf_uuid_copy(loc->pargfid, loc->parent->gfid); loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) { + /* Except afr_ta_id_file_check() which is path based, all other gluster + * FOPS need gfid.*/ + return -EINVAL; + } gf_uuid_copy(loc->gfid, priv->ta_gfid); loc->inode = inode_new(loc->parent->table); if (!loc->inode) { @@ -1090,86 +1051,6 @@ afr_fill_ta_loc(xlator_t *this, loc_t *loc) return 0; } -int -afr_changelog_thin_arbiter_post_op(xlator_t *this, afr_local_t *local) -{ - int ret = 0; - afr_private_t *priv = NULL; - dict_t *xattr = NULL; - int failed_count = 0; - struct gf_flock flock = { - 0, - }; - loc_t loc = { - 0, - }; - int i = 0; - - priv = this->private; - if (!priv->thin_arbiter_count) - return 0; - - failed_count = AFR_COUNT(local->transaction.failed_subvols, - priv->child_count); - if (!failed_count) - return 0; - - GF_ASSERT(failed_count == 1); - ret = afr_fill_ta_loc(this, &loc); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Failed to populate thin-arbiter loc for: %s.", loc.name); - goto out; - } - - xattr = dict_new(); - if (!xattr) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_static_bin(xattr, priv->pending_key[i], - local->pending[i], - AFR_NUM_CHANGE_LOGS * sizeof(int)); - if (ret) - goto out; - } - - flock.l_type = F_WRLCK; - flock.l_start = 0; - flock.l_len = 0; - - /*TODO: Convert to two domain locking. 
*/ - ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], - AFR_TA_DOM_NOTIFY, &loc, F_SETLKW, &flock, NULL, NULL); - if (ret) - goto out; - - ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, - GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); - - if (ret == -EINVAL) { - gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_THIN_ARB, - "Thin-arbiter has denied post-op on %s for gfid %s.", - priv->pending_key[THIN_ARBITER_BRICK_INDEX], - uuid_utoa(local->inode->gfid)); - - } else if (ret) { - gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Post-op on thin-arbiter id file %s failed for gfid %s.", - priv->pending_key[THIN_ARBITER_BRICK_INDEX], - uuid_utoa(local->inode->gfid)); - } - flock.l_type = F_UNLCK; - syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY, - &loc, F_SETLK, &flock, NULL, NULL); -out: - if (xattr) - dict_unref(xattr); - - return ret; -} - static int afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque) { @@ -1264,9 +1145,9 @@ afr_ta_post_op_do(void *opaque) this = local->transaction.frame->this; priv = this->private; - ret = afr_fill_ta_loc(this, &loc); + ret = afr_fill_ta_loc(this, &loc, _gf_true); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, "Failed to populate loc for thin-arbiter."); goto out; } @@ -2466,8 +2347,13 @@ afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this, goto out; } - if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP)) { - /*Only allow writes but shard does [f]xattrops on writes, so + if (local->transaction.disable_delayed_post_op) { + goto out; + } + + if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) && + (local->op != GF_FOP_FSYNC)) { + /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so * they are fine too*/ goto out; } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index b2c64ecf0e6..df7366f0a65 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -135,6 +135,27 @@ set_data_self_heal_algorithm(afr_private_t *priv, char *algo) } } +void +afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options) +{ + char *volfile_id_str = NULL; + uuid_t anon_inode_gfid = {0}; + + /*If volume id is not present don't enable anything*/ + if (dict_get_str(options, "volume-id", &volfile_id_str)) + return; + GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX); + /*anon_inode_name is not supposed to change once assigned*/ + if (!priv->anon_inode_name[0]) { + snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s", + AFR_ANON_DIR_PREFIX, volfile_id_str); + gf_uuid_parse(volfile_id_str, anon_inode_gfid); + /*Flip a bit to make sure volfile-id and anon-gfid are not same*/ + anon_inode_gfid[0] ^= 1; + uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str); + } +} + int reconfigure(xlator_t *this, dict_t *options) { @@ -168,7 +189,8 @@ reconfigure(xlator_t *this, dict_t *options) bool, out); GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out); - gf_string2boolean(data_self_heal, &priv->data_self_heal); + if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1) + goto out; GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool, out); @@ -241,6 +263,8 @@ reconfigure(xlator_t *this, dict_t *options) out); GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out); + GF_OPTION_RECONF("optimistic-change-log", 
priv->optimistic_change_log, + options, bool, out); GF_OPTION_RECONF("quorum-type", qtype, options, str, out); GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out); fix_quorum_options(this, priv, qtype, options); @@ -287,6 +311,10 @@ reconfigure(xlator_t *this, dict_t *options) consistent_io = _gf_false; priv->consistent_io = consistent_io; + afr_handle_anon_inode_options(priv, options); + + GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool, + out); if (priv->shd.enabled) { if ((priv->shd.enabled != enabled_old) || (timeout_old != priv->shd.timeout)) @@ -417,6 +445,8 @@ init(xlator_t *this) goto out; priv = this->private; + INIT_LIST_HEAD(&priv->saved_locks); + INIT_LIST_HEAD(&priv->lk_healq); LOCK_INIT(&priv->lock); child_count = xlator_subvolume_count(this); @@ -481,7 +511,8 @@ init(xlator_t *this) GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out); GF_OPTION_INIT("data-self-heal", data_self_heal, str, out); - gf_string2boolean(data_self_heal, &priv->data_self_heal); + if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1) + goto out; GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str, out); @@ -535,7 +566,9 @@ init(xlator_t *this) GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out); GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out); + afr_handle_anon_inode_options(priv, this->options); + GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out); if (priv->quorum_count != 0) priv->consistent_io = _gf_false; @@ -547,13 +580,19 @@ init(xlator_t *this) goto out; } + priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); + priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char); priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count, gf_afr_mt_child_latency_t); + priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count, + gf_afr_mt_char); - if (!priv->child_up || !priv->child_latency) { + if (!priv->child_up || !priv->child_latency || !priv->halo_child_up || + !priv->anon_inode) { ret = -ENOMEM; goto out; } @@ -684,6 +723,7 @@ fini(xlator_t *this) priv = this->private; afr_selfheal_daemon_fini(this); + GF_ASSERT(list_empty(&priv->saved_locks)); LOCK(&priv->lock); if (priv->timer != NULL) { @@ -728,6 +768,7 @@ struct xlator_fops fops = { .getxattr = afr_getxattr, .fgetxattr = afr_fgetxattr, .readv = afr_readv, + .seek = afr_seek, /* inode write */ .writev = afr_writev, @@ -1276,6 +1317,14 @@ struct volume_options options[] = { .tags = {"replicate"}, .description = "This option exists only for backward compatibility " "and configuring it doesn't have any effect"}, + {.key = {"use-anonymous-inode"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", + .op_version = {GD_OP_VERSION_8_0}, + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE, + .tags = {"replicate"}, + .description = "Setting this option heals directory renames efficiently"}, + {.key = {NULL}}, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index a3f2942b317..d62f9a9caf2 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -39,7 +39,10 @@ #define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify" #define AFR_TA_DOM_MODIFY "afr.ta.dom-modify" +#define AFR_LK_HEAL_DOM "afr.lock-heal.domain" + #define AFR_HALO_MAX_LATENCY 99999 +#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode" #define PFLAG_PENDING (1 << 0) #define PFLAG_SBRAIN (1 << 1) @@ 
-95,6 +98,16 @@ typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this); gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \ } while (0) +#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \ + do { \ + afr_fd_ctx_t *__fd_ctx = NULL; \ + __fd_ctx = afr_fd_ctx_get(__fd, __this); \ + if (__fd_ctx && __fd_ctx->is_fd_bad) { \ + __error = EBADF; \ + goto __label; \ + } \ + } while (0) + typedef enum { AFR_READ_POLICY_FIRST_UP, AFR_READ_POLICY_GFID_HASH, @@ -139,10 +152,23 @@ typedef enum { } afr_ta_fop_state_t; struct afr_nfsd { - gf_boolean_t iamnfsd; uint32_t halo_max_latency_msec; + gf_boolean_t iamnfsd; }; +typedef struct _afr_lk_heal_info { + fd_t *fd; + int32_t cmd; + struct gf_flock flock; + dict_t *xdata_req; + unsigned char *locked_nodes; + struct list_head pos; + gf_lkowner_t lk_owner; + pid_t pid; + int32_t *child_up_event_gen; + int32_t *child_down_event_gen; +} afr_lk_heal_info_t; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -153,20 +179,21 @@ typedef struct _afr_private { inode_t *root_inode; + int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ /* For thin-arbiter. */ - unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/ uuid_t ta_gfid; - unsigned char ta_child_up; + unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/ int ta_bad_child_index; int ta_event_gen; - off_t ta_notify_dom_lock_offset; - gf_boolean_t release_ta_notify_dom_lock; unsigned int ta_in_mem_txn_count; unsigned int ta_on_wire_txn_count; struct list_head ta_waitq; struct list_head ta_onwireq; + unsigned char *anon_inode; unsigned char *child_up; + unsigned char *halo_child_up; int64_t *child_latency; unsigned char *local; @@ -187,30 +214,31 @@ typedef struct _afr_private { int32_t healers; /* No. of elements currently undergoing background heal*/ + gf_boolean_t release_ta_notify_dom_lock; + gf_boolean_t metadata_self_heal; /* on/off */ gf_boolean_t entry_self_heal; /* on/off */ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ - afr_read_hash_mode_t hash_mode; /* for when read_child is not set */ - gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/ - int favorite_child; /* subvolume to be preferred in resolving - split-brain cases */ + gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/ - afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic - resolution of split-brains.*/ + gf_timer_t *timer; /* launched when parent up is received */ unsigned int wait_count; /* # of servers to wait for success */ - gf_timer_t *timer; /* launched when parent up is received */ - + unsigned char ta_child_up; gf_boolean_t optimistic_change_log; gf_boolean_t eager_lock; gf_boolean_t pre_op_compat; /* on/off */ uint32_t post_op_delay_secs; unsigned int quorum_count; - char vol_uuid[UUID_SIZE + 1]; + off_t ta_notify_dom_lock_offset; + afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic + resolution of split-brains.*/ + afr_read_hash_mode_t hash_mode; /* for when read_child is not set */ + int32_t *last_event; /* @event_generation: Keeps count of number of events received which can @@ -223,31 +251,41 @@ typedef struct _afr_private { important as we might have had a network split brain. 
*/ uint32_t event_generation; + char vol_uuid[UUID_SIZE + 1]; gf_boolean_t choose_local; gf_boolean_t did_discovery; - uint64_t sh_readdir_size; gf_boolean_t ensure_durability; + gf_boolean_t halo_enabled; + gf_boolean_t consistent_metadata; + gf_boolean_t need_heal; + gf_boolean_t granular_locks; + uint64_t sh_readdir_size; char *sh_domain; char *afr_dirty; - gf_boolean_t halo_enabled; - uint32_t halo_max_latency_msec; - uint32_t halo_max_replicas; - uint32_t halo_min_replicas; + uint64_t spb_choice_timeout; afr_self_heald_t shd; struct afr_nfsd nfsd; - gf_boolean_t consistent_metadata; - uint64_t spb_choice_timeout; - gf_boolean_t need_heal; + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; - gf_boolean_t granular_locks; gf_boolean_t full_lock; gf_boolean_t esh_granular; gf_boolean_t consistent_io; gf_boolean_t data_self_heal; /* on/off */ + gf_boolean_t use_anon_inode; + + /*For lock healing.*/ + struct list_head saved_locks; + struct list_head lk_healq; + + /*For anon-inode handling */ + char anon_inode_name[NAME_MAX + 1]; + char anon_gfid_str[UUID_SIZE + 1]; } afr_private_t; typedef enum { @@ -311,18 +349,17 @@ afr_entry_lockee_cmp(const void *l1, const void *l2); typedef struct { loc_t *lk_loc; - int lockee_count; afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; - char lower_locked; - char higher_locked; unsigned char *lower_locked_nodes; - int32_t lock_count; + afr_lock_cbk_t lock_cbk; + + int lockee_count; int32_t lk_call_count; int32_t lk_expected_count; @@ -330,14 +367,15 @@ typedef struct { int32_t lock_op_ret; int32_t lock_op_errno; - afr_lock_cbk_t lock_cbk; char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ + int32_t lock_count; + char lower_locked; + char higher_locked; } afr_internal_lock_t; struct afr_reply { int valid; int32_t op_ret; - int32_t op_errno; dict_t *xattr; /*For xattrop*/ dict_t *xdata; struct iatt poststat; @@ -346,6 +384,7 @@ struct afr_reply { struct iatt preparent; struct iatt preparent2; struct iatt postparent2; + int32_t op_errno; /* For rchecksum */ uint8_t checksum[SHA256_DIGEST_LENGTH]; gf_boolean_t buf_has_zeroes; @@ -369,6 +408,10 @@ typedef struct { arrives, we continue to read off this subvol. */ int readdir_subvol; + /* lock-healing related members. */ + gf_boolean_t is_fd_bad; + afr_lk_heal_info_t *lk_heal_info; + } afr_fd_ctx_t; typedef enum { @@ -385,8 +428,6 @@ typedef struct _afr_inode_lock_t { */ int32_t num_inodelks; unsigned int event_generation; - gf_boolean_t release; - gf_boolean_t acquired; gf_timer_t *delay_timer; struct list_head owners; /*Transactions that are performing fop*/ struct list_head post_op; /*Transactions that are done with the fop @@ -395,6 +436,8 @@ typedef struct _afr_inode_lock_t { *conflicting transactions to complete*/ struct list_head frozen; /*Transactions that need to go as part of * next batch of eager-lock*/ + gf_boolean_t release; + gf_boolean_t acquired; } afr_lock_t; typedef struct _afr_inode_ctx { @@ -403,15 +446,11 @@ typedef struct _afr_inode_ctx { int lock_count; int spb_choice; gf_timer_t *timer; - gf_boolean_t need_refresh; unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; int inherited[AFR_NUM_CHANGE_LOGS]; int on_disk[AFR_NUM_CHANGE_LOGS]; - - /* set if any write on this fd was a non stable write - (i.e, without O_SYNC or O_DSYNC) - */ - gf_boolean_t witnessed_unstable_write; + /*Only 2 types of transactions support eager-locks now. 
DATA/METADATA*/ + afr_lock_t lock[2]; /* @open_fd_count: Number of open FDs queried from the server, as queried through @@ -419,8 +458,12 @@ typedef struct _afr_inode_ctx { temporarily disabled. */ uint32_t open_fd_count; - /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/ - afr_lock_t lock[2]; + gf_boolean_t need_refresh; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; } afr_inode_ctx_t; typedef struct _afr_local { @@ -434,19 +477,15 @@ typedef struct _afr_local { unsigned int event_generation; uint32_t open_fd_count; - gf_boolean_t update_open_fd_count; int32_t num_inodelks; - gf_boolean_t update_num_inodelks; - - gf_lkowner_t saved_lk_owner; int32_t op_ret; int32_t op_errno; - int32_t **pending; - int dirty[AFR_NUM_CHANGE_LOGS]; + int32_t **pending; + loc_t loc; loc_t newloc; @@ -477,14 +516,6 @@ typedef struct _afr_local { afr_read_txn_wind_t readfn; - /* @refreshed: - - the inode was "refreshed" (i.e, pending xattrs from all subvols - freshly inspected and inode ctx updated accordingly) as part of - this transaction already. - */ - gf_boolean_t refreshed; - /* @inode: the inode on which the read txn is performed on. ref'ed and copied @@ -509,8 +540,6 @@ typedef struct _afr_local { unsigned char *readable; unsigned char *readable2; /*For rename transaction*/ - int read_subvol; /* Current read subvolume */ - afr_inode_refresh_cbk_t refreshfn; /* @refreshinode: @@ -519,9 +548,30 @@ typedef struct _afr_local { */ inode_t *refreshinode; + dict_t *xattr_req; + + dict_t *dict; + + int read_subvol; /* Current read subvolume */ + + int optimistic_change_log; + + afr_internal_lock_t internal_lock; + /*To handle setattr/setxattr on yet to be linked inode from dht*/ uuid_t refreshgfid; + /* @refreshed: + + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; + + gf_boolean_t update_num_inodelks; + gf_boolean_t update_open_fd_count; + /* @pre_op_compat: @@ -531,14 +581,6 @@ typedef struct _afr_local { gf_boolean_t pre_op_compat; - dict_t *xattr_req; - - afr_internal_lock_t internal_lock; - - dict_t *dict; - - int optimistic_change_log; - /* Is the current writev() going to perform a stable write? i.e, is fd->flags or @flags writev param have O_SYNC or O_DSYNC? 
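Aside from the renames, the bulk of the afr.h and afr-self-heald.h churn in the surrounding hunks is layout-only: members of shd_event_t, crawl_event_t, struct subvol_healer, afr_self_heald_t, afr_private_t, afr_internal_lock_t, struct afr_reply, afr_lock_t, afr_inode_ctx_t and afr_local_t are reordered so that pointers and wide integers come first and chars/gf_boolean_t flags are grouped at the end, which removes alignment padding holes on LP64 targets. A self-contained illustration of the effect (hypothetical struct, not from the patch):

    #include <stdio.h>

    struct before {   /* char, pointer, char: two padding holes on LP64 */
        char a;       /* offset 0, then 7 bytes of padding */
        void *p;      /* offset 8 */
        char b;       /* offset 16, then 7 bytes of tail padding */
    };                /* sizeof == 24 */

    struct after {    /* pointer first, chars packed together */
        void *p;      /* offset 0 */
        char a;       /* offset 8 */
        char b;       /* offset 9, then 6 bytes of tail padding */
    };                /* sizeof == 16 */

    int main(void)
    {
        printf("%zu %zu\n", sizeof(struct before), sizeof(struct after));
        return 0;
    }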
@@ -557,25 +599,25 @@ typedef struct _afr_local { struct { struct { - gf_boolean_t needs_fresh_lookup; - uuid_t gfid_req; - } lookup; - - struct { - unsigned char buf_set; struct statvfs buf; + unsigned char buf_set; } statfs; struct { - int32_t flags; fd_t *fd; + int32_t flags; } open; struct { - int32_t cmd; struct gf_flock user_flock; struct gf_flock ret_flock; unsigned char *locked_nodes; + int32_t cmd; + /*For lock healing only.*/ + unsigned char *dom_locked_nodes; + int32_t *dom_lock_op_ret; + int32_t *dom_lock_op_errno; + struct gf_flock *getlk_rsp; } lk; /* inode read */ @@ -600,8 +642,8 @@ typedef struct _afr_local { struct { char *name; - int last_index; long xattr_len; + int last_index; } getxattr; struct { @@ -614,11 +656,10 @@ typedef struct _afr_local { /* dir read */ struct { + uint32_t *checksum; int success_count; int32_t op_ret; int32_t op_errno; - - uint32_t *checksum; } opendir; struct { @@ -627,8 +668,8 @@ typedef struct _afr_local { size_t size; off_t offset; dict_t *dict; - gf_boolean_t failed; int last_index; + gf_boolean_t failed; } readdir; /* inode write */ @@ -638,12 +679,11 @@ typedef struct _afr_local { } inode_wfop; // common structure for all inode-write-fops struct { - int32_t op_ret; - struct iovec *vector; struct iobref *iobref; - int32_t count; off_t offset; + int32_t op_ret; + int32_t count; uint32_t flags; } writev; @@ -703,29 +743,25 @@ typedef struct _afr_local { } create; struct { + dict_t *params; dev_t dev; mode_t mode; - dict_t *params; } mknod; struct { - int32_t mode; dict_t *params; + int32_t mode; } mkdir; struct { - int flags; - } rmdir; - - struct { dict_t *params; char *linkpath; } symlink; struct { - int32_t mode; off_t offset; size_t len; + int32_t mode; } fallocate; struct { @@ -752,10 +788,10 @@ typedef struct _afr_local { struct { char *volume; char *basename; + void *xdata; entrylk_cmd in_cmd; entrylk_cmd cmd; entrylk_type type; - void *xdata; } entrylk; struct { @@ -764,31 +800,33 @@ typedef struct _afr_local { } seek; struct { - int32_t datasync; - } fsync; - - struct { struct gf_lease user_lease; struct gf_lease ret_lease; unsigned char *locked_nodes; } lease; - } cont; + struct { + int flags; + } rmdir; - struct { - off_t start, len; + struct { + int32_t datasync; + } fsync; - gf_boolean_t eager_lock_on; - gf_boolean_t do_eager_unlock; + struct { + uuid_t gfid_req; + gf_boolean_t needs_fresh_lookup; + } lookup; + + } cont; + struct { char *basename; char *new_basename; loc_t parent_loc; loc_t new_parent_loc; - afr_transaction_type type; - /* stub to resume on destruction of the transaction frame */ call_stub_t *resume_stub; @@ -806,6 +844,30 @@ typedef struct _afr_local { FOP failed. */ unsigned char *failed_subvols; + call_frame_t *main_frame; /*Fop frame*/ + call_frame_t *frame; /*Transaction frame*/ + + int (*wind)(call_frame_t *frame, xlator_t *this, int subvol); + + int (*unwind)(call_frame_t *frame, xlator_t *this); + + off_t start, len; + + afr_transaction_type type; + + int32_t in_flight_sb_errno; /* This is where the cause of the + failure on the last good copy of + the file is stored. + */ + + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ + afr_changelog_resume_t changelog_resume; + + gf_boolean_t eager_lock_on; + gf_boolean_t do_eager_unlock; + /* @dirtied: flag which indicates whether we set dirty flag in the OP. 
Typically true when we are performing operation on more than one subvol and optimistic changelog is disabled @@ -830,6 +892,10 @@ typedef struct _afr_local { */ gf_boolean_t no_uninherit; + gf_boolean_t in_flight_sb; /* Indicator for occurrence of + split-brain while in the middle of + a txn. */ + /* @uninherit_done: @uninherit_value: @@ -842,27 +908,7 @@ typedef struct _afr_local { gf_boolean_t uninherit_done; gf_boolean_t uninherit_value; - gf_boolean_t in_flight_sb; /* Indicator for occurrence of - split-brain while in the middle of - a txn. */ - int32_t in_flight_sb_errno; /* This is where the cause of the - failure on the last good copy of - the file is stored. - */ - - /* @changelog_resume: function to be called after changlogging - (either pre-op or post-op) is done - */ - afr_changelog_resume_t changelog_resume; - - call_frame_t *main_frame; /*Fop frame*/ - call_frame_t *frame; /*Transaction frame*/ - - int (*wind)(call_frame_t *frame, xlator_t *this, int subvol); - - int (*unwind)(call_frame_t *frame, xlator_t *this); - - /* post-op hook */ + gf_boolean_t disable_delayed_post_op; } transaction; syncbarrier_t barrier; @@ -875,36 +921,36 @@ typedef struct _afr_local { mode_t umask; int xflag; - gf_boolean_t do_discovery; struct afr_reply *replies; /* For client side background heals. */ struct list_head healer; call_frame_t *heal_frame; - gf_boolean_t need_full_crawl; - afr_fop_lock_state_t fop_lock_state; - - gf_boolean_t is_read_txn; afr_inode_ctx_t *inode_ctx; /*For thin-arbiter transactions.*/ - unsigned char read_txn_query_child; - unsigned char ta_child_up; + int ta_failed_subvol; + int ta_event_gen; struct list_head ta_waitq; struct list_head ta_onwireq; afr_ta_fop_state_t fop_state; - int ta_failed_subvol; - int ta_event_gen; + afr_fop_lock_state_t fop_lock_state; + gf_lkowner_t saved_lk_owner; + unsigned char read_txn_query_child; + unsigned char ta_child_up; + gf_boolean_t do_discovery; + gf_boolean_t need_full_crawl; + gf_boolean_t is_read_txn; gf_boolean_t is_new_entry; } afr_local_t; typedef struct afr_spbc_timeout { call_frame_t *frame; - gf_boolean_t d_spb; - gf_boolean_t m_spb; loc_t *loc; int spb_child_index; + gf_boolean_t d_spb; + gf_boolean_t m_spb; } afr_spbc_timeout_t; typedef struct afr_spb_status { @@ -914,9 +960,9 @@ typedef struct afr_spb_status { typedef struct afr_empty_brick_args { call_frame_t *frame; + char *op_type; loc_t loc; int empty_index; - char *op_type; } afr_empty_brick_args_t; typedef struct afr_read_subvol_args { @@ -958,7 +1004,10 @@ afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, int event_generation); int -afr_inode_event_gen_reset(inode_t *inode, xlator_t *this); +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); + +int +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this); int afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, @@ -1074,6 +1123,9 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); if (__local && __local->is_read_txn) \ afr_pending_read_decrement(__this->private, \ __local->read_subvol); \ + if (__local && __local->xdata_req && \ + afr_is_lock_mode_mandatory(__local->xdata_req)) \ + afr_dom_lock_release(frame); \ frame->local = NULL; \ } \ \ @@ -1226,8 +1278,8 @@ int afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice); int -afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, - int *spb_choice); +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol); int afr_get_child_index_from_name(xlator_t 
*this, char *name); @@ -1312,7 +1364,7 @@ int afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode); int -afr_fill_ta_loc(xlator_t *this, loc_t *loc); +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop); int afr_ta_post_op_lock(xlator_t *this, loc_t *loc); @@ -1339,8 +1391,8 @@ void afr_ta_locked_priv_invalidate(afr_private_t *priv); gf_boolean_t -afr_lookup_has_quorum(call_frame_t *frame, xlator_t *this, - unsigned char *subvols); +afr_lookup_has_quorum(call_frame_t *frame, + const unsigned int up_children_count); void afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this); @@ -1354,4 +1406,18 @@ afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, void afr_selfheal_childup(xlator_t *this, afr_private_t *priv); + +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata); + +void +afr_dom_lock_release(call_frame_t *frame); + +void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies); + +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid); #endif /* __AFR_H__ */
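One functional addition worth a worked example: the new use-anonymous-inode support derives both the private directory name and its gfid from the volume id in afr_handle_anon_inode_options() (afr.c hunk above). The same derivation as a standalone sketch, using libuuid directly instead of gluster's gf_uuid_* wrappers, with a made-up volume id:

    #include <stdio.h>
    #include <uuid/uuid.h>       /* link with -luuid */

    #define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"

    int main(void)
    {
        /* Made-up volume id, for illustration only. */
        const char *volume_id = "6bedcb0c-b996-4b9f-9d01-ae7ac1d684f1";
        char anon_inode_name[255 + 1];   /* NAME_MAX + 1 in the patch */
        char anon_gfid_str[37];
        uuid_t anon_gfid;

        /* Directory name: "<prefix>-<volume-id>". */
        snprintf(anon_inode_name, sizeof(anon_inode_name), "%s-%s",
                 AFR_ANON_DIR_PREFIX, volume_id);

        /* Gfid: the volume id with one bit flipped. */
        if (uuid_parse(volume_id, anon_gfid) != 0)
            return 1;
        anon_gfid[0] ^= 1;
        uuid_unparse(anon_gfid, anon_gfid_str);

        printf("name: %s\ngfid: %s\n", anon_inode_name, anon_gfid_str);
        return 0;
    }

The xor of the low bit guarantees the anonymous-inode gfid can never equal the volume id itself, as the comment in that hunk notes.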
