diff options
Diffstat (limited to 'xlators')
249 files changed, 56393 insertions, 14819 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am index b1643d26c..f60fa85ce 100644 --- a/xlators/Makefile.am +++ b/xlators/Makefile.am @@ -1,3 +1,4 @@ -SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system +SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system \ + playground CLEANFILES = diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index a9acb4094..af01f2ef2 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -51,7 +51,7 @@ #define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL #define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL - +#define AFR_STATISTICS_HISTORY_SIZE 50 int afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, gf_boolean_t fail_conflict); @@ -779,6 +779,12 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) sh = &local->self_heal; priv = this->private; + if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) + GF_FREE (sh->data_sh_info); + + if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) + GF_FREE (sh->metadata_sh_info); + GF_FREE (sh->buf); GF_FREE (sh->parentbufs); @@ -826,7 +832,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; @@ -836,7 +843,9 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->internal_lock.locked_nodes); - GF_FREE (local->internal_lock.inode_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } GF_FREE (local->internal_lock.lower_locked_nodes); @@ -1071,6 +1080,88 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) uuid_copy (loc->pargfid, postparent->ia_gfid); } +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (afr_local_t *local, xlator_t *this, + dict_t *rsp_dict) +{ + int32_t *sources = NULL; + dict_t *xattr = NULL; + data_t *max_data = NULL; + int64_t max_quota_size = -1; + data_t *data = NULL; + int64_t *size = NULL; + int64_t quota_size = -1; + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + gf_boolean_t source_present = _gf_false; + + priv = this->private; + sources = local->cont.lookup.sources; + + if (rsp_dict == NULL) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " + "response dictionary", local->loc.path); + return; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source_present = _gf_true; + break; + } + } + + for (i = 0; i < priv->child_count; i++) { + /* + * If there is at least one source lets check + * for maximum quota sizes among sources, otherwise take the + * maximum of the ones present to be on the safer side. + */ + if (source_present && !sources[i]) + continue; + + xattr = local->cont.lookup.xattrs[i]; + if (!xattr) + continue; + + data = dict_get (xattr, QUOTA_SIZE_KEY); + if (!data) + continue; + + size = (int64_t*)data->data; + quota_size = ntoh64(*size); + gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, + local->loc.path, i, quota_size); + if (quota_size > max_quota_size) { + if (max_data) + data_unref (max_data); + + max_quota_size = quota_size; + max_data = data_ref (data); + } + } + + if (max_data) { + ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "quota size", local->loc.path); + } + + data_unref (max_data); + } +} + int afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) { @@ -1115,13 +1206,18 @@ afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) ret = -1; goto out; } + gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", read_child); if (!*xattr) *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + *buf = local->cont.lookup.bufs[read_child]; *postparent = local->cont.lookup.postparents[read_child]; + if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) + afr_handle_quota_size (local, this, *xattr); + if (IA_INVAL == local->cont.lookup.inode->ia_type) { /* fix for RT #602 */ local->cont.lookup.inode->ia_type = buf->ia_type; @@ -1270,7 +1366,7 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { /* mismatching gfid */ - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "%s: gfid different on subvolume", local->loc.path); } } @@ -1494,7 +1590,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, child2 = &bufs[success_children[i-1]]; if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype " + gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " "differs on subvolumes (%d, %d)", path, success_children[i-1], success_children[i]); conflicting = _gf_true; @@ -1503,7 +1599,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, if (!gfid || uuid_is_null (child1->ia_gfid)) continue; if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs" + gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" " on subvolume %d", path, success_children[i]); conflicting = _gf_true; goto out; @@ -1632,7 +1728,6 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, afr_local_t *local = NULL; int ret = -1; dict_t *xattr = NULL; - int32_t spb = 0; local = frame->local; @@ -1663,10 +1758,6 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, local->loc.path, local->self_heal.actual_sh_started); } - - if (local->loc.inode) - spb = afr_is_split_brain (this, local->loc.inode); - ret = dict_set_int32 (xattr, "split-brain", spb); } out: AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, @@ -1741,7 +1832,8 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, afr_lookup_set_self_heal_params (local, this); if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local)) + if (afr_is_transaction_running (local) && + (!local->allow_sh_for_running_transaction)) goto out; reason = "lookup detected pending operations"; @@ -2336,7 +2428,7 @@ afr_lookup (call_frame_t *frame, xlator_t *this, int call_count = 0; uint64_t ctx = 0; int32_t op_errno = 0; - + int allow_sh = 0; priv = this->private; AFR_LOCAL_ALLOC_OR_GOTO (local, out); @@ -2353,6 +2445,13 @@ afr_lookup (call_frame_t *frame, xlator_t *this, goto out; } + if (local->loc.path && + (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { + op_errno = EPERM; + ret = -1; + goto out; + } + ret = inode_ctx_get (local->loc.inode, this, &ctx); if (ret == 0) { /* lookup is a revalidate */ @@ -2401,6 +2500,11 @@ afr_lookup (call_frame_t *frame, xlator_t *this, /* By default assume ENOTCONN. On success it will be set to 0. */ local->op_errno = ENOTCONN; + ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction", + &allow_sh); + dict_del (xattr_req, "allow-sh-for-running-transaction"); + local->allow_sh_for_running_transaction = allow_sh; + ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, &gfid_req); if (ret) { @@ -2578,15 +2682,42 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, return 0; } +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + + priv = this->private; + local = frame->local; + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + local->fd, NULL); + if (!--call_count) + break; + + } + } + + return 0; +} + int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_stub_t *stub = NULL; int ret = -1; int op_errno = 0; - int call_count = -1; - int i = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2602,23 +2733,14 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) goto out; local->fd = fd_ref(fd); - call_count = local->call_count; - - afr_delayed_changelog_wake_up (this, fd); - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - local->fd, NULL); - - if (!--call_count) - break; - } + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) { + ret = -1; + op_errno = ENOMEM; + goto out; } + afr_delayed_changelog_wake_resume (this, fd, stub); ret = 0; out: @@ -2694,6 +2816,16 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) @@ -2718,13 +2850,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = 0; if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } local->success_count++; @@ -2746,10 +2878,11 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, post-op. This guarantee is expected by FUSE graph switching for example. */ - stub = fop_fsync_cbk_stub (frame, default_fsync_cbk, + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf, xdata); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + xdata); if (!stub) { AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); return 0; @@ -3966,6 +4099,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } + local->append_write = _gf_false; + ret = 0; out: return ret; @@ -3977,11 +4112,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, { int ret = -ENOMEM; - lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->inode_locked_nodes) - goto out; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), child_count, gf_afr_mt_char); if (NULL == lk->locked_nodes) @@ -4040,6 +4170,21 @@ out: } int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + +int afr_transaction_local_init (afr_local_t *local, xlator_t *this) { int child_up_count = 0; @@ -4052,6 +4197,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (ret < 0) goto out; + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + ret = -ENOMEM; child_up_count = afr_up_children_count (local->child_up, priv->child_count); @@ -4278,6 +4431,16 @@ afr_priv_destroy (afr_private_t *priv) if (priv->shd.split_brain) eh_destroy (priv->shd.split_brain); + for (i = 0; i < priv->child_count; i++) + { + if (priv->shd.statistics[i]) + eh_destroy (priv->shd.statistics[i]); + } + + GF_FREE (priv->shd.statistics); + + GF_FREE (priv->shd.crawl_events); + GF_FREE (priv->last_event); if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) @@ -4356,3 +4519,73 @@ afr_is_fd_fixable (fd_t *fd) return _gf_true; } + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + inode_t *inode = NULL; + afr_inode_ctx_t *ctx = NULL; + + local = frame->local; + + if (local->fd) + inode = local->fd->inode; + else + inode = local->loc.inode; + + if (!inode) + return; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); + ctx->open_fd_count = local->open_fd_count; + } + UNLOCK (&inode->lock); +} + +int +afr_initialise_statistics (xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + int i = 0; + int child_count = 0; + eh_t *stats_per_brick = NULL; + shd_crawl_event_t ***shd_crawl_events = NULL; + priv = this->private; + + priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!priv->shd.statistics) { + ret = -1; + goto out; + } + child_count = priv->child_count; + for (i=0; i < child_count ; i++) { + stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + _destroy_crawl_event_data); + if (!stats_per_brick) { + ret = -1; + goto out; + } + priv->shd.statistics[i] = stats_per_brick; + + } + + shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); + *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), + priv->child_count, + gf_afr_mt_shd_crawl_event_t); + + if (!priv->shd.crawl_events) { + ret = -1; + goto out; + } + ret = 0; +out: + return ret; + +} diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 0e1718814..689dd84e6 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -253,7 +253,7 @@ unlock: goto out; if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1) { + up_children_count > 1 && priv->entry_self_heal) { /* * This is the first opendir on this inode. We need @@ -383,14 +383,36 @@ afr_forget_entries (fd_t *fd) } } +static void +afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) +{ + gf_dirent_t * entry = NULL; + gf_dirent_t * tmp = NULL; + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + list_del_init (&entry->list); + GF_FREE (entry); + } + } +} int32_t afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); + afr_local_t *local = NULL; + if (op_ret == -1) + goto out; + + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); + +out: + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); return 0; } @@ -400,8 +422,16 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) { - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); + afr_local_t *local = NULL; + + if (op_ret == -1) + goto out; + + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); +out: + AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 5a22696ce..1943b719b 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -235,26 +235,6 @@ out: } void -afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) -{ - xlator_t *this = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - this = frame->this; - local = frame->local; - priv = this->private; - - if (local->op_ret >= 0) - goto out; - - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); -out: - return; -} - -void afr_dir_fop_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; @@ -273,7 +253,6 @@ afr_dir_fop_done (call_frame_t *frame, xlator_t *this) done: local->transaction.unwind (frame, this); afr_dir_fop_mark_entry_pending_changelog (frame, this); - afr_dir_fop_handle_all_fop_failures (frame); local->transaction.resume (frame, this); } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 051b7b164..e06e3b2f2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -117,6 +117,8 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -232,6 +234,8 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -348,10 +352,7 @@ afr_fstat (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd->inode, out); - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + AFR_SBRAIN_CHECK_FD (fd, out); AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -472,6 +473,8 @@ afr_readlink (call_frame_t *frame, xlator_t *this, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -1323,6 +1326,62 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, return ret; } +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + + if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_min_stime (THIS, data, key, value); + + return ret; +} + +int32_t +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; + } + + if (!local->dict) + local->dict = dict_copy_with_ref (dict, NULL); + else + dict_foreach (dict, afr_aggregate_stime_xattr, + local->dict); + local->op_ret = 0; + } + +cleanup: + UNLOCK (&frame->lock); + + if (!callcnt) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, local->dict, xdata); + } + +out: + return 0; +} + + static gf_boolean_t afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, gf_boolean_t is_fgetxattr) @@ -1355,6 +1414,8 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, } else { *cbk = afr_getxattr_lockinfo_cbk; } + } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; } else { is_spl = _gf_false; } @@ -1403,7 +1464,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, int32_t read_child = -1; int ret = -1; fop_getxattr_cbk_t cbk = NULL; - + int afr_xtime_gauge[MCNT_MAX] = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1414,6 +1475,8 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -1452,6 +1515,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, sub_volumes, priv->child_count, MARKER_UUID_TYPE, + marker_uuid_default_gauge, priv->vol_uuid)) { gf_log (this->name, GF_LOG_INFO, @@ -1498,12 +1562,20 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, } + /* don't err out on getting ENOTCONN (brick down) + * from a subset of the bricks + */ + memcpy (afr_xtime_gauge, marker_xtime_default_gauge, + sizeof (afr_xtime_gauge)); + afr_xtime_gauge[MCNT_NOTFOUND] = 0; + afr_xtime_gauge[MCNT_ENOTCONN] = 0; if (cluster_getmarkerattr (frame, this, loc, name, local, afr_getxattr_unwind, sub_volumes, priv->child_count, MARKER_XTIME_TYPE, + afr_xtime_gauge, priv->vol_uuid)) { gf_log (this->name, GF_LOG_INFO, "%s: failed to get marker attr (%s)", @@ -1660,10 +1732,8 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this, children = priv->children; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + AFR_SBRAIN_CHECK_FD (fd, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); frame->local = local; @@ -1819,10 +1889,7 @@ afr_readv (call_frame_t *frame, xlator_t *this, priv = this->private; children = priv->children; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + AFR_SBRAIN_CHECK_FD (fd, out); AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index ce4fbf226..c1ec69a55 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -39,6 +39,46 @@ #include "afr-transaction.h" #include "afr-self-heal-common.h" +void +__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, + xlator_t *this, int32_t *op_ret, int32_t *op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (afr_fop_failed (*op_ret, *op_errno)) { + local->child_errno[child_index] = *op_errno; + + switch (local->op) { + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + if (*op_errno != EFBIG) + afr_transaction_fop_failed (frame, this, + child_index); + break; + default: + afr_transaction_fop_failed (frame, this, child_index); + break; + } + local->op_errno = *op_errno; + goto out; + } + + if ((local->success_count == 0) || (read_child == child_index)) { + local->op_ret = *op_ret; + if (prebuf) + local->cont.inode_wfop.prebuf = *prebuf; + if (postbuf) + local->cont.inode_wfop.postbuf = *postbuf; + } + + local->success_count++; +out: + return; +} + /* {{{ writev */ void @@ -52,8 +92,8 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) dst_local->op_ret = src_local->op_ret; dst_local->op_errno = src_local->op_errno; - dst_local->cont.writev.prebuf = src_local->cont.writev.prebuf; - dst_local->cont.writev.postbuf = src_local->cont.writev.postbuf; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; } void @@ -64,8 +104,8 @@ afr_writev_unwind (call_frame_t *frame, xlator_t *this) AFR_STACK_UNWIND (writev, frame, local->op_ret, local->op_errno, - &local->cont.writev.prebuf, - &local->cont.writev.postbuf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -133,12 +173,17 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; + afr_private_t *priv = NULL; call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int call_count = -1; int read_child = 0; + int ret = 0; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; local = frame->local; + priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -148,12 +193,14 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + local->replies[child_index].valid = 1; local->replies[child_index].op_ret = op_ret; local->replies[child_index].op_errno = op_errno; - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); /* stage the best case return value for unwind */ if ((local->success_count == 0) || (op_ret > local->op_ret)) { @@ -162,12 +209,24 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if (op_ret != -1) { - if ((local->success_count == 0) || - (child_index == read_child)) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - local->success_count++; + if (xdata) { + ret = dict_get_uint32 (xdata, + GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if ((ret == 0) && + (open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; + } + + write_is_append = 0; + ret = dict_get_uint32 (xdata, + GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + } + } } UNLOCK (&frame->lock); @@ -176,10 +235,23 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) { - if (!local->stable_write) - afr_fd_report_unstable_write (this, local->fd); + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write (this, local->fd); afr_writev_handle_short_writes (frame, this); + if (afr_any_fops_failed (local, priv)) { + //Don't unwind until post-op is complete + local->transaction.resume (frame, this); + } else { /* * Generally inode-write fops do transaction.unwind then * transaction.resume, but writev needs to make sure that @@ -191,10 +263,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * completed. */ - fop_frame = afr_transaction_detach_fop_frame (frame); - afr_writev_copy_outvars (frame, fop_frame); - local->transaction.resume (frame, this); - afr_writev_unwind (fop_frame, this); + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); + local->transaction.resume (frame, this); + afr_writev_unwind (fop_frame, this); + } } return 0; } @@ -206,6 +279,8 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; int i = 0; int call_count = -1; + dict_t *xdata = NULL; + GF_UNUSED int ret = 0; local = frame->local; priv = this->private; @@ -229,6 +304,19 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) return 0; } + xdata = dict_new (); + if (xdata) { + ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + sizeof (uint32_t)); + ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + 0); + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; + } + for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, @@ -241,13 +329,16 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) local->cont.writev.offset, local->cont.writev.flags, local->cont.writev.iobref, - NULL); + xdata); if (!--call_count) break; } } + if (xdata) + dict_unref (xdata); + return 0; } @@ -515,8 +606,8 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, - &local->cont.truncate.prebuf, - &local->cont.truncate.postbuf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -530,14 +621,11 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int read_child = 0; int call_count = -1; - int need_unwind = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); @@ -547,38 +635,22 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -606,6 +678,7 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -726,8 +799,8 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, - &local->cont.ftruncate.prebuf, - &local->cont.ftruncate.postbuf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } return 0; @@ -740,14 +813,11 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int call_count = -1; - int need_unwind = 0; int read_child = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -757,38 +827,22 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -816,6 +870,7 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -970,8 +1025,8 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, - &local->cont.setattr.preop_buf, - &local->cont.setattr.postop_buf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -1002,29 +1057,14 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1179,8 +1219,8 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, - &local->cont.fsetattr.preop_buf, - &local->cont.fsetattr.postop_buf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -1211,29 +1251,14 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1406,28 +1431,23 @@ int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1596,28 +1616,24 @@ int afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1796,28 +1812,23 @@ int afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->wait_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1985,28 +1996,24 @@ int afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + if (local->success_count == priv->wait_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -2151,3 +2158,704 @@ out: return 0; } + +static int +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fallocate, + local->fd, + local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_fallocate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_fallocate (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_FALLOCATE; + + local->transaction.fop = afr_fallocate_wind; + local->transaction.done = afr_fallocate_done; + local->transaction.unwind = afr_fallocate_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; + + /* fallocate can modify the file size */ + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(fallocate,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_fallocate (frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ discard */ + +static int +afr_discard_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_discard_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->discard, + local->fd, + local->cont.discard.offset, + local->cont.discard.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_discard_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_discard (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_DISCARD; + + local->transaction.fop = afr_discard_wind; + local->transaction.done = afr_discard_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(discard, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.discard.offset = offset; + local->cont.discard.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_discard(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + + +/* {{{ zerofill */ + +static int +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, + local->op_errno, + &local->cont.zerofill.prebuf, + &local->cont.zerofill.postbuf, + NULL); + } + return 0; +} + +static int +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + if (afr_fop_failed (op_ret, op_errno)) { + afr_transaction_fop_failed (frame, this, child_index); + } + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + if (child_index == read_child) { + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + local->success_count++; + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->zerofill, + local->fd, + local->cont.zerofill.offset, + local->cont.zerofill.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_zerofill_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_zerofill(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_ZEROFILL; + + local->transaction.fop = afr_zerofill_wind; + local->transaction.done = afr_zerofill_done; + local->transaction.unwind = afr_zerofill_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, + AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(zerofill, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { + goto out; + } + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_zerofill(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +/* }}} */ + + diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index ed11079fd..8e93ca44a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -68,4 +68,15 @@ int32_t afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata); +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index e091a7939..060d78f35 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -57,11 +57,15 @@ int afr_entry_lockee_cmp (const void *l1, const void *l2) { - const afr_entry_lockee_t *r1 = l1; - const afr_entry_lockee_t *r2 = l2; - int ret = 0; - - ret = uuid_compare (r1->loc.gfid, r2->loc.gfid); + const afr_entry_lockee_t *r1 = l1; + const afr_entry_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid ((loc_t*)&r1->loc, gfid1); + loc_gfid ((loc_t*)&r2->loc, gfid2); + ret = uuid_compare (gfid1, gfid2); /*Entrylks with NULL basename are the 'smallest'*/ if (ret == 0) { if (!r1->basename) @@ -75,7 +79,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2) return -1; else return 1; - } int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); @@ -556,19 +559,23 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - int i = 0; + afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + inodelk->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); return 0; } @@ -649,7 +656,9 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; local = frame->local; int_lock = &local->internal_lock; @@ -658,14 +667,18 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, AFR_UNLOCK_OP, NULL, op_ret, op_errno, child_index); + priv = this->private; + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on %d " - "unlock by %s", local->loc.path, child_index, + gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " + "with lock owner %s", local->loc.path, + priv->children[child_index]->name, lkowner_utoa (&frame->root->lk_owner)); } - int_lock->inode_locked_nodes[child_index] &= LOCKED_NO; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; @@ -679,6 +692,7 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; @@ -694,12 +708,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; flock.l_type = F_UNLCK; full_flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, + call_count = afr_locked_nodes_count (inodelk->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; @@ -715,8 +731,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) fd_ctx = afr_fd_ctx_get (local->fd, this); for (i = 0; i < priv->child_count; i++) { - if ((int_lock->inode_locked_nodes[i] & LOCKED_YES) - != LOCKED_YES) + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) continue; if (local->fd) { @@ -757,7 +772,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long)i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLK, flock_use, NULL); if (!--call_count) @@ -772,7 +787,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long)i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLK, &flock, NULL); if (!--call_count) @@ -858,7 +873,7 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[index], priv->children[index]->fops->entrylk, - this->name, + int_lock->domain, &int_lock->lockee[lockee_no].loc, int_lock->lockee[lockee_no].basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); @@ -961,6 +976,7 @@ static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -971,10 +987,10 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; break; case AFR_ENTRY_RENAME_TRANSACTION: @@ -1025,6 +1041,7 @@ int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; @@ -1039,10 +1056,15 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) priv = this->private; child_index = cookie % priv->child_count; lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); + - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } if (local->fd) { ret = fd_ctx_get (local->fd, this, &ctx); @@ -1064,8 +1086,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) } if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { - is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - if ((is_entrylk && int_lock->entrylk_lock_count == 0) || (!is_entrylk && int_lock->lock_count == 0)) { gf_log (this->name, GF_LOG_INFO, @@ -1114,7 +1134,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLKW, &flock, NULL); } else { @@ -1127,7 +1147,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLKW, &flock, NULL); } @@ -1148,7 +1168,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->fentrylk, - this->name, local->fd, + int_lock->domain, local->fd, int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } else { @@ -1161,7 +1181,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->entrylk, - this->name, + int_lock->domain, &int_lock->lockee[lockee_no].loc, int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); @@ -1316,6 +1336,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); return -1; } @@ -1390,6 +1411,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; int call_count = 0; int child_index = (long) cookie; @@ -1398,6 +1420,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, @@ -1423,9 +1446,8 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; } else { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; if (local->transaction.eager_lock && local->transaction.eager_lock[child_index] && @@ -1448,8 +1470,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "Last inode locking reply received"); /* all locks successful. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - int_lock->lk_expected_count) { + if (inodelk->lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; @@ -1473,6 +1494,7 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; @@ -1488,11 +1510,13 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; - full_flock.l_type = int_lock->lk_flock.l_type; + full_flock.l_type = inodelk->flock.l_type; initialize_inodelk_variables (frame, this); @@ -1508,6 +1532,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); ret = -1; goto out; } @@ -1566,7 +1591,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLK, flock_use, NULL); if (!--call_count) @@ -1588,7 +1613,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLK, &flock, NULL); if (!--call_count) @@ -2112,29 +2137,38 @@ out: return ret; } -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { - afr_local_t *dst_local = NULL; - afr_local_t *src_local = NULL; - afr_internal_lock_t *dst_lock = NULL; - afr_internal_lock_t *src_lock = NULL; + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; - dst_local = dst->local; - dst_lock = &dst_local->internal_lock; src_local = src->local; src_lock = &src_local->internal_lock; - if (src_lock->inode_locked_nodes) { - memcpy (dst_lock->inode_locked_nodes, - src_lock->inode_locked_nodes, - sizeof (*dst_lock->inode_locked_nodes) * child_count); - memset (src_lock->inode_locked_nodes, 0, - sizeof (*src_lock->inode_locked_nodes) * child_count); + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) + goto out; + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); } dst_lock->transaction_lk_type = src_lock->transaction_lk_type; dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; - dst_lock->inodelk_lock_count = src_lock->inodelk_lock_count; - src_lock->inodelk_lock_count = 0; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index e01ab366f..73594f265 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,7 +41,10 @@ enum gf_afr_mem_types_ { gf_afr_mt_shd_event_t, gf_afr_mt_time_t, gf_afr_mt_pos_data_t, - gf_afr_mt_reply_t, + gf_afr_mt_reply_t, + gf_afr_mt_stats_t, + gf_afr_mt_shd_crawl_event_t, + gf_afr_mt_uint64_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 1721fd270..83846f152 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -100,7 +100,7 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, } sh_private_cleanup (sh_frame, this); - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { GF_ASSERT (!last_loop_frame); //loop_finish should have happened and the old_loop should be NULL gf_log (this->name, GF_LOG_DEBUG, @@ -143,7 +143,7 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) } if (loop_sh && loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, + afr_sh_data_unlock (loop_frame, this, this->name, sh_destroy_frame); } else { sh_destroy_frame (loop_frame, this); @@ -214,7 +214,7 @@ sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, goto out; //We want the frame to have same lk_owner as sh_frame //so that locks translator allows conflicting locks - new_loop_local = afr_local_copy (local, this); + new_loop_local = afr_self_heal_local_init (local, this); if (!new_loop_local) goto out; new_loop_frame->local = new_loop_local; @@ -273,10 +273,10 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, new_loop_sh->offset = offset; new_loop_sh->block_size = sh->block_size; afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - _gf_true, sh_loop_lock_success, sh_loop_lock_failure); + _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); return 0; out: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); if (old_loop_frame) sh_loop_finish (old_loop_frame, this); sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); @@ -307,8 +307,9 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, sh_priv->loops_running--; offset = sh_priv->offset; block_size = sh->block_size; - while ((!sh->eof_reached) && (0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) + while ((!sh->eof_reached) && + (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && + (sh_priv->loops_running < priv->data_self_heal_window_size) && (sh_priv->offset < sh->file_size)) { loop++; @@ -327,7 +328,8 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, if (0 == loop) { //loop finish does unlock, but the erasing of the pending //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && !sh->op_failed) + if (is_driver_done && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) goto driver_done; if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -338,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, //If we have more loops to form we should finish previous loop after //the next loop lock while (loop--) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { // op failed in other loop, stop spawning more loops if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -384,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame } if (op_ret == -1) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); if (loop_frame) { sh_loop_finish (loop_frame, this); @@ -432,16 +434,16 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (loop_sh, op_errno); } else if (op_ret < loop_local->cont.writev.vector->iov_len) { - gf_log(this->name, GF_LOG_ERROR, - "incomplete write to %s on subvolume %s " - "(expected %lu, returned %d)", sh_local->loc.path, - priv->children[child_index]->name, - loop_local->cont.writev.vector->iov_len, op_ret); - sh->op_failed = 1; - } + gf_log (this->name, GF_LOG_ERROR, + "incomplete write to %s on subvolume %s " + "(expected %lu, returned %d)", sh_local->loc.path, + priv->children[child_index]->name, + loop_local->cont.writev.vector->iov_len, op_ret); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } call_count = afr_frame_return (loop_frame); @@ -514,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, if (op_ret <= 0) { if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); gf_log (this->name, GF_LOG_ERROR, "read failed on %d " "for %s reason :%s", sh->source, sh_local->loc.path, strerror (errno)); @@ -624,7 +626,7 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, "checksum on %s failed on subvolume %s (%s)", sh_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, strong_checksum, MD5_DIGEST_LENGTH); @@ -662,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, } UNLOCK (&sh_priv->lock); - if (write_needed && !sh->op_failed) { + if (write_needed && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh_loop_read (loop_frame, this); } else { sh_loop_return (sh_frame, this, loop_frame, @@ -751,14 +754,15 @@ out: return sh_priv; } -void -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { afr_local_t *dst_local = NULL; afr_self_heal_t *dst_sh = NULL; afr_local_t *src_local = NULL; afr_self_heal_t *src_sh = NULL; + int ret = -1; dst_local = dst->local; dst_sh = &dst_local->self_heal; @@ -766,9 +770,12 @@ afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, src_sh = &src_local->self_heal; GF_ASSERT (src_sh->data_lock_held); GF_ASSERT (!dst_sh->data_lock_held); - afr_lk_transfer_datalock (dst, src, child_count); + ret = afr_lk_transfer_datalock (dst, src, dom, child_count); + if (ret) + return ret; src_sh->data_lock_held = _gf_false; dst_sh->data_lock_held = _gf_true; + return 0; } int @@ -790,7 +797,10 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); if (ret) goto out; - afr_sh_transfer_lock (first_loop_frame, sh_frame, priv->child_count); + ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, + priv->child_count); + if (ret) + goto out; sh->private = afr_sh_priv_init (); if (!sh->private) { ret = -1; @@ -800,7 +810,7 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, ret = 0; out: if (ret) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); sh_loop_driver_done (sh_frame, this, NULL); } return 0; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 2538f4c8b..ef92b4205 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -18,6 +18,28 @@ #include "afr-self-heal.h" #include "pump.h" +#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + +#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ + AFR_SELF_HEAL_FAILED == status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + + void afr_sh_reset (call_frame_t *frame, xlator_t *this) { @@ -141,9 +163,8 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) GF_FREE (buf); } -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc) +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) { afr_private_t * priv = this->private; char *buf = NULL; @@ -173,10 +194,8 @@ afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + (child_count * child_count * pending_entry_strlen); buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); - if (!buf) { - buf = ""; + if (!buf) goto out; - } ptr = buf; ptr += sprintf (ptr, "%s", msg); @@ -192,11 +211,27 @@ afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, ptr += sprintf (ptr, "%s", matrix_end); out: + return buf; +} + +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc) +{ + char *buf = NULL; + char *free_ptr = NULL; + + buf = afr_get_pending_matrix_str (pending_matrix, this); + if (buf) + free_ptr = buf; + else + buf = ""; + + gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" " (possible split-brain). Please delete the file from all but " "the preferred subvolume.%s", loc, buf); - if (buf) - GF_FREE (buf); + GF_FREE (free_ptr); return; } @@ -466,6 +501,8 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, { int i = 0; int biggest_witness = -1; + int biggest_witness_idx = -1; + int biggest_witness_cnt = -1; GF_ASSERT (witnesses); GF_ASSERT (characters); @@ -475,10 +512,21 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, if (characters[i].type != AFR_NODE_FOOL) continue; - if (biggest_witness < witnesses[i]) + if (biggest_witness < witnesses[i]) { biggest_witness = witnesses[i]; + biggest_witness_idx = i; + biggest_witness_cnt = 1; + continue; + } + + if (biggest_witness == witnesses[i]) + biggest_witness_cnt++; } - return biggest_witness; + + if (biggest_witness_cnt != 1) + return -1; + + return biggest_witness_idx; } int @@ -506,10 +554,84 @@ afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, return nsources; } + +int +afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) +{ + if (idx >= 0 && idx < child_count) { + sources[idx] = 1; + return 1; + } + return 0; +} + + +static int +afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_size = 0; + uint64_t min_size = 0; + int num_children = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_size > max_size) { + max_size = bufs[child].ia_size; + idx = child; + } + + if ((num_children == 0) || (bufs[child].ia_size < min_size)) { + min_size = bufs[child].ia_size; + } + + num_children++; + } + + /* If sizes are same for all of them, finding sources will have to + * happen with pending changelog. So return -1 + */ + if ((num_children > 1) && (min_size == max_size)) + return -1; + return idx; +} + + +static int +afr_find_newest_file (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_ctime = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_ctime > max_ctime) { + max_ctime = bufs[child].ia_ctime; + idx = child; + } + } + + return idx; +} + + static int afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, afr_node_character *characters, - int child_count) + int32_t *success_children, + int child_count, struct iatt *bufs) { int32_t biggest_witness = 0; int nsources = 0; @@ -517,6 +639,11 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, GF_ASSERT (child_count > 0); + biggest_witness = afr_find_largest_file_size (bufs, success_children, + child_count); + if (biggest_witness != -1) + goto found; + witnesses = GF_CALLOC (child_count, sizeof (*witnesses), gf_afr_mt_int32_t); if (NULL == witnesses) { @@ -529,9 +656,15 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, biggest_witness = afr_find_biggest_witness_among_fools (witnesses, characters, child_count); - nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, - characters, child_count, - biggest_witness); + if (biggest_witness != -1) + goto found; + + biggest_witness = afr_find_newest_file (bufs, success_children, + child_count); + +found: + nsources = afr_mark_fool_as_source_by_idx (sources, child_count, + biggest_witness); out: GF_FREE (witnesses); return nsources; @@ -875,7 +1008,8 @@ afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, nsources = afr_mark_biggest_of_fools_as_source (sources, pending_matrix, characters, - child_count); + success_children, + child_count, bufs); } out: @@ -1012,14 +1146,13 @@ afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) afr_sh_reset (frame, this); - if (local->govinda_gOvinda) { + if (local->unhealable) { gf_log (this->name, GF_LOG_DEBUG, "split brain found, aborting selfheal of %s", local->loc.path); - sh->op_failed = 1; } - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh->completion_cbk (frame, this); } else { gf_log (this->name, GF_LOG_TRACE, @@ -1251,7 +1384,7 @@ out: if (ret) { gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " "reason: %s", local->loc.path, strerror (-ret)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } afr_sh_missing_entries_finish (frame, this); } @@ -1266,7 +1399,7 @@ afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, local = frame->local; sh = &local->self_heal; if (op_ret < 0) - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_finish (frame, this); return 0; } @@ -1290,7 +1423,7 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) if (!afr_valid_ia_type (type)) { gf_log (this->name, GF_LOG_ERROR, "%s: unknown file type: 0%o", local->loc.path, type); - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); afr_sh_missing_entries_finish (frame, this); goto out; } @@ -1323,8 +1456,9 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, loc = &local->loc; if (op_ret < 0) { - if (op_errno == EIO) - local->govinda_gOvinda = 1; + if (op_errno == EIO) { + afr_set_local_for_unhealable (local); + } // EIO can happen if finding the fresh parent dir failed goto out; } @@ -1386,7 +1520,7 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, } return; out: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; @@ -1470,7 +1604,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, LOCK (&frame->lock); { afr_sh_set_error (sh, EIO); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } UNLOCK (&frame->lock); } @@ -1552,7 +1686,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_missing_entries_finish (frame, this); } else { if (afr_gfid_missing_count (this->name, sh->fresh_children, @@ -1766,7 +1900,7 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, priv->child_count, ENOENT); if (fresh_child_enoents == fresh_parent_count) { afr_sh_set_error (sh, ENOENT); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_purge_entry (frame, this); } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, priv->child_count, local->loc.path, @@ -1780,14 +1914,14 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, afr_sh_purge_stale_entry (frame, this); } else { op_errno = EIO; - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); goto fail; } return; fail: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; @@ -1858,8 +1992,8 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, out: afr_sh_set_error (sh, op_errno); - sh->op_failed = 1; - afr_sh_missing_entries_finish (frame, this); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); return; } @@ -1962,7 +2096,7 @@ afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "Non blocking entrylks failed."); - sh->op_failed = -1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_done (frame, this); } else { @@ -1997,6 +2131,7 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, int_lock->lk_basename = base_name; int_lock->lk_loc = loc; int_lock->lock_cbk = lock_cbk; + int_lock->domain = this->name; int_lock->lockee_count = 0; afr_init_entry_lockee (&int_lock->lockee[0], local, loc, @@ -2041,19 +2176,29 @@ out: static int afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_missing_entry_sh_cbk); + afr_sh_post_nb_entrylk_missing_entry_sh_cbk); return 0; } afr_local_t* -afr_local_copy (afr_local_t *l, xlator_t *this) +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *lc = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *shc = NULL; + int ret = 0; priv = this->private; @@ -2076,13 +2221,23 @@ afr_local_copy (afr_local_t *l, xlator_t *this) shc->forced_merge = sh->forced_merge; shc->background = sh->background; shc->type = sh->type; + shc->data_sh_info = ""; + shc->metadata_sh_info = ""; uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); + if (l->loc.path) { + ret = loc_copy (&lc->loc, &l->loc); + if (ret < 0) + goto out; + } lc->child_up = memdup (l->child_up, sizeof (*lc->child_up) * priv->child_count); + if (!lc->child_up) { + ret = -1; + goto out; + } + if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -2090,59 +2245,25 @@ afr_local_copy (afr_local_t *l, xlator_t *this) lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); if (l->cont.lookup.xattr) lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - sizeof (*lc->internal_lock.locked_nodes) * priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - - for (i = 0; i < l->internal_lock.lockee_count; i++) { - loc_copy (&lc->internal_lock.lockee[i].loc, - &l->internal_lock.lockee[i].loc); - - lc->internal_lock.lockee[i].locked_count = - l->internal_lock.lockee[i].locked_count; - - if (l->internal_lock.lockee[i].basename) - lc->internal_lock.lockee[i].basename = - gf_strdup (l->internal_lock.lockee[i].basename); - - if (l->internal_lock.lockee[i].locked_nodes) { - lc->internal_lock.lockee[i].locked_nodes = - memdup (l->internal_lock.lockee[i].locked_nodes, - sizeof (*lc->internal_lock.lockee[i].locked_nodes) * - priv->child_count); - } else { - lc->internal_lock.lockee[i].locked_nodes = - GF_CALLOC (priv->child_count, - sizeof (*lc->internal_lock.lockee[i].locked_nodes), - gf_afr_mt_char); - } + lc->internal_lock.locked_nodes = + GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), + priv->child_count, gf_afr_mt_char); + if (!lc->internal_lock.locked_nodes) { + ret = -1; + goto out; } - lc->internal_lock.lockee_count = l->internal_lock.lockee_count; - - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret) + goto out; out: + if (ret) { + afr_local_cleanup (lc, this); + lc = NULL; + } return lc; } @@ -2155,32 +2276,28 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) afr_local_t * orig_frame_local = NULL; afr_self_heal_t * orig_frame_sh = NULL; char sh_type_str[256] = {0,}; + gf_loglevel_t loglevel = 0; priv = this->private; local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda) { + if (local->unhealable) { afr_set_split_brain (this, sh->inode, SPB, SPB); - sh->op_failed = 1; } afr_self_heal_type_str_get (sh, sh_type_str, sizeof(sh_type_str)); - if (sh->op_failed) { - gf_loglevel_t loglevel = GF_LOG_ERROR; - if (priv->shd.iamshd) - loglevel = GF_LOG_DEBUG; - - gf_log (this->name, loglevel, "background %s self-heal " - "failed on %s", sh_type_str, local->loc.path); - + if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { + loglevel = GF_LOG_ERROR; + } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { + loglevel = GF_LOG_INFO; } else { - gf_log (this->name, GF_LOG_DEBUG, "background %s self-heal " - "completed on %s", sh_type_str, local->loc.path); - + loglevel = GF_LOG_DEBUG; } + afr_log_self_heal_completion_status (local, loglevel); + FRAME_SU_UNDO (bgsh_frame, afr_local_t); if (!sh->unwound && sh->unwind) { @@ -2188,7 +2305,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) orig_frame_sh = &orig_frame_local->self_heal; orig_frame_sh->actual_sh_started = _gf_true; sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - sh->op_failed); + is_self_heal_failed (sh, AFR_CHECK_ALL)); } if (sh->background) { @@ -2237,7 +2354,7 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) afr_set_lk_owner (sh_frame, this, sh_frame->root); afr_set_low_priority (sh_frame); - sh_local = afr_local_copy (local, this); + sh_local = afr_self_heal_local_init (local, this); if (!sh_local) goto out; sh_frame->local = sh_local; @@ -2302,6 +2419,8 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) sh->do_gfid_self_heal = _gf_false; } + sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; + FRAME_SU_DO (sh_frame, afr_local_t); if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { afr_self_heal_missing_entries (sh_frame, this); @@ -2511,9 +2630,183 @@ out: GF_FREE (erase_xattr); if (ret < 0) { - sh->op_failed = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); finish (frame, this); } return 0; } + +void +afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +{ + xlator_t *this = NULL; + afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); + afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" + "Structure"); + goto out; + } + + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + sh_status->gfid_or_missing_entry_self_heal = status; + break; + case AFR_SELF_HEAL_METADATA: + sh_status->metadata_self_heal = status; + break; + case AFR_SELF_HEAL_DATA: + sh_status->data_self_heal = status; + break; + case AFR_SELF_HEAL_ENTRY: + sh_status->entry_self_heal = status; + break; + case AFR_SELF_HEAL_INVALID: + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" + "self heal type in action"); + break; + } +out: + return; +} + +void +afr_set_local_for_unhealable (afr_local_t *local) +{ + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + local->unhealable = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +} + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) +{ + afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; + afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; + afr_self_heal_status status = AFR_SELF_HEAL_FAILED; + xlator_t *this = NULL; + int sh_failed = 0; + + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " + "structure"); + sh_failed = 1; + goto out; + } + + if (type == AFR_CHECK_ALL) { + if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) + sh_failed = 1; + } else if (type == AFR_CHECK_SPECIFIC) { + sh_type_in_action = sh->sh_type_in_action; + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + status = sh_status.gfid_or_missing_entry_self_heal; + break; + case AFR_SELF_HEAL_METADATA: + status = sh_status.metadata_self_heal; + break; + case AFR_SELF_HEAL_ENTRY: + status = sh_status.entry_self_heal; + break; + case AFR_SELF_HEAL_DATA: + status = sh_status.data_self_heal; + break; + case AFR_SELF_HEAL_INVALID: + status = AFR_SELF_HEAL_NOT_ATTEMPTED; + break; + } + if (status == AFR_SELF_HEAL_FAILED) + sh_failed = 1; + + } + +out: + return sh_failed; +} + +char * +get_sh_completion_status (afr_self_heal_status status) +{ + + char *not_attempted = " is not attempted"; + char *failed = " failed"; + char *started = " is started"; + char *sync_begin = " is successfully completed"; + char *result = " has unknown status"; + + switch (status) + { + case AFR_SELF_HEAL_NOT_ATTEMPTED: + result = not_attempted; + break; + case AFR_SELF_HEAL_FAILED: + result = failed; + break; + case AFR_SELF_HEAL_STARTED: + result = started; + break; + case AFR_SELF_HEAL_SYNC_BEGIN: + result = sync_begin; + break; + } + + return result; + +} + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) +{ + + char sh_log[4096] = {0}; + afr_self_heal_t *sh = &local->self_heal; + afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; + xlator_t *this = NULL; + size_t off = 0; + int data_sh = 0; + int metadata_sh = 0; + int print_log = 0; + + this = THIS; + + ADD_FMT_STRING (sh_log, off, "gfid or missing entry", + all_status.gfid_or_missing_entry_self_heal, print_log); + ADD_FMT_STRING_SYNC (sh_log, off, "metadata", + all_status.metadata_self_heal, print_log); + if (sh->background) { + ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", + all_status.data_self_heal, print_log); + } else { + ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", + all_status.data_self_heal, print_log); + } + ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, + print_log); + + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && + strcmp (sh->data_sh_info, "") && sh->data_sh_info ) + data_sh = 1; + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && + strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) + metadata_sh = 1; + + if (!print_log) + return; + + gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, + ((data_sh == 1) ? sh->data_sh_info : ""), + ((metadata_sh == 1) ? sh->metadata_sh_info : ""), + local->loc.path); +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 035fce543..473264776 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -15,13 +15,6 @@ #define AFR_SH_MIN_PARTICIPANTS 2 typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; - -typedef enum { AFR_LOOKUP_FAIL_CONFLICTS = 1, AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, } afr_lookup_flags_t; @@ -98,13 +91,13 @@ int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, int child_index); int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk); afr_local_t * -afr_local_copy (afr_local_t *l, xlator_t *this); +afr_self_heal_local_init (afr_local_t *l, xlator_t *this); int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, + off_t start, off_t len, gf_boolean_t block, char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler); void @@ -133,4 +126,19 @@ int afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, afr_transaction_type type, afr_fxattrop_cbk_t cbk, int (*finish)(call_frame_t *frame, xlator_t *this)); + +void +afr_set_local_for_unhealable (afr_local_t *local); + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); + +void +afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); + +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 2f63ed27d..9de26ee56 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -156,6 +156,25 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (sh->sh_dom_lock_held) + afr_sh_data_unlock (frame, this, priv->sh_domain, + afr_sh_data_close); + else + afr_sh_data_close (frame, this); + return 0; +} + +int afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost, dict_t *xdata) @@ -283,31 +302,45 @@ afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) //Fun fact, lock_cbk is being used for both lock & unlock int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int ret = 0; local = frame->local; int_lock = &local->internal_lock; sh = &local->self_heal; + priv = this->private; - GF_ASSERT (sh->data_lock_held); - - sh->data_lock_held = _gf_false; + if (strcmp (dom, this->name) == 0) { + sh->data_lock_held = _gf_false; + } else if (strcmp (dom, priv->sh_domain) == 0) { + sh->sh_dom_lock_held = _gf_false; + } else { + ret = -1; + goto out; + } int_lock->lock_cbk = lock_cbk; + int_lock->domain = dom; afr_unlock (frame, this); +out: + if (ret) { + int_lock->lock_op_ret = -1; + int_lock->lock_cbk (frame, this); + } return 0; } int afr_sh_data_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; sh = &local->self_heal; @@ -316,9 +349,9 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this) "finishing data selfheal of %s", local->loc.path); if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); + afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); else - afr_sh_data_close (frame, this); + afr_sh_dom_unlock (frame, this); return 0; } @@ -335,11 +368,8 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_DEBUG, "finishing failed data selfheal of %s", local->loc.path); - sh->op_failed = 1; - if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); - else - afr_sh_data_close (frame, this); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_finish (frame, this); return 0; } @@ -362,13 +392,13 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, "log failed on %s for subvol %s, reason: %s", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { if (sh->old_loop_frame) sh_loop_finish (sh->old_loop_frame, this); sh->old_loop_frame = NULL; @@ -380,7 +410,7 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, goto out; } GF_ASSERT (sh->old_loop_frame); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, afr_post_sh_big_lock_success, afr_post_sh_big_lock_failure); } @@ -418,7 +448,7 @@ afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index]->name, strerror (op_errno)); LOCK (&frame->lock); { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } UNLOCK (&frame->lock); if (sh->old_loop_frame) @@ -428,7 +458,7 @@ afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) afr_sh_data_fail (frame, this); else afr_sh_data_erase_pending (frame, this); @@ -604,7 +634,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_DEBUG, "ftruncate of %s on subvolume %s completed", @@ -617,7 +647,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) afr_sh_data_fail (frame, this); else afr_sh_data_sync_prepare (frame, this); @@ -697,6 +727,199 @@ out: return ret; } +char* +afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sizes_str = NULL; + size_t off = 0; + char *fmt_str = "%llu bytes on %s, "; + char *child_down = " %s,"; + char *child_unknown = " %s,"; + int down_child_present = 0; + int down_count = 0; + int unknown_count = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is "; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + len += snprintf (num, sizeof (num), fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), child_down, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count ++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), child_unknown, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } + + } + + if (down_child_present) { + if (down_count > 1) + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + if (unknown_child_present) { + if (unknown_count > 1) + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } + + len++;//for '\0' + + sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sizes_str) + return NULL; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + off += snprintf (sizes_str + off, len - off, fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } + } + + if (down_child_present) { + if (down_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) { + off += snprintf (sizes_str + off, len - off, child_down, + priv->children[i]->name); + } + } + + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) { + off += snprintf (sizes_str + off, len - off, + child_unknown, + priv->children[i]->name); + + } + } + + return sizes_str; +} + +char* +afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sinks_str = NULL; + char *temp_str = " to sinks "; + char *str_format = " %s,"; + char off = 0; + + priv = this->private; + + len += snprintf (num, sizeof (num), "%s", temp_str); + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), str_format, + priv->children[i]->name); + } + } + + len ++; + + sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sinks_str) + return NULL; + + off += snprintf (sinks_str + off, len - off, "%s", temp_str); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + off += snprintf (sinks_str + off, len - off, + str_format, + priv->children[i]->name); + } + } + + return sinks_str; + +} + + +void +afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) +{ + char *pending_matrix_str = NULL; + char *sizes_str = NULL; + char *sinks_str = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + if (!pending_matrix_str) + pending_matrix_str = ""; + + sizes_str = afr_get_sizes_str (local, sh->buf, this); + if (!sizes_str) + sizes_str = ""; + + sinks_str = afr_get_sinks_str (this, local, sh); + if (!sinks_str) + sinks_str = ""; + + gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " + "%s data %s", priv->children[sh->source]->name, sinks_str, + sizes_str, pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (sizes_str && strcmp (sizes_str, "")) + GF_FREE (sizes_str); +} + void afr_sh_data_fix (call_frame_t *frame, xlator_t *this) { @@ -718,7 +941,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) if (sh->background && sh->unwind && !sh->unwound) { sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - sh->op_failed); + is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); sh->unwound = _gf_true; } @@ -737,6 +960,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) sh->active_sinks); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); afr_sh_data_trim_sinks (frame, this); } @@ -817,6 +1041,7 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) } afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); } else { + afr_set_data_sh_info_str (local, sh, this); if (nsources == 0) { gf_log (this->name, GF_LOG_DEBUG, "No self-heal needed for %s", @@ -1174,6 +1399,22 @@ afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_dom_lock_held = _gf_true; + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, + afr_sh_data_big_lock_success, + afr_sh_data_fail); + return 0; +} + +int afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1237,9 +1478,11 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len) +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, + off_t start, off_t len) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; @@ -1250,11 +1493,14 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t le afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = start; - int_lock->lk_flock.l_len = len; - int_lock->lk_flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + int_lock->domain = dom; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->flock.l_start = start; + inodelk->flock.l_len = len; + inodelk->flock.l_type = F_WRLCK; + afr_nonblocking_inodelk (frame, this); return 0; @@ -1298,7 +1544,7 @@ afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, off_t start, off_t len, gf_boolean_t block, - afr_lock_cbk_t success_handler, + char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler) { afr_local_t * local = NULL; @@ -1310,7 +1556,7 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this, sh->data_lock_success_handler = success_handler; sh->data_lock_failure_handler = failure_handler; sh->data_lock_block = block; - return afr_sh_data_lock_rec (frame, this, start, len); + return afr_sh_data_lock_rec (frame, this, dom, start, len); } int @@ -1322,7 +1568,6 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_private_t *priv = NULL; int call_count = 0; int child_index = 0; - gf_boolean_t block = _gf_true; local = frame->local; sh = &local->self_heal; @@ -1342,7 +1587,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_TRACE, "open of %s succeeded on child %s", @@ -1355,7 +1600,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_data_fail (frame, this); return 0; } @@ -1364,15 +1609,8 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fd for %s opened, commencing sync", local->loc.path); - /* - * The read and write self-heal trigger codepaths do not provide - * an unwind callback. We run a trylock in these codepaths - * because we are sensitive to locking latency. - */ - block = sh->unwind ? _gf_true : _gf_false; - afr_sh_data_lock (frame, this, 0, 0, block, - afr_sh_data_big_lock_success, - afr_sh_data_fail); + afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, + afr_sh_dom_lock_success, afr_sh_data_fail); } return 0; @@ -1477,18 +1715,31 @@ afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) int afr_self_heal_data (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + int ret = -1; local = frame->local; sh = &local->self_heal; + sh->sh_type_in_action = AFR_SELF_HEAL_DATA; + if (afr_can_start_data_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + ret = afr_inodelk_init (&local->internal_lock.inodelk[1], + priv->sh_domain, priv->child_count); + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_done (frame, this); + return 0; + } + if (IA_ISREG (sh->type)) { afr_sh_data_open (frame, this); } else { afr_sh_data_lock (frame, this, 0, 0, _gf_true, + this->name, afr_sh_non_reg_lock_success, afr_sh_data_fail); } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index c3c9f9fca..53491a1d7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -162,7 +162,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; if (sh->entries_skipped) { - sh->op_failed = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); goto out; } afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, @@ -613,6 +613,19 @@ out: return 0; } +static gf_boolean_t +can_skip_entry_self_heal (char *name, loc_t *parent_loc) +{ + if (strcmp (name, ".") == 0) { + return _gf_true; + } else if (strcmp (name, "..") == 0) { + return _gf_true; + } else if (loc_is_root (parent_loc) && + (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { + return _gf_true; + } + return _gf_false; +} int afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, @@ -640,13 +653,7 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, sh->expunge_done = afr_sh_entry_expunge_entry_done; name = entry->d_name; - - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0)) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); + if (can_skip_entry_self_heal (name, &local->loc)) { op_ret = 0; goto out; } @@ -799,7 +806,7 @@ afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_sink (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { goto out; } @@ -1256,6 +1263,35 @@ afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", impunge_local->loc.path); + /* + * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : + * + * Problem: + * While a brick is down in a replica pair, lets say the user creates + * one file(file-A) and a hard link to that file(h-file-A). After the + * brick comes back up, entry self-heal is attempted on parent dir of + * these two files. As part of readdir in self-heal it reads both the + * entries file-A and h-file-A for both of them it does name less lookup + * to check if there are any hardlinks already present in the + * destination brick. It finds that there are no hard links already + * present for files file-A, h-file-A. Self-heal does mknods for both + * file-A and h-file-A. This leads to file-A and h-file-A not being + * hardlinks anymore. + * + * Fix: (More like shrinking of race-window, the race itself is still + * present in posix-mknod). + * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then + * posix_mknod checks if there are already any gfid-links and does + * link() instead of mknod. There still can be a race where two + * posix_mknods same gfid see that + * gfid-link file is not present and proceeds with mknods and result in + * two different files with same gfid. + */ + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", + impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, (void *) (long) child_index, priv->children[child_index], @@ -1872,12 +1908,7 @@ afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, active_src = sh->active_source; sh->impunge_done = afr_sh_entry_impunge_entry_done; - if ((strcmp (entry->d_name, ".") == 0) - || (strcmp (entry->d_name, "..") == 0)) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - entry->d_name, local->loc.path); + if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { op_ret = 0; goto out; } @@ -1946,7 +1977,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, local->loc.path, priv->children[active_src]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_TRACE, "readdir of %s on subvolume %s complete", @@ -2019,7 +2050,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_source (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -2068,7 +2099,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } } UNLOCK (&frame->lock); @@ -2076,7 +2107,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -2209,6 +2240,7 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) local->loc.path); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); afr_sh_entry_open (frame, this); return 0; @@ -2231,7 +2263,7 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, priv = this->private; if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_entry_finish (frame, this); goto out; @@ -2294,7 +2326,7 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " "failed for %s.", local->loc.path); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_entry_done (frame, this); } else { @@ -2313,14 +2345,18 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) int afr_self_heal_entry (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; - + afr_self_heal_t *sh = NULL; priv = this->private; local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_entrylk (frame, this, &local->loc, NULL, afr_sh_post_nonblocking_entry_cbk); } else { diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index cc85d9b9f..fd5da6cfd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -97,7 +97,7 @@ afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_metadata_finish (frame, this); return 0; } @@ -406,6 +406,140 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } +static void +afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, + xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *string = NULL; + size_t off = 0; + char *source_child = " from source %s to"; + char *format = " %s, "; + char *string_msg = " metadata self heal"; + char *pending_matrix_str = NULL; + int down_child_present = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is"; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + int down_count = 0; + int unknown_count = 0; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + + if (!pending_matrix_str) + pending_matrix_str = ""; + + len += snprintf (num, sizeof (num), "%s", string_msg); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), source_child, + priv->children[i]->name); + } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } + } + + if (down_child_present) { + if (down_count > 1) { + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + } + if (unknown_child_present) { + if (unknown_count > 1) { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } + } + + len ++; + + string = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + if (!string) + return; + + off += snprintf (string + off, len - off, "%s", string_msg); + for (i=0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) + off += snprintf (string + off, len - off, source_child, + priv->children[i]->name); + } + + for (i = 0; i < priv->child_count; i++) { + if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (down_child_present) { + if (down_count > 1) { + off += snprintf (string + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + down_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, + pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (string && strcmp (string, "")) + GF_FREE (string); +} int afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) @@ -436,6 +570,8 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) sh->active_sinks); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); + afr_set_metadata_sh_info_str (local, sh, this); STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, priv->children[source], priv->children[source]->fops->getxattr, @@ -461,7 +597,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, priv = this->private; if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_metadata_finish (frame, this); goto out; @@ -579,19 +715,22 @@ int afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; int_lock = &local->internal_lock; + int_lock->domain = this->name; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); int_lock->transaction_lk_type = AFR_SELFHEAL_LK; int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = LLONG_MAX - 1; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_start = LLONG_MAX - 1; + inodelk->flock.l_len = 0; + inodelk->flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; afr_nonblocking_inodelk (frame, this); @@ -618,8 +757,10 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; + sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; if (afr_can_start_metadata_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_metadata_lock (frame, this); } else { afr_sh_metadata_done (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index f33b04eed..1b48a1bca 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -25,7 +25,8 @@ typedef enum { typedef enum { HEAL = 1, - INFO + INFO, + STATISTICS_TO_BE_HEALED, } shd_crawl_op; typedef struct shd_dump { @@ -84,6 +85,33 @@ _loc_assign_gfid_path (loc_t *loc) } void +_destroy_crawl_event_data (void *data) +{ + shd_crawl_event_t *crawl_event = NULL; + + if (!data) + goto out; + + crawl_event = (shd_crawl_event_t *)data; + GF_FREE (crawl_event->start_time_str); + GF_FREE (crawl_event->end_time_str); + +out: + return; +} + +void +_destroy_shd_event_data (void *data) +{ + shd_event_t *event = NULL; + if (!data) + goto out; + event = (shd_event_t*)data; + GF_FREE (event->path); +out: + return; +} +void shd_cleanup_event (void *event) { shd_event_t *shd_event = event; @@ -128,6 +156,123 @@ _build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) } int +_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, + shd_crawl_event_t *shd_event, struct timeval *tv) +{ + int ret = 0; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = NULL; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + + healed_count = shd_event->healed_count; + split_brain_count = shd_event->split_brain_count; + heal_failed_count = shd_event->heal_failed_count; + start_time_str = shd_event->start_time_str; + end_time_str = shd_event->end_time_str; + crawl_type = shd_event->crawl_type; + + if (!start_time_str) { + ret = -1; + goto out; + } + + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, split_brain_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "split_brain_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_type to output"); + goto out; + } + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, heal_failed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_failed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_start_time to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, + xl_id, child, count); + + if (!end_time_str) + end_time_str = "Could not determine the end time"; + ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_end_time to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, + xl_id, child, count); + + if (shd_event->crawl_inprogress == _gf_true) + progress = 1; + else + progress = 0; + + ret = dict_set_int32 (output, key, progress); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "inprogress to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment the " + "counter."); + goto out; + } +out: + return ret; +} + +int _add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, struct timeval *tv, gf_boolean_t dyn) { @@ -233,6 +378,20 @@ out: } int +_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) +{ + int ret = 0; + shd_dump_t *dump_data = NULL; + shd_crawl_event_t *shd_event = NULL; + + dump_data = data; + shd_event = cb->data; + ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, + dump_data->child, shd_event, &cb->tv); + return ret; +} + +int _add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) { shd_dump_t dump_data = {0}; @@ -244,6 +403,26 @@ _add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) return 0; } + +int +_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) +{ + shd_dump_t dump_data = {0}; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + + priv = this->private; + shd = &priv->shd; + + dump_data.this = this; + dump_data.dict = dict; + dump_data.child = child; + eh_dump (shd->statistics[child], &dump_data, + _add_crawl_event_statistics_to_dict); + return 0; + +} + void _remove_stale_index (xlator_t *this, xlator_t *readdir_xl, loc_t *parent, char *fname) @@ -269,6 +448,46 @@ out: } int +_count_hard_links_under_base_indices_dir (xlator_t *this, + afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, loc_t *childloc, + loc_t *parentloc, struct iatt *iattr) +{ + xlator_t *readdir_xl = crawl_data->readdir_xl; + struct iatt parent = {0}; + int ret = 0; + dict_t *output = NULL; + int xl_id = 0; + char key[256] = {0}; + int child = -1; + uint64_t hardlinks = 0; + + output = crawl_data->op_data; + child = crawl_data->child; + + ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); + if (ret) + goto out; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) + goto out; + + snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); + ret = dict_get_uint64 (output, key, &hardlinks); + + /*Removing the count of base_entry under indices/base_indicies and + * entry under indices/xattrop */ + hardlinks = hardlinks + iattr->ia_nlink - 2; + ret = dict_set_uint64 (output, key, hardlinks); + if (ret) + goto out; + +out: + return ret; +} + +int _add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, loc_t *childloc, loc_t *parentloc, struct iatt *iattr) @@ -307,16 +526,18 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, afr_crawl_data_t *crawl_data) { - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - eh_t *eh = NULL; - char *path = NULL; - char gfid_str[64] = {0}; - shd_event_t *event = NULL; - int32_t sh_failed = 0; - gf_boolean_t split_brain = 0; - int32_t actual_sh_done = 0; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + eh_t *eh = NULL; + char *path = NULL; + char gfid_str[64] = {0}; + shd_event_t *event = NULL; + int32_t sh_failed = 0; + gf_boolean_t split_brain = 0; + int32_t actual_sh_done = 0; + shd_crawl_event_t **shd_crawl_event = NULL; + priv = this->private; shd = &priv->shd; if (crawl_data->crawl == INDEX) { @@ -343,16 +564,19 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); } - split_brain = afr_is_split_brain (this, child->inode); + shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); + split_brain = afr_is_split_brain (this, child->inode); if ((op_ret < 0 && op_errno == EIO) || split_brain) { eh = shd->split_brain; + shd_crawl_event[crawl_data->child]->split_brain_count += 1; } else if ((op_ret < 0) || sh_failed) { eh = shd->heal_failed; + shd_crawl_event[crawl_data->child]->heal_failed_count += 1; } else if (actual_sh_done == 1) { - eh = shd->healed; + eh = shd->healed; + shd_crawl_event[crawl_data->child]->healed_count += 1; } - ret = -1; if (eh != NULL) { @@ -408,10 +632,20 @@ _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *ent struct iatt parentbuf = {0}; int ret = 0; dict_t *xattr_rsp = NULL; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_int32 (xattr_req, "allow-sh-for-running-transaction", 1); gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); - ret = syncop_lookup (this, child, NULL, + ret = syncop_lookup (this, child, xattr_req, iattr, &xattr_rsp, &parentbuf); _crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp, crawl_data); @@ -420,6 +654,9 @@ _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *ent if (ret == 0) ret = _link_inode_update_loc (this, child, iattr); +out: + if (xattr_req) + dict_unref(xattr_req); return ret; } @@ -528,12 +765,20 @@ _do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, status = "Started self-heal"; _do_self_heal_on_subvol (this, i, crawl); - } else if (output) { + } else if (output && (op == INFO)) { status = ""; afr_start_crawl (this, i, INDEX, _add_summary_to_dict, output, _gf_false, 0, NULL); + } else if (output && + (op == STATISTICS_TO_BE_HEALED)) { + status = ""; + afr_start_crawl (this, i, + INDEX_TO_BE_HEALED, + _count_hard_links_under_base_indices_dir, + output, _gf_false, + 0, NULL); } } if (output) { @@ -567,8 +812,103 @@ _get_index_summary_on_local_subvols (xlator_t *this, dict_t *output) return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); } +void +afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd= &priv->shd; + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + _add_statistics_to_dict (this, dict, i); + } + + return ; +} + +static void +reset_crawl_event (shd_crawl_event_t *crawl_event) +{ + crawl_event->healed_count = 0; + crawl_event->split_brain_count = 0; + crawl_event->heal_failed_count = 0; + GF_FREE (crawl_event->start_time_str); + crawl_event->start_time_str = NULL; + crawl_event->end_time_str = NULL; + crawl_event->crawl_type = NULL; + crawl_event->crawl_inprogress = _gf_false; + return; +} + +static void +afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst) +{ + dst->healed_count = src->healed_count; + dst->split_brain_count = src->split_brain_count; + dst->heal_failed_count = src->heal_failed_count; + dst->start_time_str = gf_strdup (src->start_time_str); + dst->end_time_str = "Crawl is already in progress"; + dst->crawl_type = src->crawl_type; + dst->crawl_inprogress = _gf_true; + return; +} + +static int +afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict) +{ + shd_crawl_event_t *evnt = NULL; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd = &priv->shd; + + evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t), + gf_afr_mt_shd_crawl_event_t); + if (!evnt) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + + reset_crawl_event (evnt); + + if (!shd->crawl_events[i]) { + continue; + } + + afr_copy_crawl_event_struct (shd->crawl_events[i], + evnt); + _add_crawl_stats_to_dict (this, dict, i, evnt, NULL); + + } + } + UNLOCK (&priv->lock); + reset_crawl_event (evnt); + GF_FREE (evnt); + +out: + return ret; +} + +static int +_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + int ret = 0; + afr_fill_completed_crawl_statistics_to_dict (this, dict); + ret = afr_fill_crawl_statistics_of_running_crawl (this, dict); + return ret; +} int -_add_all_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) +_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) { afr_private_t *priv = NULL; afr_self_heald_t *shd = NULL; @@ -618,16 +958,25 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) ret = 0; break; case GF_AFR_OP_HEALED_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->healed, output); + ret = _add_local_subvols_eh_to_dict (this, shd->healed, output); break; case GF_AFR_OP_HEAL_FAILED_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->heal_failed, + ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed, output); break; case GF_AFR_OP_SPLIT_BRAIN_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->split_brain, + ret = _add_local_subvols_eh_to_dict (this, shd->split_brain, output); break; + case GF_AFR_OP_STATISTICS: + ret = _add_local_subvols_crawl_statistics_to_dict (this, output); + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, + STATISTICS_TO_BE_HEALED, + output); + break; default: gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); break; @@ -642,7 +991,7 @@ afr_poll_self_heal (void *data) { afr_private_t *priv = NULL; afr_self_heald_t *shd = NULL; - struct timeval timeout = {0}; + struct timespec timeout = {0}; xlator_t *this = NULL; long child = (long)data; gf_timer_t *old_timer = NULL; @@ -666,7 +1015,7 @@ afr_poll_self_heal (void *data) if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) _do_self_heal_on_subvol (this, child, INDEX); timeout.tv_sec = shd->timeout; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; //notify and previous timer should be synchronized. LOCK (&priv->lock); { @@ -800,6 +1149,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, afr_private_t *priv = NULL; dict_t *xattr = NULL; void *index_gfid = NULL; + void *base_indices_holder_vgfid = NULL; loc_t rootloc = {0}; struct iatt iattr = {0}; struct iatt parent = {0}; @@ -809,7 +1159,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, priv = this->private; if (crawl_data->crawl == FULL) { afr_build_root_loc (this, dirloc); - } else { + } else if (crawl_data->crawl == INDEX) { afr_build_root_loc (this, &rootloc); ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, GF_XATTROP_INDEX_GFID); @@ -843,6 +1193,47 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, ret = _link_inode_update_loc (this, dirloc, &iattr); if (ret) goto out; + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + afr_build_root_loc (this, &rootloc); + ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, + GF_BASE_INDICES_HOLDER_GFID); + if (ret < 0) + goto out; + ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, + &base_indices_holder_vgfid); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "index gfid empty " + "on %s", readdir_xl->name); + ret = -1; + goto out; + } + if (!base_indices_holder_vgfid) { + gf_log (this->name, GF_LOG_ERROR, "Base indices holder" + "virtual gfid is null on %s", readdir_xl->name); + ret = -1; + goto out; + } + uuid_copy (dirloc->gfid, base_indices_holder_vgfid); + dirloc->path = ""; + dirloc->inode = inode_new (priv->root_inode->table); + ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, + &parent); + if (ret < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, "lookup " + "failed for base_indices_holder dir" + " on %s - (%s)", readdir_xl->name, + strerror (errno)); + + } else { + gf_log (this->name, GF_LOG_ERROR, "base_indices" + "_holder is not yet created."); + } + goto out; + } + ret = _link_inode_update_loc (this, dirloc, &iattr); + if (ret) + goto out; } ret = 0; out: @@ -907,6 +1298,16 @@ afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, priv = this->private; if (crawl_data->crawl == FULL) { ret = afr_build_child_loc (this, child, parent, entry->d_name); + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + ret = _build_index_loc (this, child, entry->d_name, parent); + if (ret) + goto out; + child->inode = inode_new (priv->root_inode->table); + if (!child->inode) { + ret = -1; + goto out; + } + child->path = NULL; } else { child->inode = inode_new (priv->root_inode->table); if (!child->inode) @@ -956,10 +1357,14 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, ret = crawl_data->process_entry (this, crawl_data, entry, &entry_loc, parentloc, &iattr); - if (ret) + if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { + goto out; + } else if (ret) { continue; + } - if (crawl_data->crawl == INDEX) + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED)) continue; if (!IA_ISDIR (iattr.ia_type)) @@ -974,6 +1379,10 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, } ret = 0; out: + if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) { + gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " + "count"); + } loc_wipe (&entry_loc); return ret; } @@ -1021,6 +1430,9 @@ _crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) ret = _process_entries (this, loc, &entries, &offset, crawl_data); + if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) { + goto out; + } gf_dirent_free (&entries); free_entries = _gf_false; } @@ -1126,8 +1538,13 @@ afr_dir_crawl (void *data) goto out; ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); - if (ret) + if (ret) { + if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" + "indices_holder"); + } goto out; + } ret = _crawl_directory (fd, &dirloc, crawl_data); if (ret) @@ -1141,12 +1558,102 @@ afr_dir_crawl (void *data) out: if (fd) fd_unref (fd); - if (crawl_data->crawl == INDEX) + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED )) dirloc.path = NULL; loc_wipe (&dirloc); return ret; } +char * +get_crawl_type_in_string (afr_crawl_type_t crawl) +{ + char *index = "INDEX"; + char *full = "FULL"; + char *crawl_type = NULL; + + if (crawl == INDEX){ + crawl_type = index; + } else if (crawl == FULL) { + crawl_type = full; + } + + return crawl_type; +} + +static int +afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + shd_crawl_event_t *crawl_event = NULL; + time_t get_time = 0; + + priv = this->private; + shd = &priv->shd; + + crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, + gf_afr_mt_shd_crawl_event_t); + if (!crawl_event) { + ret = -1; + goto out; + } + + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + + crawl_event->start_time_str = gf_strdup (ctime(&get_time)); + + crawl_event->crawl_type = get_crawl_type_in_string (crawl); + if (!crawl_event->crawl_type) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + shd->crawl_events[child] = crawl_event; + } + UNLOCK (&priv->lock); + ret = 0; +out: + return ret; + +} + +static int +afr_put_crawl_event_in_eh (xlator_t *this, int child) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + time_t get_time = 0; + shd_crawl_event_t **crawl_event = NULL; + + priv = this->private; + shd = &priv->shd; + + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + crawl_event = (shd_crawl_event_t**)shd->crawl_events; + LOCK (&priv->lock); + { + crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); + ret = eh_save_history (shd->statistics[child], + crawl_event[child]); + crawl_event[child] = NULL; + } + UNLOCK (&priv->lock); +out: + return ret; +} + static int afr_dir_exclusive_crawl (void *data) { @@ -1182,7 +1689,15 @@ afr_dir_exclusive_crawl (void *data) } do { + ret = afr_allocate_crawl_event (this, child, crawl_data->crawl); + if (ret) + goto out; afr_dir_crawl (data); + + ret = afr_put_crawl_event_in_eh (this, child); + if (ret < 0) + goto out; + LOCK (&priv->lock); { if (shd->pending[child] != NONE) { @@ -1270,4 +1785,3 @@ afr_set_root_gfid (dict_t *dict) return ret; } - diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index 32a8aaca5..e0c083754 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -29,6 +29,19 @@ typedef struct afr_crawl_data_ { struct iatt *iattr); } afr_crawl_data_t; +typedef struct crawl_event_stats_ { + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; + char *start_time_str; + char *end_time_str; + char *crawl_type; + gf_boolean_t crawl_inprogress; +} shd_crawl_event_t; + +void _destroy_crawl_event_data (void *data); +void _destroy_shd_event_data (void *data); + typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, loc_t *child, loc_t *parent, struct iatt *iattr); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 217ff8548..20306e469 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -79,7 +79,7 @@ afr_save_lk_owner (call_frame_t *frame) local = frame->local; - local->saved_lk_owner = frame->root->lk_owner; + local->saved_lk_owner = frame->root->lk_owner; } @@ -90,10 +90,9 @@ afr_restore_lk_owner (call_frame_t *frame) local = frame->local; - frame->root->lk_owner = local->saved_lk_owner; + frame->root->lk_owner = local->saved_lk_owner; } - static void __mark_all_pending (int32_t *pending[], int child_count, afr_transaction_type type) @@ -369,7 +368,7 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) { if (local->transaction.resume_stub) { - AFR_CALL_RESUME (local->transaction.resume_stub); + call_resume (local->transaction.resume_stub); local->transaction.resume_stub = NULL; } @@ -443,14 +442,30 @@ out: return; } +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +{ + afr_inodelk_t *inodelk = NULL; + int i = 0; + + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; +} + unsigned char* afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) { unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; switch (type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - locked_nodes = int_lock->inode_locked_nodes; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; break; case AFR_ENTRY_TRANSACTION: @@ -553,24 +568,77 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, gf_boolean_t afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int index = -1; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int index = -1; + int i = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->pending[i][index] == 0) - return _gf_false; + return _gf_false; } - return _gf_true; + return _gf_true; } +static void +afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) +{ + xlator_t *this = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + this = frame->this; + local = frame->local; + priv = this->private; + + if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && + (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) + return; + + if (local->op_ret >= 0) + goto out; + + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +out: + return; +} + +static void +afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + gf_boolean_t all_quota_failures = _gf_false; + + local = frame->local; + priv = this->private; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + /* + * Idea is to not leave the file in FOOL-FOOL scenario in case on + * all the bricks data transaction failed with EDQUOT to avoid + * increasing un-necessary load of self-heals in the system. + */ + all_quota_failures = _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + (local->child_errno[i] != EDQUOT)) { + all_quota_failures = _gf_false; + break; + } + } + if (all_quota_failures) + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +} int afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) @@ -593,6 +661,9 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) local->transaction.pre_op, local->transaction.type); + afr_data_handle_quota_errors (frame, this); + afr_dir_fop_handle_all_fop_failures (frame); + if (local->fd) afr_transaction_rm_stale_children (frame, this, local->fd->inode, @@ -619,7 +690,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) goto out; } - nothing_failed = afr_txn_nothing_failed (frame, this); + nothing_failed = afr_txn_nothing_failed (frame, this); afr_compute_txn_changelog (local , priv); @@ -645,14 +716,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) break; } - /* local->transaction.postop_piggybacked[] was - precomputed in is_piggyback_postop() when called from - afr_changelog_post_op_safe() - */ + /* local->transaction.postop_piggybacked[] was + precomputed in is_piggyback_postop() when called from + afr_changelog_post_op_safe() + */ - piggyback = 0; - if (local->transaction.postop_piggybacked[i]) - piggyback = 1; + piggyback = 0; + if (local->transaction.postop_piggybacked[i]) + piggyback = 1; afr_set_postop_dict (local, this, xattr[i], piggyback, i); @@ -906,7 +977,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) } UNLOCK (&local->fd->lock); - afr_set_delayed_post_op (frame, this); + afr_set_delayed_post_op (frame, this); if (piggyback) afr_changelog_pre_op_cbk (frame, (void *)(long)i, @@ -1179,12 +1250,14 @@ int afr_set_transaction_flock (afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->lk_flock.l_len = local->transaction.len; - int_lock->lk_flock.l_start = local->transaction.start; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; return 0; } @@ -1199,6 +1272,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; switch (local->transaction.type) { case AFR_DATA_TRANSACTION: @@ -1259,69 +1333,115 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) void afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - /* call this function from any of the related optimizations - which benefit from delaying post op are enabled, namely: + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: - - changelog piggybacking - - eager locking - */ + - changelog piggybacking + - eager locking + */ - priv = this->private; - if (!priv) - return; + priv = this->private; + if (!priv) + return; - if (!priv->post_op_delay_secs) - return; + if (!priv->post_op_delay_secs) + return; local = frame->local; if (!local->transaction.eager_lock_on) return; - if (!local) - return; + if (!local) + return; - if (!local->fd) - return; + if (!local->fd) + return; - if (local->op == GF_FOP_WRITE) - local->delayed_post_op = _gf_true; + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; } +gf_boolean_t +afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ictx = NULL; + + if (!inode) { + /* If false is returned, it may keep on taking eager-lock + * which may lead to starvation, so return true to avoid that. + */ + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); + return _gf_true; + } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 + */ + + ictx = afr_inode_ctx_get (inode, this); + if (!ictx) + return _gf_true; + + if (ictx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +} + +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) +{ + if (local->success_count != priv->child_count) + return _gf_true; + return _gf_false; +} gf_boolean_t is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - gf_boolean_t res = _gf_false; + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; + afr_private_t *priv = NULL; - local = frame->local; - if (!local) - goto out; + priv = this->private; - if (!local->delayed_post_op) - goto out; + local = frame->local; + if (!local) + goto out; - res = _gf_true; + if (!local->delayed_post_op) + goto out; + + //Mark pending changelog ASAP + if (afr_any_fops_failed (local, priv)) + goto out; + + if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) + goto out; + + res = _gf_true; out: - return res; + return res; } void afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub); + call_stub_t *stub); void afr_delayed_changelog_wake_up_cbk (void *data) { - fd_t *fd = NULL; + fd_t *fd = NULL; - fd = data; + fd = data; - afr_delayed_changelog_wake_up (THIS, fd); + afr_delayed_changelog_wake_up (THIS, fd); } @@ -1332,39 +1452,39 @@ afr_delayed_changelog_wake_up_cbk (void *data) static gf_boolean_t is_piggyback_post_op (call_frame_t *frame, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - gf_boolean_t piggyback = _gf_true; - afr_private_t *priv = NULL; - int i = 0; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *local = NULL; + gf_boolean_t piggyback = _gf_true; + afr_private_t *priv = NULL; + int i = 0; - priv = frame->this->private; - local = frame->local; - fdctx = afr_fd_ctx_get (fd, frame->this); + priv = frame->this->private; + local = frame->local; + fdctx = afr_fd_ctx_get (fd, frame->this); - LOCK(&fd->lock); - { - piggyback = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - local->transaction.postop_piggybacked[i] = 1; - } else { - /* For at least _one_ subvolume we cannot - piggyback on the changelog, and have to - perform a hard POST-OP and therefore fsync - if necesssary - */ - piggyback = _gf_false; + LOCK(&fd->lock); + { + piggyback = _gf_true; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + if (fdctx->pre_op_piggyback[i]) { + fdctx->pre_op_piggyback[i]--; + local->transaction.postop_piggybacked[i] = 1; + } else { + /* For at least _one_ subvolume we cannot + piggyback on the changelog, and have to + perform a hard POST-OP and therefore fsync + if necesssary + */ + piggyback = _gf_false; GF_ASSERT (fdctx->pre_op_done[i]); fdctx->pre_op_done[i]--; - } - } - } - UNLOCK(&fd->lock); + } + } + } + UNLOCK(&fd->lock); if (!afr_txn_nothing_failed (frame, frame->this)) { /* something failed in this transaction, @@ -1381,186 +1501,205 @@ is_piggyback_post_op (call_frame_t *frame, fd_t *fd) int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; + afr_fd_ctx_t *fdctx = NULL; - fdctx = afr_fd_ctx_get (fd, this); + fdctx = afr_fd_ctx_get (fd, this); - LOCK(&fd->lock); - { - fdctx->witnessed_unstable_write = _gf_true; - } - UNLOCK(&fd->lock); + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); - return 0; + return 0; } /* TEST and CLEAR operation */ gf_boolean_t afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; - gf_boolean_t witness = _gf_false; + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; fdctx = afr_fd_ctx_get (fd, this); + if (!fdctx) + return _gf_true; - LOCK(&fd->lock); - { - if (fdctx->witnessed_unstable_write) { - witness = _gf_true; - fdctx->witnessed_unstable_write = _gf_false; - } - } - UNLOCK (&fd->lock); + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); - return witness; + return witness; } int afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; int child_index = (long) cookie; - int call_count = -1; - afr_local_t *local = NULL; + int call_count = -1; + afr_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - if (afr_fop_failed (op_ret, op_errno)) { - /* Failure of fsync() is as good as failure of previous - write(). So treat it like one. - */ - gf_log (this->name, GF_LOG_WARNING, - "fsync(%s) failed on subvolume %s. Transaction was %s", - uuid_utoa (local->fd->inode->gfid), - priv->children[child_index]->name, - gf_fop_list[local->op]); - - afr_transaction_fop_failed (frame, this, child_index); - } + if (afr_fop_failed (op_ret, op_errno)) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } call_count = afr_frame_return (frame); if (call_count == 0) - afr_changelog_post_op_now (frame, this); + afr_changelog_post_op_now (frame, this); - return 0; + return 0; } int afr_changelog_fsync (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; call_count = afr_pre_op_done_children_count (local->transaction.pre_op, priv->child_count); - if (!call_count) { - /* will go straight to unlock */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } - local->call_count = call_count; + local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); - STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, local->fd, - 1, NULL); - if (!--call_count) - break; - } + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); + if (!--call_count) + break; + } - return 0; + if (xdata) + dict_unref (xdata); + + return 0; } -int + int afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; + priv = this->private; - if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { - afr_changelog_post_op_now (frame, this); - return 0; - } - - if (is_piggyback_post_op (frame, local->fd)) { - /* just detected that this post-op is about to - be optimized away as a new write() has - already piggybacked on this frame's changelog. - */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } - /* Calling afr_changelog_post_op_now() now will result in - issuing ->[f]xattrop(). - - Performing a hard POST-OP (->[f]xattrop() FOP) is a more - responsible operation that what it might appear on the surface. - - The changelog of a file (in the xattr of the file on the server) - stores information (pending count) about the state of the file - on the OTHER server. This changelog is blindly trusted, and must - therefore be updated in such a way it remains trustworthy. This - implies that decrementing the pending count (essentially "clearing - the dirty flag") must be done STRICTLY after we are sure that the - operation on the other server has reached stable storage. - - While the backend filesystem on that server will eventually flush - it to stable storage, we (being in userspace) have no mechanism - to get notified when the write became "stable". - - This means we need take matter into our own hands and issue an - fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, - and get an acknowledgement for it. And we need to wait for the - fsync() acknowledgement before initiating the hard POST-OP. - - However if the FD itself was opened in O_SYNC or O_DSYNC then - we are already guaranteed that the writes were made stable as - part of the FOP itself. The same holds true for NFS stable - writes which happen on an anonymous FD with O_DSYNC or O_SYNC - flag set in the writev() @flags param. For all other write types, - mark a flag in the fdctx whenever an unstable write is witnessed. - */ - - if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { - afr_changelog_post_op_now (frame, this); - return 0; - } + if (is_piggyback_post_op (frame, local->fd)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } - /* Time to fsync() */ + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } - afr_changelog_fsync (frame, this); + /* Check whether users want durability and perform fsync/post-op + * accordingly. + */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } - return 0; + return 0; } void afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub) + call_stub_t *stub) { afr_fd_ctx_t *fd_ctx = NULL; call_frame_t *prev_frame = NULL; - struct timeval delta = {0, }; + struct timespec delta = {0, }; afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1568,10 +1707,10 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, fd_ctx = afr_fd_ctx_get (fd, this); if (!fd_ctx) - return; + goto out; delta.tv_sec = priv->post_op_delay_secs; - delta.tv_usec = 0; + delta.tv_nsec = 0; pthread_mutex_lock (&fd_ctx->delay_lock); { @@ -1590,6 +1729,7 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, unlock: pthread_mutex_unlock (&fd_ctx->delay_lock); +out: if (prev_frame) { local = prev_frame->local; local->transaction.resume_stub = stub; @@ -1603,14 +1743,14 @@ unlock: void afr_changelog_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (is_afr_delayed_changelog_post_op_needed (frame, this)) - afr_delayed_changelog_post_op (this, frame, local->fd, NULL); - else - afr_changelog_post_op_safe (frame, this); + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); } @@ -1621,22 +1761,22 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) The @stub gets saved in @local and gets resumed in afr_local_cleanup() -*/ + */ void afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) { - afr_delayed_changelog_post_op (this, NULL, fd, stub); + afr_delayed_changelog_post_op (this, NULL, fd, stub); } void afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) { - afr_delayed_changelog_post_op (this, NULL, fd, NULL); + afr_delayed_changelog_post_op (this, NULL, fd, NULL); } -int + int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1647,18 +1787,18 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - if (local->transaction.eager_lock_on) { - /* We don't need to retain "local" in the - fd list anymore, writes to all subvols - are finished by now */ - LOCK (&local->fd->lock); - { - list_del_init (&local->transaction.eager_locked); - } - UNLOCK (&local->fd->lock); - } + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); + } - afr_restore_lk_owner (frame); + afr_restore_lk_owner (frame); if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); @@ -1680,7 +1820,8 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1689,55 +1830,56 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index priv = this->private; __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + child_index, local->transaction.type); } -static gf_boolean_t + static gf_boolean_t afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) { - uint64_t start1 = local1->transaction.start; - uint64_t start2 = local2->transaction.start; - uint64_t end1 = 0; - uint64_t end2 = 0; - - if (local1->transaction.len) - end1 = start1 + local1->transaction.len - 1; - else - end1 = ULLONG_MAX; - - if (local2->transaction.len) - end2 = start2 + local2->transaction.len - 1; - else - end2 = ULLONG_MAX; - - return ((end1 >= start2) && (end2 >= start1)); -} + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} void afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) { - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *each = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; - priv = this->private; + priv = this->private; - if (!local->fd) - return; + if (!local->fd) + return; - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; - if (!priv->eager_lock) - return; + if (!priv->eager_lock) + return; - fdctx = afr_fd_ctx_get (local->fd, this); - if (!fdctx) - return; + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; + if (afr_are_multiple_fds_opened (local->fd->inode, this)) + return; /* * Once full file lock is acquired in eager-lock phase, overlapping * writes do not compete for inode-locks, instead are transferred to the @@ -1755,22 +1897,22 @@ afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) * This check makes sure the locks are not transferred for * overlapping writes. */ - LOCK (&local->fd->lock); - { - list_for_each_entry (each, &fdctx->eager_locked, - transaction.eager_locked) { - if (afr_locals_overlap (each, local)) { - local->transaction.eager_lock_on = _gf_false; - goto unlock; - } - } - - local->transaction.eager_lock_on = _gf_true; - list_add_tail (&local->transaction.eager_locked, - &fdctx->eager_locked); - } + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); + } unlock: - UNLOCK (&local->fd->lock); + UNLOCK (&local->fd->lock); } @@ -1789,11 +1931,10 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) local->transaction.type = type; ret = afr_transaction_local_init (local, this); - if (ret < 0) { + if (ret < 0) goto out; - } - afr_transaction_eager_lock_init (local, this); + afr_transaction_eager_lock_init (local, this); if (local->fd && local->transaction.eager_lock_on) afr_set_lk_owner (frame, this, local->fd); @@ -1802,6 +1943,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) if (!local->transaction.eager_lock_on && local->loc.inode) { fd = fd_lookup (local->loc.inode, frame->root->pid); + if (fd == NULL) + fd = fd_lookup_anonymous (local->loc.inode); + if (fd) { afr_delayed_changelog_wake_up (this, fd); fd_unref (fd); diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 55e8bbcca..fa626fd0d 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -23,6 +23,9 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); @@ -40,4 +43,9 @@ afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); void __mark_all_success (int32_t *pending[], int child_count, afr_transaction_type type); +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv); + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index bee10fd01..c724eb2ae 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -191,6 +191,8 @@ reconfigure (xlator_t *this, dict_t *options) /* Reset this so we re-discover in case the topology changed. */ GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, bool, out); + GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, + bool, out); priv->did_discovery = _gf_false; ret = 0; @@ -335,6 +337,8 @@ init (xlator_t *this) GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); + GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, + out); priv->wait_count = 1; @@ -376,8 +380,6 @@ init (xlator_t *this) AFR_XATTR_PREFIX, trav->xlator->name); if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); ret = -ENOMEM; goto out; } @@ -386,6 +388,13 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, + this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), gf_afr_mt_int32_t); if (!priv->last_event) { @@ -430,15 +439,18 @@ init (xlator_t *this) if (!priv->shd.timer) goto out; - priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false); + priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.healed) goto out; - priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false); + priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.heal_failed) goto out; - priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false); + priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.split_brain) goto out; @@ -448,7 +460,9 @@ init (xlator_t *this) priv->root_inode = inode_ref (this->itable->root); GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); - + ret = afr_initialise_statistics (this); + if (ret) + goto out; ret = 0; out: return ret; @@ -483,6 +497,9 @@ struct xlator_fops fops = { .finodelk = afr_finodelk, .entrylk = afr_entrylk, .fentrylk = afr_fentrylk, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, /* inode read */ .access = afr_access, @@ -765,5 +782,12 @@ struct volume_options options[] = { .description = "readdir(p) will not failover if this option is off", .default_value = "on", }, + { .key = {"ensure-durability"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 387ed12ec..21064db58 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -28,8 +28,10 @@ #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" #define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 struct _pump_private; @@ -86,26 +88,30 @@ typedef struct afr_inode_ctx_ { int32_t *fresh_children;//increasing order of latency afr_spb_state_t mdata_spb; afr_spb_state_t data_spb; + uint32_t open_fd_count; } afr_inode_ctx_t; typedef enum { NONE, INDEX, + INDEX_TO_BE_HEALED, FULL, } afr_crawl_type_t; typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t iamshd; - afr_crawl_type_t *pending; - gf_boolean_t *inprogress; - afr_child_pos_t *pos; - gf_timer_t **timer; - eh_t *healed; - eh_t *heal_failed; - eh_t *split_brain; - char *node_uuid; - int timeout; + gf_boolean_t enabled; + gf_boolean_t iamshd; + afr_crawl_type_t *pending; + gf_boolean_t *inprogress; + afr_child_pos_t *pos; + gf_timer_t **timer; + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; + void **crawl_events; + char *node_uuid; + int timeout; } afr_self_heald_t; typedef struct _afr_private { @@ -170,9 +176,38 @@ typedef struct _afr_private { gf_boolean_t did_discovery; gf_boolean_t readdir_failover; uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; } afr_private_t; +typedef enum { + AFR_SELF_HEAL_NOT_ATTEMPTED, + AFR_SELF_HEAL_STARTED, + AFR_SELF_HEAL_FAILED, + AFR_SELF_HEAL_SYNC_BEGIN, +} afr_self_heal_status; + typedef struct { + afr_self_heal_status gfid_or_missing_entry_self_heal; + afr_self_heal_status metadata_self_heal; + afr_self_heal_status data_self_heal; + afr_self_heal_status entry_self_heal; +} afr_sh_status_for_all_type; + +typedef enum { + AFR_SELF_HEAL_ENTRY, + AFR_SELF_HEAL_METADATA, + AFR_SELF_HEAL_DATA, + AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, + AFR_SELF_HEAL_INVALID = -1, +} afr_self_heal_type; + +typedef enum { + AFR_CHECK_ALL, + AFR_CHECK_SPECIFIC, +} afr_sh_fail_check_type; + +struct afr_self_heal_ { /* External interface: These are variables (some optional) that are set by whoever has triggered self-heal */ @@ -249,10 +284,10 @@ typedef struct { const char *linkname; gf_boolean_t entries_skipped; - int op_failed; gf_boolean_t actual_sh_started; gf_boolean_t sync_done; gf_boolean_t data_lock_held; + gf_boolean_t sh_dom_lock_held; gf_boolean_t eof_reached; fd_t *healing_fd; int file_has_holes; @@ -263,13 +298,17 @@ typedef struct { uint8_t *checksum; afr_post_remove_call_t post_remove_call; - loc_t parent_loc; + char *data_sh_info; + char *metadata_sh_info; + loc_t parent_loc; call_frame_t *orig_frame; call_frame_t *old_loop_frame; gf_boolean_t unwound; afr_sh_algo_private_t *private; + afr_sh_status_for_all_type afr_all_sh_status; + afr_self_heal_type sh_type_in_action; struct afr_sh_algorithm *algo; afr_lock_cbk_t data_lock_success_handler; @@ -282,7 +321,9 @@ typedef struct { void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); call_frame_t *sh_frame; -} afr_self_heal_t; +}; + +typedef struct afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... */ @@ -356,12 +397,19 @@ int afr_entry_lockee_cmp (const void *l1, const void *l2); typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; + +typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; int lockee_count; afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -370,13 +418,11 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; @@ -387,6 +433,7 @@ typedef struct { int32_t lock_op_ret; int32_t lock_op_errno; afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; typedef struct _afr_locked_fd { @@ -406,9 +453,11 @@ typedef struct _afr_local { unsigned int call_count; unsigned int success_count; unsigned int enoent_count; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - unsigned int govinda_gOvinda; + unsigned int unhealable; unsigned int read_child_index; unsigned char read_child_returned; @@ -448,15 +497,23 @@ typedef struct _afr_local { int optimistic_change_log; gf_boolean_t delayed_post_op; + /* Is the current writev() going to perform a stable write? i.e, is fd->flags or @flags writev param have O_SYNC or O_DSYNC? */ - gf_boolean_t stable_write; + gf_boolean_t stable_write; - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + + int allow_sh_for_running_transaction; + + + /* This struct contains the arguments for the "continuation" + (scheme-like) of fops */ int op; @@ -552,7 +609,9 @@ typedef struct _afr_local { struct { struct iatt prebuf; struct iatt postbuf; + } inode_wfop; //common structure for all inode-write-fops + struct { int32_t op_ret; struct iovec *vector; @@ -563,34 +622,21 @@ typedef struct _afr_local { } writev; struct { - struct iatt prebuf; - struct iatt postbuf; - } fsync; - - struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } truncate; struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } ftruncate; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } setattr; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } fsetattr; struct { @@ -652,6 +698,26 @@ typedef struct _afr_local { dict_t *params; char *linkpath; } symlink; + + struct { + int32_t mode; + off_t offset; + size_t len; + } fallocate; + + struct { + off_t offset; + size_t len; + } discard; + + struct { + off_t offset; + size_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; + + } cont; struct { @@ -824,8 +890,8 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count); int pump_start (call_frame_t *frame, xlator_t *this); @@ -927,22 +993,6 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); } \ } while (0); -#define AFR_CALL_RESUME(stub) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - \ - __local = stub->frame->local; \ - __this = stub->frame->this; \ - stub->frame->local = NULL; \ - \ - call_resume (stub); \ - if (__local) { \ - afr_local_cleanup (__local, __this); \ - mem_put (__local); \ - } \ - } while (0) - #define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ /* allocate and return a string that is the basename of argument */ static inline char * @@ -1120,6 +1170,27 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); } while (0); +#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." + +#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ + if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ + op_errno = EIO; \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ + goto label; \ + } \ +} while (0) + +#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ + if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ + op_errno = EIO; \ + loc_path (loc, NULL); \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG , loc->path); \ + goto label; \ + } \ +} while (0) + int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); @@ -1129,4 +1200,13 @@ afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); void afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this); + #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index db83410c7..a7f72fb30 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -2501,6 +2501,12 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + priv->first_lookup = 1; priv->root_inode = NULL; @@ -2532,7 +2538,7 @@ init (xlator_t *this) goto out; } - pump_priv->env = syncenv_new (0); + pump_priv->env = this->ctx->env; if (!pump_priv->env) { gf_log (this->name, GF_LOG_ERROR, "Could not create new sync-environment"); @@ -2573,9 +2579,6 @@ fini (xlator_t *this) if (!pump_priv) goto afr_priv; - if (pump_priv->env) - syncenv_destroy (pump_priv->env); - GF_FREE (pump_priv->resume_path); LOCK_DESTROY (&pump_priv->resume_path_lock); LOCK_DESTROY (&pump_priv->pump_state_lock); diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 2dbcd756d..8f61339e6 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -22,6 +22,7 @@ #include "dht-common.h" #include "defaults.h" #include "byte-order.h" +#include "glusterfs-acl.h" #include <sys/time.h> #include <libgen.h> @@ -62,6 +63,11 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data) } *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + + } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime (THIS, dst, key, value); + if (ret < 0) + return ret; } else { /* compare user xattrs only */ if (!strncmp (key, "user.", strlen ("user."))) { @@ -149,7 +155,6 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) int ret = -1; dht_layout_t *layout = NULL; dht_conf_t *conf = NULL; - uint32_t missing = 0; local = discover_frame->local; layout = local->layout; @@ -186,26 +191,20 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) goto out; } } else { - ret = dht_layout_normalize (this, &local->loc, layout, - &missing); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "normalizing failed on %s (internal error)", - local->loc.path); - op_errno = EIO; - goto out; - } - if (missing == conf->subvolume_cnt) { - gf_log (this->name, GF_LOG_DEBUG, - "normalizing failed on %s, ENOENT errors: %u)", - local->loc.path, missing); - op_errno = ENOENT; - goto out; - } - if (ret != 0) { + ret = dht_layout_normalize (this, &local->loc, layout); + if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) { + /* either the layout is incorrect or the directory is + * not found even in one subvolume. + */ gf_log (this->name, GF_LOG_DEBUG, "normalizing failed on %s " - "(overlaps/holes present)", local->loc.path); + "(overlaps/holes present: %s, " + "ENOENT errors: %d)", local->loc.path, + (ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0); + if ((ret > 0) && (ret == conf->subvolume_cnt)) { + op_errno = ESTALE; + goto out; + } } if (local->inode) @@ -409,7 +408,6 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_layout_t *layout = NULL; int ret = -1; int is_dir = 0; - uint32_t missing = 0; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -443,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_ret, op_errno, xattr); if (op_ret == -1) { - local->op_errno = ENOENT; + local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned error (%s)", local->loc.path, prev->this->name, @@ -490,16 +488,9 @@ unlock: } if (local->op_ret == 0) { - ret = dht_layout_normalize (this, &local->loc, layout, - &missing); - - /* - * Arguably, we shouldn't do self-heal just because - * bricks are missing as long as there are no other - * anomalies. For now, though, just preserve the - * existing behavior. - */ - if ((ret != 0) || (missing != 0)) { + ret = dht_layout_normalize (this, &local->loc, layout); + + if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "fixing assignment on %s", local->loc.path); @@ -1424,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; if (!conf) @@ -1645,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { - if (op_ret == -1) { + if ((op_ret == -1) && !((op_errno == ENOENT) || + (op_errno == ENOTCONN))) { local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", @@ -1658,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, unlock: UNLOCK (&frame->lock); - if (op_ret == -1) + if (local->op_ret == -1) goto err; cached_subvol = dht_subvol_get_cached (this, local->loc.inode); @@ -1797,6 +1788,7 @@ dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this, } (void) strcat (local->xattr_val, value); + (void) strcat (local->xattr_val, " "); local->op_ret = 0; } @@ -1821,6 +1813,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, if (!*dict) goto out; + local->xattr_val[strlen (local->xattr_val) - 1] = '\0'; + /* we would need max this many bytes to create xattr string * extra 40 bytes is just an estimated amount of additional * space required as we include translator name and some @@ -2053,10 +2047,8 @@ dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, { int this_call_cnt = 0; dht_local_t *local = NULL; - dht_conf_t *conf = NULL; - conf = this->private; local = frame->local; if (op_ret != -1) { @@ -2083,7 +2075,6 @@ int dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, dict_t *xdata) { - dht_conf_t *conf = NULL; dht_local_t *local = NULL; int i = 0; dht_layout_t *layout = NULL; @@ -2091,7 +2082,6 @@ dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, xlator_t *subvol = NULL; - conf = this->private; local = frame->local; layout = local->layout; @@ -2132,7 +2122,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -2177,8 +2166,9 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, * NOTE: Don't trust inode here, as that may not be valid * (until inode_link() happens) */ - if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) - && DHT_IS_DIR(layout)) { + if (key && DHT_IS_DIR(layout) && + ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) + || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) { (void) strncpy (local->xsel, key, 256); cnt = local->call_cnt = layout->cnt; for (i = 0; i < cnt; i++) { @@ -2249,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, key, local, dht_getxattr_unwind, sub_volumes, cnt, - MARKER_UUID_TYPE, conf->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + conf->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -2273,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, local, dht_getxattr_unwind, sub_volumes, cnt, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, conf->vol_uuid)) { op_errno = EINVAL; goto err; @@ -2488,7 +2480,6 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; @@ -2702,7 +2693,6 @@ dht_removexattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR); if (!local) { @@ -2934,7 +2924,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -3055,7 +3044,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, list_for_each_entry (orig_entry, (&orig_entries->list), list) { next_offset = orig_entry->d_off; if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) && - (prev->this != dht_first_up_subvol (this))) { + (prev->this != local->first_up_subvol)) { continue; } if (check_is_linkfile (NULL, (&orig_entry->d_stat), @@ -3137,13 +3126,16 @@ done: } if (conf->readdir_optimize == _gf_true) { - if (next_subvol != dht_first_up_subvol (this)) { + if (next_subvol != local->first_up_subvol) { ret = dict_set_int32 (local->xattr, GF_READDIR_SKIP_DIRS, 1); if (ret) gf_log (this->name, GF_LOG_ERROR, "dict set failed"); - } + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } } STACK_WIND (frame, dht_readdirp_cbk, @@ -3289,6 +3281,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, local->fd = fd_ref (fd); local->size = size; local->xattr_req = (dict)? dict_ref (dict) : NULL; + local->first_up_subvol = dht_first_up_subvol (this); dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); @@ -3307,13 +3300,16 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, "failed to set '%s' key", conf->link_xattr_name); if (conf->readdir_optimize == _gf_true) { - if (xvol != dht_first_up_subvol (this)) { + if (xvol != local->first_up_subvol) { ret = dict_set_int32 (local->xattr, GF_READDIR_SKIP_DIRS, 1); if (ret) gf_log (this->name, GF_LOG_ERROR, "Dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); } } } @@ -3571,7 +3567,9 @@ dht_mknod (call_frame_t *frame, xlator_t *this, subvol, subvol->fops->mknod, loc, mode, rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol); + + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); if (avail_subvol != subvol) { /* Choose the minimum filled volume, and create the files there */ @@ -3992,7 +3990,7 @@ dht_create (call_frame_t *frame, xlator_t *this, } /* Choose the minimum filled volume, and create the files there */ - avail_subvol = dht_free_disk_available_subvol (this, subvol); + avail_subvol = dht_free_disk_available_subvol (this, subvol, local); if (avail_subvol != subvol) { local->params = dict_ref (params); local->flags = flags; @@ -4868,7 +4866,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK); if (!local) { @@ -5176,8 +5173,8 @@ unlock: * not need to handle CHILD_DOWN event here. */ if (conf->defrag) { - ret = pthread_create (&conf->defrag->th, NULL, - gf_defrag_start, this); + ret = gf_thread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); if (ret) { conf->defrag = NULL; GF_FREE (conf->defrag); diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 9de861360..5ccd66799 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -183,6 +183,7 @@ struct dht_local { xlator_t *link_subvol; struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; }; typedef struct dht_local dht_local_t; @@ -211,6 +212,10 @@ enum gf_defrag_status_t { GF_DEFRAG_STATUS_STOPPED, GF_DEFRAG_STATUS_COMPLETE, GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, }; typedef enum gf_defrag_status_t gf_defrag_status_t; @@ -227,6 +232,7 @@ struct gf_defrag_info_ { uint64_t total_data; uint64_t num_files_lookedup; uint64_t total_failures; + uint64_t skipped; gf_lock_t lock; int cmd; pthread_t th; @@ -394,26 +400,18 @@ typedef enum { } while (0) #define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) - -dht_layout_t *dht_layout_new (xlator_t *this, int cnt); -dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); -dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); -xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, - const char *name); -int dht_layout_normalize (xlator_t *this, loc_t *loc, - dht_layout_t *layout, - uint32_t *missing_p); -int dht_layout_anomalies (xlator_t *this, loc_t *loc, - dht_layout_t *layout, - uint32_t *holes_p, - uint32_t *overlaps_p, - uint32_t *missing_p, - uint32_t *down_p, - uint32_t *misc_p, - uint32_t *no_space_p); -int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, - xlator_t *subvol, loc_t *loc, - dict_t *xattr); +dht_layout_t *dht_layout_new (xlator_t *this, int cnt); +dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode); +dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout, + const char *name); +int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout); +int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, + uint32_t *misc_p, uint32_t *no_space_p); +int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, + xlator_t *subvol, loc_t *loc, dict_t *xattr); xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *buf, dict_t *xattr); @@ -445,6 +443,7 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); +xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev); int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p); @@ -469,7 +468,8 @@ dht_layout_sort_volname (dht_layout_t *layout); int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *layout); int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); @@ -691,6 +691,12 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata); int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata); +int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata); +int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); +int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); int32_t dht_init (xlator_t *this); void dht_fini (xlator_t *this); @@ -760,9 +766,11 @@ dht_dir_has_layout (dict_t *xattr, char *name); gf_boolean_t dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); xlator_t * -dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol); +dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); xlator_t * -dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol); +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); int dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this); @@ -773,4 +781,7 @@ dht_priv_dump (xlator_t *this); int32_t dht_inodectx_dump (xlator_t *this, inode_t *inode); +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); + #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 0c87f4a64..fe3955ecb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -251,25 +251,45 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) /*Get the best subvolume to create the file in*/ xlator_t * -dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) +dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *local) { xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; conf = this->private; + if (!local) + goto out; + loc = &local->loc; + if (!local->layout) { + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); + goto out; + } + } else { + layout = dht_layout_ref (this, local->layout); + } - LOCK (&conf->subvolume_lock); + LOCK (&conf->subvolume_lock); { - avail_subvol = dht_subvol_with_free_space_inodes(this, subvol); + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + layout); if(!avail_subvol) { avail_subvol = dht_subvol_maxspace_nonzeroinode(this, - subvol); + subvol, + layout); } } UNLOCK (&conf->subvolume_lock); - +out: if (!avail_subvol) { gf_log (this->name, GF_LOG_DEBUG, @@ -278,17 +298,42 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) avail_subvol = subvol; } - + if (layout) + dht_layout_unref (this, layout); return avail_subvol; } +static inline +int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +{ + int ret = -1; + int i = 0; + + if (!this || !layout) + goto out; + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; + } + } + ret = 0; +out: + return ret; +} + /*Get subvolume which has both space and inodes more than the min criteria*/ xlator_t * -dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol) +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { int i = 0; double max = 0; double max_inodes = 0; + int ignore_subvol = 0; xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; @@ -296,6 +341,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol) conf = this->private; for(i=0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + if ((conf->disk_unit == 'p') && (conf->du_stats[i].avail_percent > conf->min_free_disk) && (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { @@ -325,10 +376,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol) /* Get subvol which has atleast one inode and maximum space */ xlator_t * -dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol) +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) { int i = 0; double max = 0; + int ignore_subvol = 0; xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; @@ -336,6 +389,12 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol) conf = this->private; for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + if (conf->disk_unit == 'p') { if ((conf->du_stats[i].avail_percent > max) && (conf->du_stats[i].avail_inodes > 0 )) { diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 23fc1ff96..311a48112 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -18,6 +18,28 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) @@ -340,20 +362,6 @@ out: return local; } - -char * -basestr (const char *str) -{ - char *basestr = NULL; - - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; - - return basestr; -} - - xlator_t * dht_first_up_subvol (xlator_t *this) { @@ -505,7 +513,36 @@ out: return next; } +/* This func wraps around, if prev is actually the last subvol. + */ +xlator_t * +dht_subvol_next_available (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } + +out: + return next; +} int dht_subvol_cnt (xlator_t *this, xlator_t *subvol) { @@ -698,6 +735,10 @@ dht_migration_complete_check_task (void *data) loc_t tmp_loc = {0,}; char *path = NULL; dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; this = THIS; frame = data; @@ -709,6 +750,8 @@ dht_migration_complete_check_task (void *data) if (!local->loc.inode && !local->fd) goto out; + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr * as root:root. If a fd is already open, access check wont be done*/ @@ -771,10 +814,7 @@ dht_migration_complete_check_task (void *data) /* update inode ctx (the layout) */ dht_layout_unref (this, local->layout); - if (!local->loc.inode) - ret = dht_layout_preset (this, dst_node, local->fd->inode); - else - ret = dht_layout_preset (this, dst_node, local->loc.inode); + ret = dht_layout_preset (this, dst_node, inode); if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "%s: could not set preset layout for subvol %s", @@ -792,10 +832,7 @@ dht_migration_complete_check_task (void *data) goto out; } - if (!local->loc.inode) - ret = dht_layout_set (this, local->fd->inode, layout); - else - ret = dht_layout_set (this, local->loc.inode, layout); + ret = dht_layout_set (this, inode, layout); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set the new layout", @@ -806,41 +843,46 @@ dht_migration_complete_check_task (void *data) local->cached_subvol = dst_node; ret = 0; - if (!local->fd) + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) goto out; - /* once we detect the migration complete, the fd-ctx is no more - required.. delete the ctx */ - ret = fd_ctx_del (local->fd, this, NULL); - if (!ret) + + if (list_empty (&inode->fd_list)) goto out; /* perform open as root:root. There is window between linkfile * creation(root:root) and setattr with the correct uid/gid */ SYNCTASK_SETID(0, 0); - /* if 'local->fd' (ie, fd based operation), send a 'open()' on - destination if not already done */ - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; - ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - GF_FREE (path); + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + if (open_failed) { + ret = -1; goto out; } - ret = 0; out: @@ -885,6 +927,9 @@ dht_rebalance_inprogress_task (void *data) struct iatt stbuf = {0,}; loc_t tmp_loc = {0,}; dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; this = THIS; frame = data; @@ -896,6 +941,8 @@ dht_rebalance_inprogress_task (void *data) if (!local->loc.inode && !local->fd) goto out; + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr * as root:root. If a fd is already open, access check wont be done*/ if (local->loc.inode) { @@ -947,35 +994,47 @@ dht_rebalance_inprogress_task (void *data) } ret = 0; + + if (list_empty (&inode->fd_list)) + goto done; + /* perform open as root:root. There is window between linkfile * creation(root:root) and setattr with the correct uid/gid */ SYNCTASK_SETID (0, 0); - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - GF_FREE (path); + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; goto out; } - SYNCTASK_SETID (frame->root->uid, frame->root->gid); - ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node); +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set fd-ctx target file at %s", + "%s: failed to set inode-ctx target file at %s", local->loc.path, dst_node->name); goto out; } diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index d9b311fe2..ece84151a 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -130,10 +130,11 @@ int dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) { - uint64_t tmp_subvol = 0; + xlator_t *subvol = 0; dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -157,19 +158,20 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; /* Check if the rebalance phase2 is true */ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (ret) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { /* Phase 2 of migration */ local->rebalance.target_op_fn = dht_attr2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. */ dht_attr2 (this, frame, 0); - } - if (!ret) return 0; + } } out: @@ -382,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { dht_local_t *local = NULL; int ret = 0; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; if (!local) { @@ -400,17 +404,18 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { /* File would be migrated to other node */ - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_readv2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. */ dht_readv2 (this, frame, 0); - } - if (!ret) return 0; + } } out: @@ -501,18 +506,27 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int ret = -1; dht_local_t *local = NULL; xlator_t *subvol = NULL; + call_frame_t *prev = NULL; local = frame->local; + prev = cookie; + if (!prev || !prev->this) + goto out; if (local->call_cnt != 1) goto out; if ((op_ret == -1) && (op_errno == ENOTCONN) && IA_ISDIR(local->loc.inode->ia_type)) { - subvol = dht_first_up_subvol (this); + subvol = dht_subvol_next_available (this, prev->this); if (!subvol) goto out; + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, &local->loc, local->rebalance.flags, NULL); return 0; @@ -607,8 +621,9 @@ int dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; @@ -618,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; /* If context is set, then send flush() it to the destination */ - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_flush2 (this, frame, 0); return 0; } @@ -635,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -704,6 +715,8 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; prev = cookie; @@ -725,8 +738,8 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, } local->op_errno = op_errno; - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_fsync2; /* Check if the rebalance phase1 is true */ @@ -741,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); } + if (!ret) + return 0; } else { dht_fsync2 (this, frame, 0); - } - if (!ret) return 0; + } out: DHT_STRIP_PHASE1_FLAGS (postbuf); @@ -761,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; - + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index b87d2f732..4b3f3a049 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -19,6 +19,9 @@ int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret); int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); int dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -27,6 +30,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { dht_local_t *local = NULL; int ret = -1; + xlator_t *subvol = NULL; if (op_ret == -1 && (op_errno != ENOENT)) { goto out; @@ -63,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_writev2 (this, frame, 0); return 0; } @@ -88,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -170,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -211,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret || !local->fd) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_truncate2 (this, frame, 0); return 0; } @@ -236,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = local->fd ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -348,6 +348,407 @@ err: return 0; } + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_fallocate2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_fallocate2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate, + local->fd, local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, NULL); + + return 0; +} + +int +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mode; + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_fallocate_cbk, + subvol, subvol->fops->fallocate, + fd, mode, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_discard2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_discard2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (discard, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_zerofill2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = fd_ctx_get (local->fd, this, NULL); + if (!ret) { + dht_zerofill2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + uint64_t tmp_subvol = 0; + int ret = -1; + + local = frame->local; + + if (local->fd) + ret = fd_ctx_get (local->fd, this, &tmp_subvol); + if (!ret) + subvol = (xlator_t *)(long)tmp_subvol; + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + /* handle cases of migration here for 'setattr()' calls */ int dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -399,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 07e8cbae4..38e9970a7 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -454,12 +454,19 @@ dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) { int64_t diff = 0; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t) layout->list[i].stop + - (int64_t) layout->list[j].stop; + goto out; + } if (layout->list[i].err || layout->list[j].err) diff = layout->list[i].err - layout->list[j].err; else diff = (int64_t) layout->list[i].start - (int64_t) layout->list[j].start; +out: return diff; } @@ -534,13 +541,13 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, case -1: case ENOENT: missing++; - break; + continue; case ENOTCONN: down++; - break; + continue; case ENOSPC: no_space++; - break; + continue; case 0: /* if err == 0 and start == stop, then it is a non misc++; * participating subvolume(spread-cnt). Then, do not @@ -552,6 +559,7 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, break; default: misc++; + continue; } is_virgin = 0; @@ -593,8 +601,7 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, int -dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, - uint32_t *missing_p) +dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) { int ret = 0; int i = 0; @@ -606,7 +613,6 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, ret = dht_layout_sort (layout); if (ret == -1) { - /* defensive coding; this can't happen currently */ gf_log (this->name, GF_LOG_WARNING, "sort failed?! how the ...."); goto out; @@ -616,26 +622,23 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, &holes, &overlaps, &missing, &down, &misc, NULL); if (ret == -1) { - /* defensive coding; this can't happen currently */ gf_log (this->name, GF_LOG_WARNING, "error while finding anomalies in %s -- not good news", loc->path); goto out; } - ret = holes + overlaps; - if (ret) { + if (holes || overlaps) { if (missing == layout->cnt) { gf_log (this->name, GF_LOG_DEBUG, "directory %s looked up first time", loc->path); } else { gf_log (this->name, GF_LOG_INFO, - "found anomalies in %s. holes=%d overlaps=%d" - " missing=%d down=%d misc=%d", - loc->path, holes, overlaps, missing, down, - misc); + "found anomalies in %s. holes=%d overlaps=%d", + loc->path, holes, overlaps); } + ret = -1; } for (i = 0; i < layout->cnt; i++) { @@ -650,14 +653,14 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout, (layout->list[i].xlator ? layout->list[i].xlator->name : "<>")); + if ((layout->list[i].err == ENOENT) && (ret >= 0)) { + ret++; + } } } out: - if (missing_p) { - *missing_p = missing; - } return ret; } diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index 39d72ae63..dbc9d0b3c 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -297,11 +297,11 @@ dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this) GF_VALIDATE_OR_GOTO ("dht", local, out); GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out); - if ((local->stbuf.ia_type == IA_INVAL) || - (is_equal (frame->root->uid, local->stbuf.ia_uid) && - is_equal (frame->root->gid, local->stbuf.ia_gid))) + if (local->stbuf.ia_type == IA_INVAL) return 0; + uuid_copy (local->loc.gfid, local->stbuf.ia_gfid); + copy = copy_frame (frame); if (!copy) diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index a9f913c2d..bcb19f23e 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -243,7 +243,7 @@ out: static inline int __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, - dict_t *dict, fd_t **dst_fd) + dict_t *dict, fd_t **dst_fd, dict_t *xattr) { xlator_t *this = NULL; int ret = -1; @@ -307,6 +307,12 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc goto out; } + ret = syncop_fsetxattr (to, fd, xattr, 0); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set xattr on %s (%s)", + loc->path, to->name, strerror (errno)); + ret = syncop_ftruncate (to, fd, stbuf->ia_size); if (ret < 0) gf_log (this->name, GF_LOG_ERROR, @@ -340,6 +346,9 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, int ret = -1; xlator_t *this = NULL; + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + this = THIS; ret = syncop_statfs (from, loc, &src_statfs); @@ -363,22 +372,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, if (flag != GF_DHT_MIGRATE_DATA) goto check_avail_space; - if (((dst_statfs.f_bavail * - dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < - (((src_statfs.f_bavail * src_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) { - gf_log (this->name, GF_LOG_WARNING, - "data movement attempted from node (%s) with" - " higher disk space to a node (%s) with " - "lesser disk space (%s)", from->name, - to->name, loc->path); - - /* this is not a 'failure', but we don't want to - consider this as 'success' too :-/ */ - ret = 1; - goto out; + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid errorneous move to destination where + the space could be scantily available. + */ + if (stbuf) { + dst_statfs_blocks = ((dst_statfs.f_bavail * + dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + src_statfs_blocks = ((src_statfs.f_bavail * + src_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + if ((dst_statfs_blocks - stbuf->ia_blocks) < + (src_statfs_blocks + stbuf->ia_blocks)) { + gf_log (this->name, GF_LOG_WARNING, + "data movement attempted from node (%s) with" + " higher disk space to a node (%s) with " + "lesser disk space (%s)", from->name, + to->name, loc->path); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + ret = 1; + goto out; + } } - check_avail_space: if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { @@ -620,6 +641,15 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, } done: + ret = syncop_setattr (to, loc, buf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + ret = syncop_unlink (from, loc); if (ret) gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)", @@ -699,9 +729,16 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr (from, loc, &xattr); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get xattr from %s (%s)", + loc->path, from->name, strerror (errno)); + /* create the destination, with required modes/xattr */ ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, - dict, &dst_fd); + dict, &dst_fd, xattr); if (ret) goto out; @@ -718,6 +755,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + ret = syncop_fstat (from, src_fd, &stbuf); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)", @@ -747,19 +785,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - /* TODO: move all xattr related operations to fd based operations */ - ret = syncop_listxattr (from, loc, &xattr); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get xattr from %s (%s)", - loc->path, from->name, strerror (errno)); - - ret = syncop_setxattr (to, loc, xattr, 0); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set xattr on %s (%s)", - loc->path, to->name, strerror (errno)); - /* TODO: Sync the locks */ ret = syncop_fsync (to, dst_fd, 0); @@ -823,6 +848,23 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate (from, src_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", + loc->path, from->name, strerror (errno)); + } + + /* remove the 'linkto' xattr from the destination */ + ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform removexattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + /* Do a stat and check the gfid before unlink */ ret = syncop_stat (from, loc, &empty_iatt); if (ret) { @@ -843,23 +885,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } } - /* Free up the data blocks on the source node, as the whole - file is migrated */ - ret = syncop_ftruncate (from, src_fd, 0); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform truncate on %s (%s)", - loc->path, from->name, strerror (errno)); - } - - /* remove the 'linkto' xattr from the destination */ - ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform removexattr on %s (%s)", - loc->path, to->name, strerror (errno)); - } - ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL); if (ret) { gf_log (this->name, GF_LOG_DEBUG, @@ -1101,6 +1126,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, struct timeval end = {0,}; double elapsed = {0,}; struct timeval start = {0,}; + int32_t err = 0; gf_log (this->name, GF_LOG_INFO, "migrate data called on %s", loc->path); @@ -1258,9 +1284,21 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, ret = syncop_setxattr (this, &entry_loc, migrate_data, 0); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "migrate-data" - " failed for %s", entry_loc.path); - defrag->total_failures +=1; + err = op_errno; + /* errno is overloaded. See + * rebalance_task_completion () */ + if (err != ENOSPC) { + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data skipped for %s" + " due to space constraints", + entry_loc.path); + defrag->skipped +=1; + } else{ + gf_log (this->name, GF_LOG_ERROR, + "migrate-data failed for %s", + entry_loc.path); + defrag->total_failures +=1; + } } if (ret == -1) { @@ -1659,6 +1697,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) uint64_t size = 0; uint64_t lookup = 0; uint64_t failures = 0; + uint64_t skipped = 0; char *status = ""; double elapsed = 0; struct timeval end = {0,}; @@ -1675,6 +1714,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) size = defrag->total_data; lookup = defrag->num_files_lookedup; failures = defrag->total_failures; + skipped = defrag->skipped; gettimeofday (&end, NULL); @@ -1698,6 +1738,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) gf_log (THIS->name, GF_LOG_WARNING, "failed to set lookedup file count"); + ret = dict_set_int32 (dict, "status", defrag->defrag_status); if (ret) gf_log (THIS->name, GF_LOG_WARNING, @@ -1710,6 +1751,14 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) } ret = dict_set_uint64 (dict, "failures", failures); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set failure count"); + + ret = dict_set_uint64 (dict, "skipped", skipped); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set skipped file count"); log: switch (defrag->defrag_status) { case GF_DEFRAG_STATUS_NOT_STARTED: @@ -1727,13 +1776,15 @@ log: case GF_DEFRAG_STATUS_FAILED: status = "failed"; break; + default: + break; } gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f " "secs", status, elapsed); gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %" - PRIu64", lookups: %"PRIu64", failures: %"PRIu64, files, size, - lookup, failures); + PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: " + "%"PRIu64, files, size, lookup, failures, skipped); out: diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index 08965976b..5d6f4f232 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -306,7 +306,19 @@ err: NULL, NULL); return 0; } - +#define DHT_MARK_FOP_INTERNAL(xattr) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ + " internal dict key for %s", local->loc.path); \ + } \ + }while (0) int dht_rename_done (call_frame_t *frame, xlator_t *this) { @@ -377,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; - + dict_t *xattr = NULL; local = frame->local; this = frame->this; @@ -401,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame) if (!call_cnt) goto nolinks; + DHT_MARK_FOP_INTERNAL (xattr); + if (dst_hashed != src_hashed && dst_hashed != src_cached) { gf_log (this->name, GF_LOG_TRACE, "unlinking linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (src_cached != dst_hashed) { @@ -416,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); + return 0; nolinks: @@ -482,6 +499,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *rename_subvol = NULL; call_frame_t *link_frame = NULL; dht_local_t *link_local = NULL; + dict_t *xattr = NULL; local = frame->local; prev = cookie; @@ -565,6 +583,8 @@ err: if (local->call_cnt == 0) goto unwind; + DHT_MARK_FOP_INTERNAL (xattr); + if (src_cached != dst_hashed && src_cached != dst_cached) { gf_log (this->name, GF_LOG_TRACE, "deleting old src datafile %s @ %s", @@ -572,7 +592,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (src_hashed != rename_subvol && src_hashed != src_cached) { @@ -582,7 +602,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_hashed, src_hashed->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (dst_cached @@ -594,8 +614,10 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, dst_cached, dst_cached->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); return 0; unwind: @@ -603,12 +625,16 @@ unwind: WIPE (&local->postoldparent); WIPE (&local->preparent); WIPE (&local->postparent); + if (xattr) + dict_unref (xattr); dht_rename_done (frame, this); return 0; cleanup: + if (xattr) + dict_unref (xattr); dht_rename_cleanup (frame); return 0; @@ -742,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; + dict_t *xattr = NULL; local = frame->local; @@ -752,6 +779,7 @@ dht_rename_create_links (call_frame_t *frame) dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + DHT_MARK_FOP_INTERNAL (xattr); if (src_cached == dst_cached) { if (dst_hashed == dst_cached) @@ -763,7 +791,7 @@ dht_rename_create_links (call_frame_t *frame) STACK_WIND (frame, dht_rename_unlink_links_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); return 0; } @@ -790,7 +818,7 @@ dht_rename_create_links (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_links_cbk, src_cached, src_cached->fops->link, - &local->loc, &local->loc2, NULL); + &local->loc, &local->loc2, xattr); } nolinks: @@ -798,6 +826,8 @@ nolinks: /* skip to next step */ dht_do_rename (frame); } + if (xattr) + dict_unref (xattr); return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index b220a0e25..3fe96b1c7 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -17,6 +17,7 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" +#include "glusterfs-acl.h" #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ layout->list[i].start = srt; \ @@ -564,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ count++; + if (!err) + layout->list[i].err = -1; } } @@ -776,7 +801,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, DHT_RESET_LAYOUT_RANGE (layout); for (i = start_subvol; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -789,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, for (i = 0; i < start_subvol; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 6b91c56a1..70aac7710 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -262,6 +262,28 @@ out: return ret; } + +int +dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + int ret = -1; + + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } + + ret = 0; +out: + + return ret; +} void dht_init_regex (xlator_t *this, dict_t *odict, char *name, regex_t *re, gf_boolean_t *re_valid) @@ -358,6 +380,10 @@ dht_reconfigure (xlator_t *this, dict_t *options) ret = dht_parse_decommissioned_bricks (this, conf, temp_str); if (ret == -1) goto out; + } else { + ret = dht_decommissioned_remove (this, conf); + if (ret == -1) + goto out; } dht_init_regex (this, options, "rsync-hash-regex", diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index 814f0a8eb..fc0ca2f77 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -70,6 +70,9 @@ struct xlator_fops fops = { .fxattrop = dht_fxattrop, .setattr = dht_setattr, .fsetattr = dht_fsetattr, + .fallocate = dht_fallocate, + .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 5e5c68058..e934acdf0 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -323,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (subvol != avail_subvol) { @@ -427,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (avail_subvol != subvol) { @@ -482,97 +484,141 @@ same_first_part (char *str1, char term1, char *str2, char term2) } } +typedef struct nufa_args { + xlator_t *this; + char *volname; + gf_boolean_t addr_match; +} nufa_args_t; + +static void +nufa_find_local_brick (xlator_t *xl, void *data) +{ + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. We pick the first brick + * that is local*/ + if (conf->private) + return; + + if (strcmp (xl->name, local_volname) == 0) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s", + local_volname); + return; + } + + if (!addr_match) + return; + + ret = dict_get_str (xl->options, "remote-host", &brick_host); + if ((ret == 0) && + (gf_is_same_address (local_volname, brick_host) || + gf_is_local_addr (brick_host))) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using the first local " + "subvol %s", xl->name); + return; + } + +} + +static void +nufa_to_dht (xlator_t *this) +{ + GF_ASSERT (this); + GF_ASSERT (this->fops); + + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; +} + +int +nufa_find_local_subvol (xlator_t *this, + void (*fn) (xlator_t *each, void* data), void *data) +{ + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first (this, fn, data); + if (!conf->private) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local " + "brick"); + return -1; + } + + candidate = conf->private; + trav = candidate->parents; + while (trav) { + + parent = trav->xlator; + if (strcmp (parent->type, "cluster/nufa") == 0) { + gf_log (this->name, GF_LOG_INFO, "Found local subvol, " + "%s", candidate->name); + ret = 0; + conf->private = candidate; + break; + } + + candidate = parent; + trav = parent->parents; + } + + return ret; +} + int nufa_init (xlator_t *this) { - dht_conf_t *conf = NULL; - xlator_list_t *trav = NULL; data_t *data = NULL; char *local_volname = NULL; int ret = -1; char my_hostname[256]; - xlator_t *local_subvol = NULL; - char *brick_host = NULL; - xlator_t *kid = NULL; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = {0, }; ret = dht_init(this); if (ret) { return ret; } - conf = this->private; - local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); - } + if ((data = dict_get (this->options, "local-volume-name"))) { + local_volname = data->data; - if (ret == 0) - local_volname = my_hostname; + } else { + addr_match = _gf_true; + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret == 0) + local_volname = my_hostname; - data = dict_get (this->options, "local-volume-name"); - if (data) { - local_volname = data->data; - } + else + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); - for (trav = this->children; trav; trav = trav->next) { - if (strcmp (trav->xlator->name, local_volname) == 0) - break; - if (local_subvol) { - continue; - } - kid = trav->xlator; - for (;;) { - if (dict_get_str(trav->xlator->options,"remote-host", - &brick_host) == 0) { - /* Found it. */ - break; - } - if (!kid->children) { - /* Nowhere further to look. */ - gf_log (this->name, GF_LOG_ERROR, - "could not get remote-host"); - goto err; - } - if (kid->children->next) { - /* Multiple choices, can't/shouldn't decide. */ - gf_log (this->name, GF_LOG_ERROR, - "NUFA found fan-out (type %s) volume", - kid->type); - goto err; - } - /* One-to-one xlators are OK, try the next one. */ - kid = kid->children->xlator; - } - if (same_first_part(my_hostname,'.',brick_host,'.')) { - local_subvol = trav->xlator; - } } - if (trav) { - gf_log (this->name, GF_LOG_INFO, - "Using specified subvol %s", local_volname); - conf->private = trav->xlator; - } - else if (local_subvol) { + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args); + if (ret) { gf_log (this->name, GF_LOG_INFO, - "Using first local subvol %s", local_subvol->name); - conf->private = local_subvol; + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht (this); } - else { - gf_log (this->name, GF_LOG_ERROR, - "Could not find specified or local subvol"); - goto err; - - } - return 0; - -err: - dht_fini(this); - return -1; } diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index 861012247..d3ea90ba8 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -437,7 +437,8 @@ switch_create (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (subvol != avail_subvol) { @@ -536,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (avail_subvol != subvol) { diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index dadd3fec5..69b510e23 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -3774,6 +3774,524 @@ err: int32_t +stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send fallocate request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single fallocate per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->fallocate, fd, mode, + dest_offset, fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int32_t +stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send discard request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single discard per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->discard, fd, dest_offset, + fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, + fctx->stripe_count); + + STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->zerofill, fd, + dest_offset, fill_size, xdata); + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t stripe_release (xlator_t *this, fd_t *fd) { return 0; @@ -3947,6 +4465,37 @@ stripe_setxattr_cbk (call_frame_t *frame, void *cookie, return 0; } +#ifdef HAVE_BD_XLATOR +int +stripe_is_bd (dict_t *this, char *key, data_t *value, void *data) +{ + gf_boolean_t *is_bd = data; + + if (data == NULL) + return 0; + + if (XATTR_IS_BD (key)) + *is_bd = _gf_true; + + return 0; +} + +inline gf_boolean_t +stripe_setxattr_is_bd (dict_t *dict) +{ + gf_boolean_t is_bd = _gf_false; + + if (dict == NULL) + goto out; + + dict_foreach (dict, stripe_is_bd, &is_bd); +out: + return is_bd; +} +#else +#define stripe_setxattr_is_bd(dict) _gf_false +#endif + int stripe_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, int flags, dict_t *xdata) @@ -3956,6 +4505,7 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this, stripe_private_t *priv = NULL; stripe_local_t *local = NULL; int i = 0; + gf_boolean_t is_bd = _gf_false; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -3978,11 +4528,15 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this, local->wind_count = priv->child_count; local->op_ret = local->op_errno = 0; + is_bd = stripe_setxattr_is_bd (dict); + /** * Set xattrs for directories on all subvolumes. Additionally - * this power is only given to a special client. + * this power is only given to a special client. Bd xlator + * also needs xattrs for regular files (ie LVs) */ - if ((frame->root->pid == GF_CLIENT_PID_GSYNCD) && IA_ISDIR (loc->inode->ia_type)) { + if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) && + IA_ISDIR (loc->inode->ia_type)) || is_bd) { for (i = 0; i < priv->child_count; i++, trav = trav->next) { STACK_WIND (frame, stripe_setxattr_cbk, trav->xlator, trav->xlator->fops->setxattr, @@ -4013,21 +4567,21 @@ stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie, int -stripe_is_lockinfo (dict_t *this, - char *key, - data_t *value, - void *data) +stripe_is_special_key (dict_t *this, + char *key, + data_t *value, + void *data) { - gf_boolean_t *is_lockinfo = NULL; + gf_boolean_t *is_special = NULL; if (data == NULL) { goto out; } - is_lockinfo = data; + is_special = data; - if (XATTR_IS_LOCKINFO (key)) - *is_lockinfo = _gf_true; + if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key)) + *is_special = _gf_true; out: return 0; @@ -4104,7 +4658,7 @@ stripe_fsetxattr_is_special (dict_t *dict) goto out; } - dict_foreach (dict, stripe_is_lockinfo, &is_spl); + dict_foreach (dict, stripe_is_special_key, &is_spl); out: return is_spl; @@ -4866,10 +5420,10 @@ stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie, xattr->pos = cky; xattr->xattr_value = gf_memdup (xattr_val, - xattr->xattr_value, xattr->xattr_len); - local->xattr_total_len += xattr->xattr_len + 1; + if (xattr->xattr_value != NULL) + local->xattr_total_len += xattr->xattr_len + 1; } } out: @@ -4969,7 +5523,8 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, name, local, stripe_getxattr_unwind, sub_volumes, priv->child_count, - MARKER_UUID_TYPE, priv->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + priv->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -5048,6 +5603,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, sub_volumes, local->marker.call_count, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, priv->vol_uuid)) { op_errno = EINVAL; goto err; @@ -5221,6 +5777,9 @@ struct xlator_fops fops = { .removexattr = stripe_removexattr, .fremovexattr = stripe_fremovexattr, .readdirp = stripe_readdirp, + .fallocate = stripe_fallocate, + .discard = stripe_discard, + .zerofill = stripe_zerofill, }; struct xlator_cbks cbks = { @@ -5246,10 +5805,10 @@ struct volume_options options[] = { }, { .key = {"coalesce"}, .type = GF_OPTION_TYPE_BOOL, - .default_value = "false", - .description = "Enable coalesce mode to flatten striped files as " - "stored on the server (i.e., eliminate holes caused " - "by the traditional format)." + .default_value = "true", + .description = "Enable/Disable coalesce mode to flatten striped " + "files as stored on the server (i.e., eliminate holes " + "caused by the traditional format)." }, { .key = {NULL} }, }; diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c index 63bb8fa90..7fb697ae4 100644 --- a/xlators/debug/io-stats/src/io-stats.c +++ b/xlators/debug/io-stats/src/io-stats.c @@ -1724,6 +1724,40 @@ io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int +io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + UPDATE_PROFILE_STATS(frame, FALLOCATE); + STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + + +int +io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + UPDATE_PROFILE_STATS(frame, DISCARD); + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int +io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + UPDATE_PROFILE_STATS(frame, ZEROFILL); + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { @@ -2392,6 +2426,45 @@ io_stats_fstat (call_frame_t *frame, xlator_t *this, int +io_stats_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + START_FOP_LATENCY(frame); + + STACK_WIND(frame, io_stats_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + + return 0; +} + + +int +io_stats_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + START_FOP_LATENCY(frame); + + STACK_WIND(frame, io_stats_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + + return 0; +} + +int +io_stats_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + START_FOP_LATENCY(frame); + + STACK_WIND(frame, io_stats_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + + return 0; +} + + +int io_stats_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) { @@ -2817,6 +2890,9 @@ struct xlator_fops fops = { .fxattrop = io_stats_fxattrop, .setattr = io_stats_setattr, .fsetattr = io_stats_fsetattr, + .fallocate = io_stats_fallocate, + .discard = io_stats_discard, + .zerofill = io_stats_zerofill, }; struct xlator_cbks cbks = { diff --git a/xlators/debug/trace/src/trace.c b/xlators/debug/trace/src/trace.c index a9c11babe..c9d839356 100644 --- a/xlators/debug/trace/src/trace.c +++ b/xlators/debug/trace/src/trace.c @@ -3134,7 +3134,7 @@ init (xlator_t *this) gf_log (this->name, GF_LOG_DEBUG, "logging to history %s", (conf->log_history == _gf_true)?"enabled":"disabled"); - history = eh_new (history_size, _gf_false); + history = eh_new (history_size, _gf_false, NULL); if (!history) { gf_log (this->name, GF_LOG_ERROR, "event history cannot be " "initialized"); diff --git a/xlators/encryption/Makefile.am b/xlators/encryption/Makefile.am index 2cbde680f..36efc6698 100644 --- a/xlators/encryption/Makefile.am +++ b/xlators/encryption/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = rot-13 +SUBDIRS = rot-13 crypt CLEANFILES = diff --git a/xlators/encryption/crypt/Makefile.am b/xlators/encryption/crypt/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/encryption/crypt/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/encryption/crypt/src/Makefile.am b/xlators/encryption/crypt/src/Makefile.am new file mode 100644 index 000000000..faadd117f --- /dev/null +++ b/xlators/encryption/crypt/src/Makefile.am @@ -0,0 +1,24 @@ +if ENABLE_CRYPT_XLATOR + +xlator_LTLIBRARIES = crypt.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/encryption + +crypt_la_LDFLAGS = -module -avoidversion -lssl -lcrypto + +crypt_la_SOURCES = keys.c data.c metadata.c atom.c crypt.c +crypt_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = crypt-common.h crypt-mem-types.h crypt.h metadata.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = + +else + +noinst_DIST = keys.c data.c metadata.c atom.c crypt.c +noinst_HEADERS = crypt-common.h crypt-mem-types.h crypt.h metadata.h + +endif
\ No newline at end of file diff --git a/xlators/encryption/crypt/src/atom.c b/xlators/encryption/crypt/src/atom.c new file mode 100644 index 000000000..1ec41495c --- /dev/null +++ b/xlators/encryption/crypt/src/atom.c @@ -0,0 +1,962 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "defaults.h" +#include "crypt-common.h" +#include "crypt.h" + +/* + * Glossary + * + * + * cblock (or cipher block). A logical unit in a file. + * cblock size is defined as the number of bits + * in an input (or output) block of the block + * cipher (*). Cipher block size is a property of + * cipher algorithm. E.g. cblock size is 64 bits + * for DES, 128 bits for AES, etc. + * + * atomic cipher A cipher algorithm, which requires some chunks of + * algorithm text to be padded at left and(or) right sides before + * cipher transaform. + * + * + * block (atom) Minimal chunk of file's data, which doesn't require + * padding. We'll consider logical units in a file of + * block size (atom size). + * + * cipher algorithm Atomic cipher algorithm, which requires the last + * with EOF issue incomplete cblock in a file to be padded with some + * data (usually zeros). + * + * + * operation, which reading/writing from offset, which is not aligned to + * forms a gap at to atom size + * the beginning + * + * + * operation, which reading/writing count bytes starting from offset off, + * forms a gap at so that off+count is not aligned to atom_size + * the end + * + * head block the first atom affected by an operation, which forms + * a gap at the beginning, or(and) at the end. + * Сomment. Head block has at least one gap (either at + * the beginning, or at the end) + * + * + * tail block the last atom different from head, affected by an + * operation, which forms a gap at the end. + * Сomment: Tail block has exactly one gap (at the end). + * + * + * partial block head or tail block + * + * + * full block block without gaps. + * + * + * (*) Recommendation for Block Cipher Modes of Operation + * Methods and Techniques + * NIST Special Publication 800-38A Edition 2001 + */ + +/* + * atom->offset_at() + */ +static off_t offset_at_head(struct avec_config *conf) +{ + return conf->aligned_offset; +} + +static off_t offset_at_hole_head(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_at_head(get_hole_conf(frame)); +} + +static off_t offset_at_data_head(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_at_head(get_data_conf(frame)); +} + + +static off_t offset_at_tail(struct avec_config *conf, + struct object_cipher_info *object) +{ + return conf->aligned_offset + + (conf->off_in_head ? get_atom_size(object) : 0) + + (conf->nr_full_blocks << get_atom_bits(object)); +} + +static off_t offset_at_hole_tail(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_at_tail(get_hole_conf(frame), object); +} + + +static off_t offset_at_data_tail(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_at_tail(get_data_conf(frame), object); +} + +static off_t offset_at_full(struct avec_config *conf, + struct object_cipher_info *object) +{ + return conf->aligned_offset + + (conf->off_in_head ? get_atom_size(object) : 0); +} + +static off_t offset_at_data_full(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_at_full(get_data_conf(frame), object); +} + +static off_t offset_at_hole_full(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_at_full(get_hole_conf(frame), object); +} + +/* + * atom->io_size_nopad() + */ + +static uint32_t io_size_nopad_head(struct avec_config *conf, + struct object_cipher_info *object) +{ + uint32_t gap_at_beg; + uint32_t gap_at_end; + + check_head_block(conf); + + gap_at_beg = conf->off_in_head; + + if (has_tail_block(conf) || has_full_blocks(conf) || conf->off_in_tail == 0 ) + gap_at_end = 0; + else + gap_at_end = get_atom_size(object) - conf->off_in_tail; + + return get_atom_size(object) - (gap_at_beg + gap_at_end); +} + +static uint32_t io_size_nopad_tail(struct avec_config *conf, + struct object_cipher_info *object) +{ + check_tail_block(conf); + return conf->off_in_tail; +} + +static uint32_t io_size_nopad_full(struct avec_config *conf, + struct object_cipher_info *object) +{ + check_full_block(conf); + return get_atom_size(object); +} + +static uint32_t io_size_nopad_data_head(call_frame_t *frame, + struct object_cipher_info *object) +{ + return io_size_nopad_head(get_data_conf(frame), object); +} + +static uint32_t io_size_nopad_hole_head(call_frame_t *frame, + struct object_cipher_info *object) +{ + return io_size_nopad_head(get_hole_conf(frame), object); +} + +static uint32_t io_size_nopad_data_tail(call_frame_t *frame, + struct object_cipher_info *object) +{ + return io_size_nopad_tail(get_data_conf(frame), object); +} + +static uint32_t io_size_nopad_hole_tail(call_frame_t *frame, + struct object_cipher_info *object) +{ + return io_size_nopad_tail(get_hole_conf(frame), object); +} + +static uint32_t io_size_nopad_data_full(call_frame_t *frame, + struct object_cipher_info *object) +{ + return io_size_nopad_full(get_data_conf(frame), object); +} + +static uint32_t io_size_nopad_hole_full(call_frame_t *frame, + struct object_cipher_info *object) +{ + return io_size_nopad_full(get_hole_conf(frame), object); +} + +static uint32_t offset_in_head(struct avec_config *conf) +{ + check_cursor_head(conf); + + return conf->off_in_head; +} + +static uint32_t offset_in_tail(call_frame_t *frame, + struct object_cipher_info *object) +{ + return 0; +} + +static uint32_t offset_in_full(struct avec_config *conf, + struct object_cipher_info *object) +{ + check_cursor_full(conf); + + if (has_head_block(conf)) + return (conf->cursor - 1) << get_atom_bits(object); + else + return conf->cursor << get_atom_bits(object); +} + +static uint32_t offset_in_data_head(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_in_head(get_data_conf(frame)); +} + +static uint32_t offset_in_hole_head(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_in_head(get_hole_conf(frame)); +} + +static uint32_t offset_in_data_full(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_in_full(get_data_conf(frame), object); +} + +static uint32_t offset_in_hole_full(call_frame_t *frame, + struct object_cipher_info *object) +{ + return offset_in_full(get_hole_conf(frame), object); +} + +/* + * atom->rmw() + */ +/* + * Pre-conditions: + * @vec contains plain text of the latest + * version. + * + * Uptodate gaps of the @partial block with + * this plain text, encrypt the whole block + * and write the result to disk. + */ +static int32_t rmw_partial_block(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vec, + int32_t count, + struct iatt *stbuf, + struct iobref *iobref, + struct rmw_atom *atom) +{ + size_t was_read = 0; + uint64_t file_size; + crypt_local_t *local = frame->local; + struct object_cipher_info *object = &local->info->cinfo; + + struct iovec *partial = atom->get_iovec(frame, 0); + struct avec_config *conf = atom->get_config(frame); + end_writeback_handler_t end_writeback_partial_block; +#if DEBUG_CRYPT + gf_boolean_t check_last_cblock = _gf_false; +#endif + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret < 0) + goto exit; + + file_size = local->cur_file_size; + was_read = op_ret; + + if (atom->locality == HEAD_ATOM && conf->off_in_head) { + /* + * head atom with a non-uptodate gap + * at the beginning + * + * fill the gap with plain text of the + * latest version. Convert a part of hole + * (if any) to zeros. + */ + int32_t i; + int32_t copied = 0; + int32_t to_gap; /* amount of data needed to uptodate + the gap at the beginning */ +#if 0 + int32_t hole = 0; /* The part of the hole which + * got in the head block */ +#endif /* 0 */ + to_gap = conf->off_in_head; + + if (was_read < to_gap) { + if (file_size > + offset_at_head(conf) + was_read) { + /* + * It is impossible to uptodate + * head block: too few bytes have + * been read from disk, so that + * partial write is impossible. + * + * It could happen because of many + * reasons: IO errors, (meta)data + * corruption in the local file system, + * etc. + */ + gf_log(this->name, GF_LOG_WARNING, + "Can not uptodate a gap at the beginning"); + local->op_ret = -1; + local->op_errno = EIO; + goto exit; + } +#if 0 + hole = to_gap - was_read; +#endif /* 0 */ + to_gap = was_read; + } + /* + * uptodate the gap at the beginning + */ + for (i = 0; i < count && copied < to_gap; i++) { + int32_t to_copy; + + to_copy = vec[i].iov_len; + if (to_copy > to_gap - copied) + to_copy = to_gap - copied; + + memcpy(partial->iov_base, vec[i].iov_base, to_copy); + copied += to_copy; + } +#if 0 + /* + * If possible, convert part of the + * hole, which got in the head block + */ + ret = TRY_LOCK(&local->hole_lock); + if (!ret) { + if (local->hole_handled) + /* + * already converted by + * crypt_writev_cbk() + */ + UNLOCK(&local->hole_lock); + else { + /* + * convert the part of the hole + * which got in the head block + * to zeros. + * + * Update the orig_offset to make + * sure writev_cbk() won't care + * about this part of the hole. + * + */ + memset(partial->iov_base + to_gap, 0, hole); + + conf->orig_offset -= hole; + conf->orig_size += hole; + UNLOCK(&local->hole_lock); + } + } + else /* + * conversion is being performed + * by crypt_writev_cbk() + */ + ; +#endif /* 0 */ + } + if (atom->locality == TAIL_ATOM || + (!has_tail_block(conf) && conf->off_in_tail)) { + /* + * tail atom, or head atom with a non-uptodate + * gap at the end. + * + * fill the gap at the end of the block + * with plain text of the latest version. + * Pad the result, (if needed) + */ + int32_t i; + int32_t to_gap; + int copied; + off_t off_in_tail; + int32_t to_copy; + + off_in_tail = conf->off_in_tail; + to_gap = conf->gap_in_tail; + + if (to_gap && was_read < off_in_tail + to_gap) { + /* + * It is impossible to uptodate + * the gap at the end: too few bytes + * have been read from disk, so that + * partial write is impossible. + * + * It could happen because of many + * reasons: IO errors, (meta)data + * corruption in the local file system, + * etc. + */ + gf_log(this->name, GF_LOG_WARNING, + "Can not uptodate a gap at the end"); + local->op_ret = -1; + local->op_errno = EIO; + goto exit; + } + /* + * uptodate the gap at the end + */ + copied = 0; + to_copy = to_gap; + for(i = count - 1; i >= 0 && to_copy > 0; i--) { + uint32_t from_vec, off_in_vec; + + off_in_vec = 0; + from_vec = vec[i].iov_len; + if (from_vec > to_copy) { + off_in_vec = from_vec - to_copy; + from_vec = to_copy; + } + memcpy(partial->iov_base + + off_in_tail + to_gap - copied - from_vec, + vec[i].iov_base + off_in_vec, + from_vec); + + gf_log(this->name, GF_LOG_DEBUG, + "uptodate %d bytes at tail. Offset at target(source): %d(%d)", + (int)from_vec, + (int)off_in_tail + to_gap - copied - from_vec, + (int)off_in_vec); + + copied += from_vec; + to_copy -= from_vec; + } + partial->iov_len = off_in_tail + to_gap; + + if (object_alg_should_pad(object)) { + int32_t resid = 0; + resid = partial->iov_len & (object_alg_blksize(object) - 1); + if (resid) { + /* + * append a new EOF padding + */ + local->eof_padding_size = + object_alg_blksize(object) - resid; + + gf_log(this->name, GF_LOG_DEBUG, + "set padding size %d", + local->eof_padding_size); + + memset(partial->iov_base + partial->iov_len, + 1, + local->eof_padding_size); + partial->iov_len += local->eof_padding_size; +#if DEBUG_CRYPT + gf_log(this->name, GF_LOG_DEBUG, + "pad cblock with %d zeros:", + local->eof_padding_size); + dump_cblock(this, + (unsigned char *)partial->iov_base + + partial->iov_len - object_alg_blksize(object)); + check_last_cblock = _gf_true; +#endif + } + } + } + /* + * encrypt the whole block + */ + encrypt_aligned_iov(object, + partial, + 1, + atom->offset_at(frame, object)); +#if DEBUG_CRYPT + if (check_last_cblock == _gf_true) { + gf_log(this->name, GF_LOG_DEBUG, + "encrypt last cblock with offset %llu", + (unsigned long long)atom->offset_at(frame, object)); + dump_cblock(this, (unsigned char *)partial->iov_base + + partial->iov_len - object_alg_blksize(object)); + } +#endif + set_local_io_params_writev(frame, object, atom, + atom->offset_at(frame, object), + iovec_get_size(partial, 1)); + /* + * write the whole block to disk + */ + end_writeback_partial_block = dispatch_end_writeback(local->fop); + conf->cursor ++; + STACK_WIND(frame, + end_writeback_partial_block, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + local->fd, + partial, + 1, + atom->offset_at(frame, object), + local->flags, + local->iobref_data, + local->xdata); + + gf_log("crypt", GF_LOG_DEBUG, + "submit partial block: %d bytes from %d offset", + (int)iovec_get_size(partial, 1), + (int)atom->offset_at(frame, object)); + exit: + return 0; +} + +/* + * Perform a (read-)modify-write sequence. + * This should be performed only after approval + * of upper server-side manager, i.e. the caller + * needs to make sure this is his turn to rmw. + */ +void submit_partial(call_frame_t *frame, + xlator_t *this, + fd_t *fd, + atom_locality_type ltype) +{ + int32_t ret; + dict_t *dict; + struct rmw_atom *atom; + crypt_local_t *local = frame->local; + struct object_cipher_info *object = &local->info->cinfo; + + atom = atom_by_types(local->active_setup, ltype); + /* + * To perform the "read" component of the read-modify-write + * sequence the crypt translator does stack_wind to itself. + * + * Pass current file size to crypt_readv() + */ + dict = dict_new(); + if (!dict) { + /* + * FIXME: Handle the error + */ + gf_log("crypt", GF_LOG_WARNING, "Can not alloc dict"); + return; + } + ret = dict_set(dict, + FSIZE_XATTR_PREFIX, + data_from_uint64(local->cur_file_size)); + if (ret) { + /* + * FIXME: Handle the error + */ + dict_unref(dict); + gf_log("crypt", GF_LOG_WARNING, "Can not set dict"); + goto exit; + } + STACK_WIND(frame, + atom->rmw, + this, + this->fops->readv, /* crypt_readv */ + fd, + atom->count_to_uptodate(frame, object), /* count */ + atom->offset_at(frame, object), /* offset to read from */ + 0, + dict); + exit: + dict_unref(dict); +} + +/* + * submit blocks of FULL_ATOM type + */ +void submit_full(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + struct object_cipher_info *object = &local->info->cinfo; + struct rmw_atom *atom = atom_by_types(local->active_setup, FULL_ATOM); + uint32_t count; /* total number of full blocks to submit */ + uint32_t granularity; /* number of blocks to submit in one iteration */ + + uint64_t off_in_file; /* start offset in the file, bytes */ + uint32_t off_in_atom; /* start offset in the atom, blocks */ + uint32_t blocks_written = 0; /* blocks written for this submit */ + + struct avec_config *conf = atom->get_config(frame); + end_writeback_handler_t end_writeback_full_block; + /* + * Write full blocks by groups of granularity size. + */ + end_writeback_full_block = dispatch_end_writeback(local->fop); + + if (is_ordered_mode(frame)) { + uint32_t skip = has_head_block(conf) ? 1 : 0; + count = 1; + granularity = 1; + /* + * calculate start offset using cursor value; + * here we should take into accout head block, + * which corresponds to cursor value 0. + */ + off_in_file = atom->offset_at(frame, object) + + ((conf->cursor - skip) << get_atom_bits(object)); + off_in_atom = conf->cursor - skip; + } + else { + /* + * in parallel mode + */ + count = conf->nr_full_blocks; + granularity = MAX_IOVEC; + off_in_file = atom->offset_at(frame, object); + off_in_atom = 0; + } + while (count) { + uint32_t blocks_to_write = count; + + if (blocks_to_write > granularity) + blocks_to_write = granularity; + if (conf->type == HOLE_ATOM) + /* + * reset iovec before encryption + */ + memset(atom->get_iovec(frame, 0)->iov_base, + 0, + get_atom_size(object)); + /* + * encrypt the group + */ + encrypt_aligned_iov(object, + atom->get_iovec(frame, + off_in_atom + + blocks_written), + blocks_to_write, + off_in_file + (blocks_written << + get_atom_bits(object))); + + set_local_io_params_writev(frame, object, atom, + off_in_file + (blocks_written << get_atom_bits(object)), + blocks_to_write << get_atom_bits(object)); + + conf->cursor += blocks_to_write; + + STACK_WIND(frame, + end_writeback_full_block, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + local->fd, + atom->get_iovec(frame, off_in_atom + blocks_written), + blocks_to_write, + off_in_file + (blocks_written << get_atom_bits(object)), + local->flags, + local->iobref_data ? local->iobref_data : local->iobref, + local->xdata); + + gf_log("crypt", GF_LOG_DEBUG, "submit %d full blocks from %d offset", + blocks_to_write, + (int)(off_in_file + (blocks_written << get_atom_bits(object)))); + + count -= blocks_to_write; + blocks_written += blocks_to_write; + } + return; +} + +static int32_t rmw_data_head(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vec, + int32_t count, + struct iatt *stbuf, + struct iobref *iobref, + dict_t *xdata) +{ + return rmw_partial_block(frame, + cookie, + this, + op_ret, + op_errno, + vec, + count, + stbuf, + iobref, + atom_by_types(DATA_ATOM, HEAD_ATOM)); +} + +static int32_t rmw_data_tail(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vec, + int32_t count, + struct iatt *stbuf, + struct iobref *iobref, + dict_t *xdata) +{ + return rmw_partial_block(frame, + cookie, + this, + op_ret, + op_errno, + vec, + count, + stbuf, + iobref, + atom_by_types(DATA_ATOM, TAIL_ATOM)); +} + +static int32_t rmw_hole_head(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vec, + int32_t count, + struct iatt *stbuf, + struct iobref *iobref, + dict_t *xdata) +{ + return rmw_partial_block(frame, + cookie, + this, + op_ret, + op_errno, + vec, + count, + stbuf, + iobref, + atom_by_types(HOLE_ATOM, HEAD_ATOM)); +} + +static int32_t rmw_hole_tail(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vec, + int32_t count, + struct iatt *stbuf, + struct iobref *iobref, + dict_t *xdata) +{ + return rmw_partial_block(frame, + cookie, + this, + op_ret, + op_errno, + vec, + count, + stbuf, + iobref, + atom_by_types(HOLE_ATOM, TAIL_ATOM)); +} + +/* + * atom->count_to_uptodate() + */ +static uint32_t count_to_uptodate_head(struct avec_config *conf, + struct object_cipher_info *object) +{ + if (conf->acount == 1 && conf->off_in_tail) + return get_atom_size(object); + else + /* there is no need to read the whole head block */ + return conf->off_in_head; +} + +static uint32_t count_to_uptodate_tail(struct avec_config *conf, + struct object_cipher_info *object) +{ + /* we need to read the whole tail block */ + return get_atom_size(object); +} + +static uint32_t count_to_uptodate_data_head(call_frame_t *frame, + struct object_cipher_info *object) +{ + return count_to_uptodate_head(get_data_conf(frame), object); +} + +static uint32_t count_to_uptodate_data_tail(call_frame_t *frame, + struct object_cipher_info *object) +{ + return count_to_uptodate_tail(get_data_conf(frame), object); +} + +static uint32_t count_to_uptodate_hole_head(call_frame_t *frame, + struct object_cipher_info *object) +{ + return count_to_uptodate_head(get_hole_conf(frame), object); +} + +static uint32_t count_to_uptodate_hole_tail(call_frame_t *frame, + struct object_cipher_info *object) +{ + return count_to_uptodate_tail(get_hole_conf(frame), object); +} + +/* atom->get_config() */ + +static struct avec_config *get_config_data(call_frame_t *frame) +{ + return &((crypt_local_t *)frame->local)->data_conf; +} + +static struct avec_config *get_config_hole(call_frame_t *frame) +{ + return &((crypt_local_t *)frame->local)->hole_conf; +} + +/* + * atom->get_iovec() + */ +static struct iovec *get_iovec_hole_head(call_frame_t *frame, + uint32_t count) +{ + struct avec_config *conf = get_hole_conf(frame); + + return conf->avec; +} + +static struct iovec *get_iovec_hole_full(call_frame_t *frame, + uint32_t count) +{ + struct avec_config *conf = get_hole_conf(frame); + + return conf->avec + (conf->off_in_head ? 1 : 0); +} + +static inline struct iovec *get_iovec_hole_tail(call_frame_t *frame, + uint32_t count) +{ + struct avec_config *conf = get_hole_conf(frame); + + return conf->avec + (conf->blocks_in_pool - 1); +} + +static inline struct iovec *get_iovec_data_head(call_frame_t *frame, + uint32_t count) +{ + struct avec_config *conf = get_data_conf(frame); + + return conf->avec; +} + +static inline struct iovec *get_iovec_data_full(call_frame_t *frame, + uint32_t count) +{ + struct avec_config *conf = get_data_conf(frame); + + return conf->avec + (conf->off_in_head ? 1 : 0) + count; +} + +static inline struct iovec *get_iovec_data_tail(call_frame_t *frame, + uint32_t count) +{ + struct avec_config *conf = get_data_conf(frame); + + return conf->avec + + (conf->off_in_head ? 1 : 0) + + conf->nr_full_blocks; +} + +static struct rmw_atom atoms[LAST_DATA_TYPE][LAST_LOCALITY_TYPE] = { + [DATA_ATOM][HEAD_ATOM] = + { .locality = HEAD_ATOM, + .rmw = rmw_data_head, + .offset_at = offset_at_data_head, + .offset_in = offset_in_data_head, + .get_iovec = get_iovec_data_head, + .io_size_nopad = io_size_nopad_data_head, + .count_to_uptodate = count_to_uptodate_data_head, + .get_config = get_config_data + }, + [DATA_ATOM][TAIL_ATOM] = + { .locality = TAIL_ATOM, + .rmw = rmw_data_tail, + .offset_at = offset_at_data_tail, + .offset_in = offset_in_tail, + .get_iovec = get_iovec_data_tail, + .io_size_nopad = io_size_nopad_data_tail, + .count_to_uptodate = count_to_uptodate_data_tail, + .get_config = get_config_data + }, + [DATA_ATOM][FULL_ATOM] = + { .locality = FULL_ATOM, + .offset_at = offset_at_data_full, + .offset_in = offset_in_data_full, + .get_iovec = get_iovec_data_full, + .io_size_nopad = io_size_nopad_data_full, + .get_config = get_config_data + }, + [HOLE_ATOM][HEAD_ATOM] = + { .locality = HEAD_ATOM, + .rmw = rmw_hole_head, + .offset_at = offset_at_hole_head, + .offset_in = offset_in_hole_head, + .get_iovec = get_iovec_hole_head, + .io_size_nopad = io_size_nopad_hole_head, + .count_to_uptodate = count_to_uptodate_hole_head, + .get_config = get_config_hole + }, + [HOLE_ATOM][TAIL_ATOM] = + { .locality = TAIL_ATOM, + .rmw = rmw_hole_tail, + .offset_at = offset_at_hole_tail, + .offset_in = offset_in_tail, + .get_iovec = get_iovec_hole_tail, + .io_size_nopad = io_size_nopad_hole_tail, + .count_to_uptodate = count_to_uptodate_hole_tail, + .get_config = get_config_hole + }, + [HOLE_ATOM][FULL_ATOM] = + { .locality = FULL_ATOM, + .offset_at = offset_at_hole_full, + .offset_in = offset_in_hole_full, + .get_iovec = get_iovec_hole_full, + .io_size_nopad = io_size_nopad_hole_full, + .get_config = get_config_hole + } +}; + +struct rmw_atom *atom_by_types(atom_data_type data, + atom_locality_type locality) +{ + return &atoms[data][locality]; +} + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ diff --git a/xlators/encryption/crypt/src/crypt-common.h b/xlators/encryption/crypt/src/crypt-common.h new file mode 100644 index 000000000..7c212ad5d --- /dev/null +++ b/xlators/encryption/crypt/src/crypt-common.h @@ -0,0 +1,141 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CRYPT_COMMON_H__ +#define __CRYPT_COMMON_H__ + +#define INVAL_SUBVERSION_NUMBER (0xff) +#define CRYPT_INVAL_OP (GF_FOP_NULL) + +#define CRYPTO_FORMAT_PREFIX "trusted.glusterfs.crypt.att.cfmt" +#define FSIZE_XATTR_PREFIX "trusted.glusterfs.crypt.att.size" +#define SUBREQ_PREFIX "trusted.glusterfs.crypt.msg.sreq" +#define FSIZE_MSG_PREFIX "trusted.glusterfs.crypt.msg.size" +#define DE_MSG_PREFIX "trusted.glusterfs.crypt.msg.dent" +#define REQUEST_ID_PREFIX "trusted.glusterfs.crypt.msg.rqid" +#define MSGFLAGS_PREFIX "trusted.glusterfs.crypt.msg.xfgs" + + +/* messages for crypt_open() */ +#define MSGFLAGS_REQUEST_MTD_RLOCK 1 /* take read lock and don't unlock */ +#define MSGFLAGS_REQUEST_MTD_WLOCK 2 /* take write lock and don't unlock */ + +#define AES_BLOCK_BITS (4) /* AES_BLOCK_SIZE == 1 << AES_BLOCK_BITS */ + +#define noop do {; } while (0) +#define cassert(cond) ({ switch (-1) { case (cond): case 0: break; } }) +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) + +/* + * Format of file's metadata + */ +struct crypt_format { + uint8_t loader_id; /* version of metadata loader */ + uint8_t versioned[0]; /* file's metadata of specific version */ +} __attribute__((packed)); + +typedef enum { + AES_CIPHER_ALG, + LAST_CIPHER_ALG +} cipher_alg_t; + +typedef enum { + XTS_CIPHER_MODE, + LAST_CIPHER_MODE +} cipher_mode_t; + +typedef enum { + MTD_LOADER_V1, + LAST_MTD_LOADER +} mtd_loader_id; + +static inline void msgflags_set_mtd_rlock(uint32_t *flags) +{ + *flags |= MSGFLAGS_REQUEST_MTD_RLOCK; +} + +static inline void msgflags_set_mtd_wlock(uint32_t *flags) +{ + *flags |= MSGFLAGS_REQUEST_MTD_WLOCK; +} + +static inline gf_boolean_t msgflags_check_mtd_rlock(uint32_t *flags) +{ + return *flags & MSGFLAGS_REQUEST_MTD_RLOCK; +} + +static inline gf_boolean_t msgflags_check_mtd_wlock(uint32_t *flags) +{ + return *flags & MSGFLAGS_REQUEST_MTD_WLOCK; +} + +static inline gf_boolean_t msgflags_check_mtd_lock(uint32_t *flags) +{ + return msgflags_check_mtd_rlock(flags) || + msgflags_check_mtd_wlock(flags); +} + +/* + * returns number of logical blocks occupied + * (maybe partially) by @count bytes + * at offset @start. + */ +static inline off_t logical_blocks_occupied(uint64_t start, off_t count, + int blkbits) +{ + return ((start + count - 1) >> blkbits) - (start >> blkbits) + 1; +} + +/* + * are two bytes (represented by offsets @off1 + * and @off2 respectively) in the same logical + * block. + */ +static inline int in_same_lblock(uint64_t off1, uint64_t off2, + int blkbits) +{ + return off1 >> blkbits == off2 >> blkbits; +} + +static inline void dump_cblock(xlator_t *this, unsigned char *buf) +{ + gf_log(this->name, GF_LOG_DEBUG, + "dump cblock: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x", + (buf)[0], + (buf)[1], + (buf)[2], + (buf)[3], + (buf)[4], + (buf)[5], + (buf)[6], + (buf)[7], + (buf)[8], + (buf)[9], + (buf)[10], + (buf)[11], + (buf)[12], + (buf)[13], + (buf)[14], + (buf)[15]); +} + +#endif /* __CRYPT_COMMON_H__ */ + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ diff --git a/xlators/encryption/crypt/src/crypt-mem-types.h b/xlators/encryption/crypt/src/crypt-mem-types.h new file mode 100644 index 000000000..799727573 --- /dev/null +++ b/xlators/encryption/crypt/src/crypt-mem-types.h @@ -0,0 +1,43 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __CRYPT_MEM_TYPES_H__ +#define __CRYPT_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_crypt_mem_types_ { + gf_crypt_mt_priv = gf_common_mt_end + 1, + gf_crypt_mt_inode, + gf_crypt_mt_data, + gf_crypt_mt_mtd, + gf_crypt_mt_loc, + gf_crypt_mt_iatt, + gf_crypt_mt_key, + gf_crypt_mt_iovec, + gf_crypt_mt_char, +}; + +#endif /* __CRYPT_MEM_TYPES_H__ */ + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ + + + diff --git a/xlators/encryption/crypt/src/crypt.c b/xlators/encryption/crypt/src/crypt.c new file mode 100644 index 000000000..db2e6d83c --- /dev/null +++ b/xlators/encryption/crypt/src/crypt.c @@ -0,0 +1,4498 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#include <ctype.h> +#include <sys/uio.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" +#include "defaults.h" + +#include "crypt-common.h" +#include "crypt.h" + +static void init_inode_info_head(struct crypt_inode_info *info, fd_t *fd); +static int32_t init_inode_info_tail(struct crypt_inode_info *info, + struct master_cipher_info *master); +static int32_t prepare_for_submit_hole(call_frame_t *frame, xlator_t *this, + uint64_t from, off_t size); +static int32_t load_file_size(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata); +static void do_ordered_submit(call_frame_t *frame, xlator_t *this, + atom_data_type dtype); +static void do_parallel_submit(call_frame_t *frame, xlator_t *this, + atom_data_type dtype); +static void put_one_call_open(call_frame_t *frame); +static void put_one_call_readv(call_frame_t *frame, xlator_t *this); +static void put_one_call_writev(call_frame_t *frame, xlator_t *this); +static void put_one_call_ftruncate(call_frame_t *frame, xlator_t *this); +static void free_avec(struct iovec *avec, char **pool, int blocks_in_pool); +static void free_avec_data(crypt_local_t *local); +static void free_avec_hole(crypt_local_t *local); + +static crypt_local_t *crypt_alloc_local(call_frame_t *frame, xlator_t *this, + glusterfs_fop_t fop) +{ + crypt_local_t *local = NULL; + + local = mem_get0(this->local_pool); + if (!local) { + gf_log(this->name, GF_LOG_ERROR, "out of memory"); + return NULL; + } + local->fop = fop; + LOCK_INIT(&local->hole_lock); + LOCK_INIT(&local->call_lock); + LOCK_INIT(&local->rw_count_lock); + + frame->local = local; + return local; +} + +struct crypt_inode_info *get_crypt_inode_info(inode_t *inode, xlator_t *this) +{ + int ret; + uint64_t value = 0; + struct crypt_inode_info *info; + + ret = inode_ctx_get(inode, this, &value); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "Can not get inode info"); + return NULL; + } + info = (struct crypt_inode_info *)(long)value; + if (info == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "Can not obtain inode info"); + return NULL; + } + return info; +} + +static struct crypt_inode_info *local_get_inode_info(crypt_local_t *local, + xlator_t *this) +{ + if (local->info) + return local->info; + local->info = get_crypt_inode_info(local->fd->inode, this); + return local->info; +} + +static struct crypt_inode_info *alloc_inode_info(crypt_local_t *local, + loc_t *loc) +{ + struct crypt_inode_info *info; + + info = GF_CALLOC(1, sizeof(*info), gf_crypt_mt_inode); + if (!info) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_log ("crypt", GF_LOG_WARNING, + "Can not allocate inode info"); + return NULL; + } + memset(info, 0, sizeof(*info)); +#if DEBUG_CRYPT + info->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc); + if (!info->loc) { + gf_log("crypt", GF_LOG_WARNING, "Can not allocate loc"); + GF_FREE(info); + return NULL; + } + if (loc_copy(info->loc, loc)){ + GF_FREE(info->loc); + GF_FREE(info); + return NULL; + } +#endif /* DEBUG_CRYPT */ + + local->info = info; + return info; +} + +static void free_inode_info(struct crypt_inode_info *info) +{ +#if DEBUG_CRYPT + loc_wipe(info->loc); + GF_FREE(info->loc); +#endif + memset(info, 0, sizeof(*info)); + GF_FREE(info); +} + +int crypt_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_addr = 0; + if (!inode_ctx_del (inode, this, &ctx_addr)) + free_inode_info((struct crypt_inode_info *)(long)ctx_addr); + return 0; +} + +#if DEBUG_CRYPT +static void check_read(call_frame_t *frame, xlator_t *this, int32_t read, + struct iovec *vec, int32_t count, struct iatt *stbuf) +{ + crypt_local_t *local = frame->local; + struct object_cipher_info *object = get_object_cinfo(local->info); + struct avec_config *conf = &local->data_conf; + uint32_t resid = stbuf->ia_size & (object_alg_blksize(object) - 1); + + if (read <= 0) + return; + if (read != iovec_get_size(vec, count)) + gf_log ("crypt", GF_LOG_DEBUG, + "op_ret differs from amount of read bytes"); + + if (object_alg_should_pad(object) && (read & (object_alg_blksize(object) - 1))) + gf_log ("crypt", GF_LOG_DEBUG, + "bad amount of read bytes (!= 0 mod(cblock size))"); + + if (conf->aligned_offset + read > + stbuf->ia_size + (resid ? object_alg_blksize(object) - resid : 0)) + gf_log ("crypt", GF_LOG_DEBUG, + "bad amount of read bytes (too large))"); + +} + +#define PT_BYTES_TO_DUMP (32) +static void dump_plain_text(crypt_local_t *local, struct iovec *avec) +{ + int32_t to_dump; + char str[PT_BYTES_TO_DUMP + 1]; + + if (!avec) + return; + to_dump = avec->iov_len; + if (to_dump > PT_BYTES_TO_DUMP) + to_dump = PT_BYTES_TO_DUMP; + memcpy(str, avec->iov_base, to_dump); + memset(str + to_dump, '0', 1); + gf_log("crypt", GF_LOG_DEBUG, "Read file: %s", str); +} + +static int32_t data_conf_invariant(struct avec_config *conf) +{ + return conf->acount == + !!has_head_block(conf) + + !!has_tail_block(conf)+ + conf->nr_full_blocks; +} + +static int32_t hole_conf_invariant(struct avec_config *conf) +{ + return conf->blocks_in_pool == + !!has_head_block(conf) + + !!has_tail_block(conf)+ + !!has_full_blocks(conf); +} + +static void crypt_check_conf(struct avec_config *conf) +{ + int32_t ret = 0; + const char *msg; + + switch (conf->type) { + case DATA_ATOM: + msg = "data"; + ret = data_conf_invariant(conf); + break; + case HOLE_ATOM: + msg = "hole"; + ret = hole_conf_invariant(conf); + break; + default: + msg = "unknown"; + } + if (!ret) + gf_log("crypt", GF_LOG_DEBUG, "bad %s conf", msg); +} + +static void check_buf(call_frame_t *frame, xlator_t *this, struct iatt *buf) +{ + crypt_local_t *local = frame->local; + struct object_cipher_info *object = &local->info->cinfo; + uint64_t local_file_size; + + switch(local->fop) { + case GF_FOP_FTRUNCATE: + return; + case GF_FOP_WRITE: + local_file_size = local->new_file_size; + break; + case GF_FOP_READ: + if (parent_is_crypt_xlator(frame, this)) + return; + local_file_size = local->cur_file_size; + break; + default: + gf_log("crypt", GF_LOG_DEBUG, "bad file operation"); + return; + } + if (buf->ia_size != round_up(local_file_size, + object_alg_blksize(object))) + gf_log("crypt", GF_LOG_DEBUG, + "bad ia_size in buf (%llu), should be %llu", + (unsigned long long)buf->ia_size, + (unsigned long long)round_up(local_file_size, + object_alg_blksize(object))); +} + +#else +#define check_read(frame, this, op_ret, vec, count, stbuf) noop +#define dump_plain_text(local, avec) noop +#define crypt_check_conf(conf) noop +#define check_buf(frame, this, buf) noop +#endif /* DEBUG_CRYPT */ + +/* + * Pre-conditions: + * @vec represents a ciphertext of expanded size and + * aligned offset. + * + * Compound a temporal vector @avec with block-aligned + * components, decrypt and fix it up to represent a chunk + * of data corresponding to the original size and offset. + * Pass the result to the next translator. + */ +int32_t crypt_readv_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vec, + int32_t count, + struct iatt *stbuf, + struct iobref *iobref, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + struct avec_config *conf = &local->data_conf; + struct object_cipher_info *object = &local->info->cinfo; + + struct iovec *avec; + uint32_t i; + uint32_t to_vec; + uint32_t to_user; + + check_buf(frame, this, stbuf); + check_read(frame, this, op_ret, vec, count, stbuf); + + local->op_ret = op_ret; + local->op_errno = op_errno; + local->iobref = iobref_ref(iobref); + + local->buf = *stbuf; + local->buf.ia_size = local->cur_file_size; + + if (op_ret <= 0 || count == 0 || vec[0].iov_len == 0) + goto put_one_call; + + if (conf->orig_offset >= local->cur_file_size) { + local->op_ret = 0; + goto put_one_call; + } + /* + * correct config params with real file size + * and actual amount of bytes read + */ + set_config_offsets(frame, this, + conf->orig_offset, op_ret, DATA_ATOM, 0); + + if (conf->orig_offset + conf->orig_size > local->cur_file_size) + conf->orig_size = local->cur_file_size - conf->orig_offset; + /* + * calculate amount of data to be returned + * to user. + */ + to_user = op_ret; + if (conf->aligned_offset + to_user <= conf->orig_offset) { + gf_log(this->name, GF_LOG_WARNING, "Incomplete read"); + local->op_ret = -1; + local->op_errno = EIO; + goto put_one_call; + } + to_user -= (conf->aligned_offset - conf->orig_offset); + + if (to_user > conf->orig_size) + to_user = conf->orig_size; + local->rw_count = to_user; + + op_errno = set_config_avec_data(this, local, + conf, object, vec, count); + if (op_errno) { + local->op_ret = -1; + local->op_errno = op_errno; + goto put_one_call; + } + avec = conf->avec; +#if DEBUG_CRYPT + if (conf->off_in_tail != 0 && + conf->off_in_tail < object_alg_blksize(object) && + object_alg_should_pad(object)) + gf_log(this->name, GF_LOG_DEBUG, "Bad offset in tail %d", + conf->off_in_tail); + if (iovec_get_size(vec, count) != 0 && + in_same_lblock(conf->orig_offset + iovec_get_size(vec, count) - 1, + local->cur_file_size - 1, + object_alg_blkbits(object))) { + gf_log(this->name, GF_LOG_DEBUG, "Compound last cblock"); + dump_cblock(this, + (unsigned char *)(avec[conf->acount - 1].iov_base) + + avec[conf->acount - 1].iov_len - object_alg_blksize(object)); + dump_cblock(this, + (unsigned char *)(vec[count - 1].iov_base) + + vec[count - 1].iov_len - object_alg_blksize(object)); + } +#endif + decrypt_aligned_iov(object, avec, + conf->acount, conf->aligned_offset); + /* + * pass proper plain data to user + */ + avec[0].iov_base += (conf->aligned_offset - conf->orig_offset); + avec[0].iov_len -= (conf->aligned_offset - conf->orig_offset); + + to_vec = to_user; + for (i = 0; i < conf->acount; i++) { + if (avec[i].iov_len > to_vec) + avec[i].iov_len = to_vec; + to_vec -= avec[i].iov_len; + } + put_one_call: + put_one_call_readv(frame, this); + return 0; +} + +static int32_t do_readv(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict, + dict_t *xdata) +{ + data_t *data; + crypt_local_t *local = frame->local; + + if (op_ret < 0) + goto error; + /* + * extract regular file size + */ + data = dict_get(dict, FSIZE_XATTR_PREFIX); + if (!data) { + gf_log("crypt", GF_LOG_WARNING, "Regular file size not found"); + op_errno = EIO; + goto error; + } + local->cur_file_size = data_to_uint64(data); + + get_one_call(frame); + STACK_WIND(frame, + crypt_readv_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readv, + local->fd, + /* + * FIXME: read amount can be reduced + */ + local->data_conf.expanded_size, + local->data_conf.aligned_offset, + local->flags, + local->xdata); + return 0; + error: + local->op_ret = -1; + local->op_errno = op_errno; + + get_one_call(frame); + put_one_call_readv(frame, this); + return 0; +} + +static int32_t crypt_readv_finodelk_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + if (op_ret < 0) + goto error; + /* + * An access has been granted, + * retrieve file size + */ + STACK_WIND(frame, + do_readv, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + local->fd, + FSIZE_XATTR_PREFIX, + NULL); + return 0; + error: + fd_unref(local->fd); + if (local->xdata) + dict_unref(local->xdata); + STACK_UNWIND_STRICT(readv, + frame, + -1, + op_errno, + NULL, + 0, + NULL, + NULL, + NULL); + return 0; +} + +static int32_t readv_trivial_completion(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *buf, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "stat failed (%d)", op_errno); + goto error; + } + local->buf = *buf; + STACK_WIND(frame, + load_file_size, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + local->loc, + FSIZE_XATTR_PREFIX, + NULL); + return 0; + error: + STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, + NULL, 0, NULL, NULL, NULL); + return 0; +} + +int32_t crypt_readv(call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + uint32_t flags, dict_t *xdata) +{ + int32_t ret; + crypt_local_t *local; + struct crypt_inode_info *info; + struct gf_flock lock = {0, }; + +#if DEBUG_CRYPT + gf_log("crypt", GF_LOG_DEBUG, "reading %d bytes from offset %llu", + (int)size, (long long)offset); + if (parent_is_crypt_xlator(frame, this)) + gf_log("crypt", GF_LOG_DEBUG, "parent is crypt"); +#endif + local = crypt_alloc_local(frame, this, GF_FOP_READ); + if (!local) { + ret = ENOMEM; + goto error; + } + if (size == 0) + goto trivial; + + local->fd = fd_ref(fd); + local->flags = flags; + + info = local_get_inode_info(local, this); + if (info == NULL) { + ret = EINVAL; + fd_unref(fd); + goto error; + } + if (!object_alg_atomic(&info->cinfo)) { + ret = EINVAL; + fd_unref(fd); + goto error; + } + set_config_offsets(frame, this, offset, size, + DATA_ATOM, 0); + if (parent_is_crypt_xlator(frame, this)) { + data_t *data; + /* + * We are called by crypt_writev (or cypt_ftruncate) + * to perform the "read" component of the read-modify-write + * (or read-prune-write) sequence for some atom; + * + * don't ask for access: + * it has already been acquired + * + * Retrieve current file size + */ + if (!xdata) { + gf_log("crypt", GF_LOG_WARNING, + "Regular file size hasn't been passed"); + ret = EIO; + goto error; + } + data = dict_get(xdata, FSIZE_XATTR_PREFIX); + if (!data) { + gf_log("crypt", GF_LOG_WARNING, + "Regular file size not found"); + ret = EIO; + goto error; + } + local->old_file_size = + local->cur_file_size = data_to_uint64(data); + + get_one_call(frame); + STACK_WIND(frame, + crypt_readv_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + local->fd, + /* + * FIXME: read amount can be reduced + */ + local->data_conf.expanded_size, + local->data_conf.aligned_offset, + flags, + NULL); + return 0; + } + if (xdata) + local->xdata = dict_ref(xdata); + + lock.l_len = 0; + lock.l_start = 0; + lock.l_type = F_RDLCK; + lock.l_whence = SEEK_SET; + + STACK_WIND(frame, + crypt_readv_finodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + fd, + F_SETLKW, + &lock, + NULL); + return 0; + trivial: + STACK_WIND(frame, + readv_trivial_completion, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd, + NULL); + return 0; + error: + STACK_UNWIND_STRICT(readv, + frame, + -1, + ret, + NULL, + 0, + NULL, + NULL, + NULL); + return 0; +} + +void set_local_io_params_writev(call_frame_t *frame, + struct object_cipher_info *object, + struct rmw_atom *atom, + off_t io_offset, + uint32_t io_size) +{ + crypt_local_t *local = frame->local; + + local->io_offset = io_offset; + local->io_size = io_size; + + local->io_offset_nopad = + atom->offset_at(frame, object) + atom->offset_in(frame, object); + + gf_log("crypt", GF_LOG_DEBUG, + "set nopad offset to %llu", + (unsigned long long)local->io_offset_nopad); + + local->io_size_nopad = atom->io_size_nopad(frame, object); + + gf_log("crypt", GF_LOG_DEBUG, + "set nopad size to %llu", + (unsigned long long)local->io_size_nopad); + + local->update_disk_file_size = 0; + /* + * NOTE: eof_padding_size is 0 for all full atoms; + * For head and tail atoms it will be set up at rmw_partial block() + */ + local->new_file_size = local->cur_file_size; + + if (local->io_offset_nopad + local->io_size_nopad > local->cur_file_size) { + + local->new_file_size = local->io_offset_nopad + local->io_size_nopad; + + gf_log("crypt", GF_LOG_DEBUG, + "set new file size to %llu", + (unsigned long long)local->new_file_size); + + local->update_disk_file_size = 1; + } +} + +void set_local_io_params_ftruncate(call_frame_t *frame, + struct object_cipher_info *object) +{ + uint32_t resid; + crypt_local_t *local = frame->local; + struct avec_config *conf = &local->data_conf; + + resid = conf->orig_offset & (object_alg_blksize(object) - 1); + if (resid) { + local->eof_padding_size = + object_alg_blksize(object) - resid; + local->new_file_size = conf->aligned_offset; + local->update_disk_file_size = 0; + /* + * file size will be updated + * in the ->writev() stack, + * when submitting file tail + */ + } + else { + local->eof_padding_size = 0; + local->new_file_size = conf->orig_offset; + local->update_disk_file_size = 1; + /* + * file size will be updated + * in this ->ftruncate stack + */ + } +} + +static inline void submit_head(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + submit_partial(frame, this, local->fd, HEAD_ATOM); +} + +static inline void submit_tail(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + submit_partial(frame, this, local->fd, TAIL_ATOM); +} + +static void submit_hole(call_frame_t *frame, xlator_t *this) +{ + /* + * hole conversion always means + * appended write and goes in ordered fashion + */ + do_ordered_submit(frame, this, HOLE_ATOM); +} + +static void submit_data(call_frame_t *frame, xlator_t *this) +{ + if (is_ordered_mode(frame)) { + do_ordered_submit(frame, this, DATA_ATOM); + return; + } + gf_log("crypt", GF_LOG_WARNING, "Bad submit mode"); + get_nr_calls(frame, nr_calls_data(frame)); + do_parallel_submit(frame, this, DATA_ATOM); + return; +} + +/* + * heplers called by writev_cbk, fruncate_cbk in ordered mode + */ + +static inline int32_t should_submit_hole(crypt_local_t *local) +{ + struct avec_config *conf = &local->hole_conf; + + return conf->avec != NULL; +} + +static inline int32_t should_resume_submit_hole(crypt_local_t *local) +{ + struct avec_config *conf = &local->hole_conf; + + if (local->fop == GF_FOP_WRITE && has_tail_block(conf)) + /* + * Don't submit a part of hole, which + * fits into a data block: + * this part of hole will be converted + * as a gap filled by zeros in data head + * block. + */ + return conf->cursor < conf->acount - 1; + else + return conf->cursor < conf->acount; +} + +static inline int32_t should_resume_submit_data(call_frame_t *frame) +{ + crypt_local_t *local = frame->local; + struct avec_config *conf = &local->data_conf; + + if (is_ordered_mode(frame)) + return conf->cursor < conf->acount; + /* + * parallel writes + */ + return 0; +} + +static inline int32_t should_submit_data_after_hole(crypt_local_t *local) +{ + return local->data_conf.avec != NULL; +} + +static void update_local_file_params(call_frame_t *frame, + xlator_t *this, + struct iatt *prebuf, + struct iatt *postbuf) +{ + crypt_local_t *local = frame->local; + + check_buf(frame, this, postbuf); + + local->prebuf = *prebuf; + local->postbuf = *postbuf; + + local->prebuf.ia_size = local->cur_file_size; + local->postbuf.ia_size = local->new_file_size; + + local->cur_file_size = local->new_file_size; +} + +static int32_t end_writeback_writev(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret <= 0) { + gf_log(this->name, GF_LOG_WARNING, + "writev iteration failed"); + goto put_one_call; + } + /* + * op_ret includes paddings (atom's head, atom's tail and EOF) + */ + if (op_ret < local->io_size) { + gf_log(this->name, GF_LOG_WARNING, + "Incomplete writev iteration"); + goto put_one_call; + } + op_ret -= local->eof_padding_size; + local->op_ret = op_ret; + + update_local_file_params(frame, this, prebuf, postbuf); + + if (data_write_in_progress(local)) { + + LOCK(&local->rw_count_lock); + local->rw_count += op_ret; + UNLOCK(&local->rw_count_lock); + + if (should_resume_submit_data(frame)) + submit_data(frame, this); + } + else { + /* + * hole conversion is going on; + * don't take into account written zeros + */ + if (should_resume_submit_hole(local)) + submit_hole(frame, this); + + else if (should_submit_data_after_hole(local)) + submit_data(frame, this); + } + put_one_call: + put_one_call_writev(frame, this); + return 0; +} + +#define crypt_writev_cbk end_writeback_writev + +#define HOLE_WRITE_CHUNK_BITS 12 +#define HOLE_WRITE_CHUNK_SIZE (1 << HOLE_WRITE_CHUNK_BITS) + +/* + * Convert hole of size @size at offset @off to + * zeros and prepare respective iovecs for submit. + * The hole lock should be held. + * + * Pre-conditions: + * @local->file_size is set and valid. + */ +int32_t prepare_for_submit_hole(call_frame_t *frame, xlator_t *this, + uint64_t off, off_t size) +{ + int32_t ret; + crypt_local_t *local = frame->local; + struct object_cipher_info *object = &local->info->cinfo; + + set_config_offsets(frame, this, off, size, HOLE_ATOM, 1); + + ret = set_config_avec_hole(this, local, + &local->hole_conf, object, local->fop); + crypt_check_conf(&local->hole_conf); + + return ret; +} + +/* + * prepare for submit @count bytes at offset @from + */ +int32_t prepare_for_submit_data(call_frame_t *frame, xlator_t *this, + off_t from, int32_t size, struct iovec *vec, + int32_t vec_count, int32_t setup_gap) +{ + uint32_t ret; + crypt_local_t *local = frame->local; + struct object_cipher_info *object = &local->info->cinfo; + + set_config_offsets(frame, this, from, size, + DATA_ATOM, setup_gap); + + ret = set_config_avec_data(this, local, + &local->data_conf, object, vec, vec_count); + crypt_check_conf(&local->data_conf); + + return ret; +} + +static void free_avec(struct iovec *avec, + char **pool, int blocks_in_pool) +{ + if (!avec) + return; + GF_FREE(pool); + GF_FREE(avec); +} + +static void free_avec_data(crypt_local_t *local) +{ + return free_avec(local->data_conf.avec, + local->data_conf.pool, + local->data_conf.blocks_in_pool); +} + +static void free_avec_hole(crypt_local_t *local) +{ + return free_avec(local->hole_conf.avec, + local->hole_conf.pool, + local->hole_conf.blocks_in_pool); +} + + +static void do_parallel_submit(call_frame_t *frame, xlator_t *this, + atom_data_type dtype) +{ + crypt_local_t *local = frame->local; + struct avec_config *conf; + + local->active_setup = dtype; + conf = conf_by_type(frame, dtype); + + if (has_head_block(conf)) + submit_head(frame, this); + + if (has_full_blocks(conf)) + submit_full(frame, this); + + if (has_tail_block(conf)) + submit_tail(frame, this); + return; +} + +static void do_ordered_submit(call_frame_t *frame, xlator_t *this, + atom_data_type dtype) +{ + crypt_local_t *local = frame->local; + struct avec_config *conf; + + local->active_setup = dtype; + conf = conf_by_type(frame, dtype); + + if (should_submit_head_block(conf)) { + get_one_call_nolock(frame); + submit_head(frame, this); + } + else if (should_submit_full_block(conf)) { + get_one_call_nolock(frame); + submit_full(frame, this); + } + else if (should_submit_tail_block(conf)) { + get_one_call_nolock(frame); + submit_tail(frame, this); + } + else + gf_log("crypt", GF_LOG_DEBUG, + "nothing has been submitted in ordered mode"); + return; +} + +static int32_t do_writev(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict, + dict_t *xdata) +{ + data_t *data; + crypt_local_t *local = frame->local; + struct object_cipher_info *object = &local->info->cinfo; + /* + * extract regular file size + */ + data = dict_get(dict, FSIZE_XATTR_PREFIX); + if (!data) { + gf_log("crypt", GF_LOG_WARNING, "Regular file size not found"); + op_ret = -1; + op_errno = EIO; + goto error; + } + local->old_file_size = local->cur_file_size = data_to_uint64(data); + + set_gap_at_end(frame, object, &local->data_conf, DATA_ATOM); + + if (local->cur_file_size < local->data_conf.orig_offset) { + /* + * Set up hole config + */ + op_errno = prepare_for_submit_hole(frame, + this, + local->cur_file_size, + local->data_conf.orig_offset - local->cur_file_size); + if (op_errno) { + local->op_ret = -1; + local->op_errno = op_errno; + goto error; + } + } + if (should_submit_hole(local)) + submit_hole(frame, this); + else + submit_data(frame, this); + return 0; + error: + get_one_call_nolock(frame); + put_one_call_writev(frame, this); + return 0; +} + +static int32_t crypt_writev_finodelk_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret < 0) + goto error; + /* + * An access has been granted, + * retrieve file size first + */ + STACK_WIND(frame, + do_writev, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + local->fd, + FSIZE_XATTR_PREFIX, + NULL); + return 0; + error: + get_one_call_nolock(frame); + put_one_call_writev(frame, this); + return 0; +} + +static int32_t writev_trivial_completion(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *buf, + dict_t *dict) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + local->prebuf = *buf; + local->postbuf = *buf; + + local->prebuf.ia_size = local->cur_file_size; + local->postbuf.ia_size = local->cur_file_size; + + get_one_call(frame); + put_one_call_writev(frame, this); + return 0; +} + +int crypt_writev(call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vec, + int32_t count, + off_t offset, + uint32_t flags, + struct iobref *iobref, + dict_t *xdata) +{ + int32_t ret; + crypt_local_t *local; + struct crypt_inode_info *info; + struct gf_flock lock = {0, }; +#if DEBUG_CRYPT + gf_log ("crypt", GF_LOG_DEBUG, "writing %d bytes from offset %llu", + (int)iovec_get_size(vec, count), (long long)offset); +#endif + local = crypt_alloc_local(frame, this, GF_FOP_WRITE); + if (!local) { + ret = ENOMEM; + goto error; + } + local->fd = fd_ref(fd); + + if (iobref) + local->iobref = iobref_ref(iobref); + /* + * to update real file size on the server + */ + local->xattr = dict_new(); + if (!local->xattr) { + ret = ENOMEM; + goto error; + } + local->flags = flags; + + info = local_get_inode_info(local, this); + if (info == NULL) { + ret = EINVAL; + goto error; + } + if (!object_alg_atomic(&info->cinfo)) { + ret = EINVAL; + goto error; + } + if (iovec_get_size(vec, count) == 0) + goto trivial; + + ret = prepare_for_submit_data(frame, this, offset, + iovec_get_size(vec, count), + vec, count, 0 /* don't setup gup + in tail: we don't + know file size yet */); + if (ret) + goto error; + + if (parent_is_crypt_xlator(frame, this)) { + data_t *data; + /* + * we are called by shinking crypt_ftruncate(), + * which doesn't perform hole conversion; + * + * don't ask for access: + * it has already been acquired + */ + + /* + * extract file size + */ + if (!xdata) { + gf_log("crypt", GF_LOG_WARNING, + "Regular file size hasn't been passed"); + ret = EIO; + goto error; + } + data = dict_get(xdata, FSIZE_XATTR_PREFIX); + if (!data) { + gf_log("crypt", GF_LOG_WARNING, + "Regular file size not found"); + ret = EIO; + goto error; + } + local->old_file_size = + local->cur_file_size = data_to_uint64(data); + + submit_data(frame, this); + return 0; + } + if (xdata) + local->xdata = dict_ref(xdata); + /* + * lock the file and retrieve its size + */ + lock.l_len = 0; + lock.l_start = 0; + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + + STACK_WIND(frame, + crypt_writev_finodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + fd, + F_SETLKW, + &lock, + NULL); + return 0; + trivial: + STACK_WIND(frame, + writev_trivial_completion, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd, + NULL); + return 0; + error: + if (local && local->fd) + fd_unref(fd); + if (local && local->iobref) + iobref_unref(iobref); + if (local && local->xdata) + dict_unref(xdata); + if (local && local->xattr) + dict_unref(local->xattr); + if (local && local->info) + free_inode_info(local->info); + + STACK_UNWIND_STRICT(writev, frame, -1, ret, NULL, NULL, NULL); + return 0; +} + +int32_t prepare_for_prune(call_frame_t *frame, xlator_t *this, uint64_t offset) +{ + set_config_offsets(frame, this, + offset, + 0, /* count */ + DATA_ATOM, + 0 /* since we prune, there is no + gap in tail to uptodate */); + return 0; +} + +/* + * Finish the read-prune-modify sequence + * + * Can be invoked as + * 1) ->ftruncate_cbk() for cblock-aligned, or trivial prune + * 2) ->writev_cbk() for non-cblock-aligned prune + */ + +static int32_t prune_complete(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + update_local_file_params(frame, this, prebuf, postbuf); + + put_one_call_ftruncate(frame, this); + return 0; +} + +/* + * This is called as ->ftruncate_cbk() + * + * Perform the "write" component of the + * read-prune-write sequence. + * + * submuit the rest of the file + */ +static int32_t prune_submit_file_tail(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + struct avec_config *conf = &local->data_conf; + dict_t *dict; + + if (op_ret < 0) + goto put_one_call; + + if (local->xdata) { + dict_unref(local->xdata); + local->xdata = NULL; + } + if (xdata) + local->xdata = dict_ref(xdata); + + dict = dict_new(); + if (!dict) { + op_errno = ENOMEM; + goto error; + } + + update_local_file_params(frame, this, prebuf, postbuf); + local->new_file_size = conf->orig_offset; + + /* + * The rest of the file is a partial block and, hence, + * should be written via RMW sequence, so the crypt xlator + * does STACK_WIND to itself. + * + * Pass current file size to crypt_writev() + */ + op_errno = dict_set(dict, + FSIZE_XATTR_PREFIX, + data_from_uint64(local->cur_file_size)); + if (op_errno) { + gf_log("crypt", GF_LOG_WARNING, + "can not set key to update file size"); + dict_unref(dict); + goto error; + } + gf_log("crypt", GF_LOG_DEBUG, + "passing current file size (%llu) to crypt_writev", + (unsigned long long)local->cur_file_size); + /* + * Padding will be filled with + * zeros by rmw_partial_block() + */ + STACK_WIND(frame, + prune_complete, + this, + this->fops->writev, /* crypt_writev */ + local->fd, + &local->vec, + 1, + conf->aligned_offset, /* offset to write from */ + 0, + local->iobref, + dict); + + dict_unref(dict); + return 0; + error: + local->op_ret = -1; + local->op_errno = op_errno; + put_one_call: + put_one_call_ftruncate(frame, this); + return 0; +} + +/* + * This is called as a callback of ->writev() invoked in behalf + * of ftruncate(): it can be + * 1) ordered writes issued by hole conversion in the case of + * expanded truncate, or + * 2) an rmw partial data block issued by non-cblock-aligned + * prune. + */ +int32_t end_writeback_ftruncate(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + /* + * if nothing has been written, + * then it must be an error + */ + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret < 0) + goto put_one_call; + + update_local_file_params(frame, this, prebuf, postbuf); + + if (data_write_in_progress(local)) + /* case (2) */ + goto put_one_call; + /* case (1) */ + if (should_resume_submit_hole(local)) + submit_hole(frame, this); + /* + * case of hole, when we should't resume + */ + put_one_call: + put_one_call_ftruncate(frame, this); + return 0; +} + +/* + * Perform prune and write components of the + * read-prune-write sequence. + * + * Called as ->readv_cbk() + * + * Pre-conditions: + * @vec contains the latest atom of the file + * (plain text) + */ +static int32_t prune_write(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vec, + int32_t count, + struct iatt *stbuf, + struct iobref *iobref, + dict_t *xdata) +{ + int32_t i; + size_t to_copy; + size_t copied = 0; + crypt_local_t *local = frame->local; + struct avec_config *conf = &local->data_conf; + + local->op_ret = op_ret; + local->op_errno = op_errno; + if (op_ret == -1) + goto put_one_call; + + /* + * At first, uptodate head block + */ + if (iovec_get_size(vec, count) < conf->off_in_head) { + gf_log(this->name, GF_LOG_WARNING, + "Failed to uptodate head block for prune"); + local->op_ret = -1; + local->op_errno = EIO; + goto put_one_call; + } + local->vec.iov_len = conf->off_in_head; + local->vec.iov_base = GF_CALLOC(1, local->vec.iov_len, + gf_crypt_mt_data); + + if (local->vec.iov_base == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } + for (i = 0; i < count; i++) { + to_copy = vec[i].iov_len; + if (to_copy > local->vec.iov_len - copied) + to_copy = local->vec.iov_len - copied; + + memcpy((char *)local->vec.iov_base + copied, + vec[i].iov_base, + to_copy); + copied += to_copy; + if (copied == local->vec.iov_len) + break; + } + /* + * perform prune with aligned offset + * (i.e. at this step we prune a bit + * more then it is needed + */ + STACK_WIND(frame, + prune_submit_file_tail, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + local->fd, + conf->aligned_offset, + local->xdata); + return 0; + put_one_call: + put_one_call_ftruncate(frame, this); + return 0; +} + +/* + * Perform a read-prune-write sequence + */ +int32_t read_prune_write(call_frame_t *frame, xlator_t *this) +{ + int32_t ret = 0; + dict_t *dict = NULL; + crypt_local_t *local = frame->local; + struct avec_config *conf = &local->data_conf; + struct object_cipher_info *object = &local->info->cinfo; + + set_local_io_params_ftruncate(frame, object); + get_one_call_nolock(frame); + + if ((conf->orig_offset & (object_alg_blksize(object) - 1)) == 0) { + /* + * cblock-aligned prune: + * we don't need read and write components, + * just cut file body + */ + gf_log("crypt", GF_LOG_DEBUG, + "prune without RMW (at offset %llu", + (unsigned long long)conf->orig_offset); + + STACK_WIND(frame, + prune_complete, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + local->fd, + conf->orig_offset, + local->xdata); + return 0; + } + gf_log("crypt", GF_LOG_DEBUG, + "prune with RMW (at offset %llu", + (unsigned long long)conf->orig_offset); + /* + * We are about to perform the "read" component of the + * read-prune-write sequence. It means that we need to + * read encrypted data from disk and decrypt it. + * So, the crypt translator does STACK_WIND to itself. + * + * Pass current file size to crypt_readv() + + */ + dict = dict_new(); + if (!dict) { + gf_log("crypt", GF_LOG_WARNING, "Can not alloc dict"); + ret = ENOMEM; + goto exit; + } + ret = dict_set(dict, + FSIZE_XATTR_PREFIX, + data_from_uint64(local->cur_file_size)); + if (ret) { + gf_log("crypt", GF_LOG_WARNING, "Can not set dict"); + goto exit; + } + STACK_WIND(frame, + prune_write, + this, + this->fops->readv, /* crypt_readv */ + local->fd, + get_atom_size(object), /* bytes to read */ + conf->aligned_offset, /* offset to read from */ + 0, + dict); + exit: + if (dict) + dict_unref(dict); + return ret; +} + +/* + * File prune is more complicated than expand. + * First we need to read the latest atom to not lose info + * needed for proper update. Also we need to make sure that + * every component of read-prune-write sequence leaves data + * consistent + * + * Non-cblock aligned prune is performed as read-prune-write + * sequence: + * + * 1) read the latest atom; + * 2) perform cblock-aligned prune + * 3) issue a write request for the end-of-file + */ +int32_t prune_file(call_frame_t *frame, xlator_t *this, uint64_t offset) +{ + int32_t ret; + + ret = prepare_for_prune(frame, this, offset); + if (ret) + return ret; + return read_prune_write(frame, this); +} + +int32_t expand_file(call_frame_t *frame, xlator_t *this, + uint64_t offset) +{ + int32_t ret; + crypt_local_t *local = frame->local; + + ret = prepare_for_submit_hole(frame, this, + local->old_file_size, + offset - local->old_file_size); + if (ret) + return ret; + submit_hole(frame, this); + return 0; +} + +static int32_t ftruncate_trivial_completion(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *buf, + dict_t *dict) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + local->prebuf = *buf; + local->postbuf = *buf; + + local->prebuf.ia_size = local->cur_file_size; + local->postbuf.ia_size = local->cur_file_size; + + get_one_call(frame); + put_one_call_ftruncate(frame, this); + return 0; +} + +static int32_t do_ftruncate(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict, + dict_t *xdata) +{ + data_t *data; + crypt_local_t *local = frame->local; + + if (op_ret) + goto error; + /* + * extract regular file size + */ + data = dict_get(dict, FSIZE_XATTR_PREFIX); + if (!data) { + gf_log("crypt", GF_LOG_WARNING, "Regular file size not found"); + op_errno = EIO; + goto error; + } + local->old_file_size = local->cur_file_size = data_to_uint64(data); + + if (local->data_conf.orig_offset == local->cur_file_size) { +#if DEBUG_CRYPT + gf_log("crypt", GF_LOG_DEBUG, + "trivial ftruncate (current file size %llu)", + (unsigned long long)local->cur_file_size); +#endif + goto trivial; + } + else if (local->data_conf.orig_offset < local->cur_file_size) { +#if DEBUG_CRYPT + gf_log("crypt", GF_LOG_DEBUG, "prune from %llu to %llu", + (unsigned long long)local->cur_file_size, + (unsigned long long)local->data_conf.orig_offset); +#endif + op_errno = prune_file(frame, + this, + local->data_conf.orig_offset); + } + else { +#if DEBUG_CRYPT + gf_log("crypt", GF_LOG_DEBUG, "expand from %llu to %llu", + (unsigned long long)local->cur_file_size, + (unsigned long long)local->data_conf.orig_offset); +#endif + op_errno = expand_file(frame, + this, + local->data_conf.orig_offset); + } + if (op_errno) + goto error; + return 0; + trivial: + STACK_WIND(frame, + ftruncate_trivial_completion, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + local->fd, + NULL); + return 0; + error: + /* + * finish with ftruncate + */ + local->op_ret = -1; + local->op_errno = op_errno; + + get_one_call_nolock(frame); + put_one_call_ftruncate(frame, this); + return 0; +} + +static int32_t crypt_ftruncate_finodelk_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret < 0) + goto error; + /* + * An access has been granted, + * retrieve file size first + */ + STACK_WIND(frame, + do_ftruncate, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + local->fd, + FSIZE_XATTR_PREFIX, + NULL); + return 0; + error: + get_one_call_nolock(frame); + put_one_call_ftruncate(frame, this); + return 0; +} + +/* + * ftruncate is performed in 2 steps: + * . recieve file size; + * . expand or prune file. + */ +static int32_t crypt_ftruncate(call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset, + dict_t *xdata) +{ + int32_t ret; + crypt_local_t *local; + struct crypt_inode_info *info; + struct gf_flock lock = {0, }; + + local = crypt_alloc_local(frame, this, GF_FOP_FTRUNCATE); + if (!local) { + ret = ENOMEM; + goto error; + } + local->xattr = dict_new(); + if (!local->xattr) { + ret = ENOMEM; + goto error; + } + local->fd = fd_ref(fd); + info = local_get_inode_info(local, this); + if (info == NULL) { + ret = EINVAL; + goto error; + } + if (!object_alg_atomic(&info->cinfo)) { + ret = EINVAL; + goto error; + } + local->data_conf.orig_offset = offset; + if (xdata) + local->xdata = dict_ref(xdata); + + lock.l_len = 0; + lock.l_start = 0; + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + + STACK_WIND(frame, + crypt_ftruncate_finodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + fd, + F_SETLKW, + &lock, + NULL); + return 0; + error: + if (local && local->fd) + fd_unref(fd); + if (local && local->xdata) + dict_unref(xdata); + if (local && local->xattr) + dict_unref(local->xattr); + if (local && local->info) + free_inode_info(local->info); + + STACK_UNWIND_STRICT(ftruncate, frame, -1, ret, NULL, NULL, NULL); + return 0; +} + +/* ->flush_cbk() */ +int32_t truncate_end(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + STACK_UNWIND_STRICT(truncate, + frame, + op_ret, + op_errno, + &local->prebuf, + &local->postbuf, + local->xdata); + return 0; +} + +/* ftruncate_cbk() */ +int32_t truncate_flush(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + fd_t *fd = local->fd; + local->prebuf = *prebuf; + local->postbuf = *postbuf; + + STACK_WIND(frame, + truncate_end, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd, + NULL); + fd_unref(fd); + return 0; +} + +/* + * is called as ->open_cbk() + */ +static int32_t truncate_begin(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + if (op_ret < 0) { + fd_unref(fd); + STACK_UNWIND_STRICT(truncate, + frame, + op_ret, + op_errno, NULL, NULL, NULL); + return 0; + } + /* + * crypt_truncate() is implemented via crypt_ftruncate(), + * so the crypt xlator does STACK_WIND to itself here + */ + STACK_WIND(frame, + truncate_flush, + this, + this->fops->ftruncate, /* crypt_ftruncate */ + fd, + local->offset, + NULL); + return 0; +} + +/* + * crypt_truncate() is implemented via crypt_ftruncate() as a + * sequence crypt_open() - crypt_ftruncate() - truncate_flush() + */ +int32_t crypt_truncate(call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset, + dict_t *xdata) +{ + fd_t *fd; + crypt_local_t *local; + +#if DEBUG_CRYPT + gf_log(this->name, GF_LOG_DEBUG, + "truncate file %s at offset %llu", + loc->path, (unsigned long long)offset); +#endif + local = crypt_alloc_local(frame, this, GF_FOP_TRUNCATE); + if (!local) + goto error; + + fd = fd_create(loc->inode, frame->root->pid); + if (!fd) { + gf_log(this->name, GF_LOG_ERROR, "Can not create fd"); + goto error; + } + local->fd = fd; + local->offset = offset; + local->xdata = xdata; + STACK_WIND(frame, + truncate_begin, + this, + this->fops->open, /* crypt_open() */ + loc, + O_RDWR, + fd, + NULL); + return 0; + error: + STACK_UNWIND_STRICT(truncate, frame, -1, EINVAL, NULL, NULL, NULL); + return 0; +} + +end_writeback_handler_t dispatch_end_writeback(glusterfs_fop_t fop) +{ + switch (fop) { + case GF_FOP_WRITE: + return end_writeback_writev; + case GF_FOP_FTRUNCATE: + return end_writeback_ftruncate; + default: + gf_log("crypt", GF_LOG_WARNING, "Bad wb operation %d", fop); + return NULL; + } +} + +/* + * true, if the caller needs metadata string + */ +static int32_t is_custom_mtd(dict_t *xdata) +{ + data_t *data; + uint32_t flags; + + if (!xdata) + return 0; + + data = dict_get(xdata, MSGFLAGS_PREFIX); + if (!data) + return 0; + if (data->len != sizeof(uint32_t)) { + gf_log("crypt", GF_LOG_WARNING, + "Bad msgflags size (%d)", data->len); + return -1; + } + flags = *((uint32_t *)data->data); + return msgflags_check_mtd_lock(&flags); +} + +static int32_t crypt_open_done(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + if (op_ret < 0) + gf_log(this->name, GF_LOG_WARNING, "mtd unlock failed (%d)", + op_errno); + put_one_call_open(frame); + return 0; +} + +static void crypt_open_tail(call_frame_t *frame, xlator_t *this) +{ + struct gf_flock lock = {0, }; + crypt_local_t *local = frame->local; + + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; + + STACK_WIND(frame, + crypt_open_done, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + local->fd, + F_SETLKW, + &lock, + NULL); +} + +/* + * load private inode info at open time + * called as ->fgetxattr_cbk() + */ +static int load_mtd_open(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict, + dict_t *xdata) +{ + int32_t ret; + gf_boolean_t upload_info; + data_t *mtd; + uint64_t value = 0; + struct crypt_inode_info *info; + crypt_local_t *local = frame->local; + crypt_private_t *priv = this->private; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (local->fd->inode->ia_type == IA_IFLNK) + goto exit; + if (op_ret < 0) + goto exit; + /* + * first, check for cached info + */ + ret = inode_ctx_get(local->fd->inode, this, &value); + if (ret != -1) { + info = (struct crypt_inode_info *)(long)value; + if (info == NULL) { + gf_log(this->name, GF_LOG_WARNING, + "Inode info expected, but not found"); + local->op_ret = -1; + local->op_errno = EIO; + goto exit; + } + /* + * info has been found in the cache + */ + upload_info = _gf_false; + } + else { + /* + * info hasn't been found in the cache. + */ + info = alloc_inode_info(local, local->loc); + if (!info) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto exit; + } + init_inode_info_head(info, local->fd); + upload_info = _gf_true; + } + /* + * extract metadata + */ + mtd = dict_get(dict, CRYPTO_FORMAT_PREFIX); + if (!mtd) { + local->op_ret = -1; + local->op_errno = ENOENT; + gf_log (this->name, GF_LOG_WARNING, + "Format string wasn't found"); + goto exit; + } + /* + * authenticate metadata against the path + */ + ret = open_format((unsigned char *)mtd->data, + mtd->len, + local->loc, + info, + get_master_cinfo(priv), + local, + upload_info); + if (ret) { + local->op_ret = -1; + local->op_errno = ret; + goto exit; + } + if (upload_info) { + ret = init_inode_info_tail(info, get_master_cinfo(priv)); + if (ret) { + local->op_ret = -1; + local->op_errno = ret; + goto exit; + } + ret = inode_ctx_put(local->fd->inode, + this, (uint64_t)(long)info); + if (ret == -1) { + local->op_ret = -1; + local->op_errno = EIO; + goto exit; + } + } + if (local->custom_mtd) { + /* + * pass the metadata string to the customer + */ + ret = dict_set_static_bin(local->xdata, + CRYPTO_FORMAT_PREFIX, + mtd->data, + mtd->len); + if (ret) { + local->op_ret = -1; + local->op_errno = ret; + goto exit; + } + } + exit: + if (!local->custom_mtd) + crypt_open_tail(frame, this); + else + put_one_call_open(frame); + return 0; +} + +static int32_t crypt_open_finodelk_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, "finodelk (LOCK) failed"); + goto exit; + } + STACK_WIND(frame, + load_mtd_open, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + local->fd, + CRYPTO_FORMAT_PREFIX, + NULL); + return 0; + exit: + put_one_call_open(frame); + return 0; +} + +/* + * verify metadata against the specified pathname + */ +static int32_t crypt_open_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + dict_t *xdata) +{ + struct gf_flock lock = {0, }; + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (local->fd->inode->ia_type == IA_IFLNK) + goto exit; + if (op_ret < 0) + goto exit; + if (xdata) + local->xdata = dict_ref(xdata); + else if (local->custom_mtd){ + local->xdata = dict_new(); + if (!local->xdata) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_log ("crypt", GF_LOG_ERROR, + "Can not get new dict for mtd string"); + goto exit; + } + } + lock.l_len = 0; + lock.l_start = 0; + lock.l_type = local->custom_mtd ? F_WRLCK : F_RDLCK; + lock.l_whence = SEEK_SET; + + STACK_WIND(frame, + crypt_open_finodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + fd, + F_SETLKW, + &lock, + NULL); + return 0; + exit: + put_one_call_open(frame); + return 0; +} + +static int32_t crypt_open(call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd, + dict_t *xdata) +{ + int32_t ret = ENOMEM; + crypt_local_t *local; + + local = crypt_alloc_local(frame, this, GF_FOP_OPEN); + if (!local) + goto error; + local->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc); + if (!local->loc) { + ret = ENOMEM; + goto error; + } + memset(local->loc, 0, sizeof(*local->loc)); + ret = loc_copy(local->loc, loc); + if (ret) { + GF_FREE(local->loc); + goto error; + } + local->fd = fd_ref(fd); + + ret = is_custom_mtd(xdata); + if (ret < 0) { + loc_wipe(local->loc); + GF_FREE(local->loc); + ret = EINVAL; + goto error; + } + local->custom_mtd = ret; + + if ((flags & O_ACCMODE) == O_WRONLY) + /* + * we can't open O_WRONLY, because + * we need to do read-modify-write + */ + flags = (flags & ~O_ACCMODE) | O_RDWR; + /* + * Make sure that out translated offsets + * and counts won't be ignored + */ + flags &= ~O_APPEND; + get_one_call_nolock(frame); + STACK_WIND(frame, + crypt_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, + flags, + fd, + xdata); + return 0; + error: + STACK_UNWIND_STRICT(open, + frame, + -1, + ret, + NULL, + NULL); + return 0; +} + +static int32_t init_inode_info_tail(struct crypt_inode_info *info, + struct master_cipher_info *master) +{ + int32_t ret; + struct object_cipher_info *object = &info->cinfo; + +#if DEBUG_CRYPT + gf_log("crypt", GF_LOG_DEBUG, "Init inode info for object %s", + uuid_utoa(info->oid)); +#endif + ret = data_cipher_algs[object->o_alg][object->o_mode].set_private(info, + master); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, "Set private info failed"); + return ret; + } + return 0; +} + +/* + * Init inode info at ->create() time + */ +static void init_inode_info_create(struct crypt_inode_info *info, + struct master_cipher_info *master, + data_t *data) +{ + struct object_cipher_info *object; + + info->nr_minor = CRYPT_XLATOR_ID; + memcpy(info->oid, data->data, data->len); + + object = &info->cinfo; + + object->o_alg = master->m_alg; + object->o_mode = master->m_mode; + object->o_block_bits = master->m_block_bits; + object->o_dkey_size = master->m_dkey_size; +} + +static void init_inode_info_head(struct crypt_inode_info *info, fd_t *fd) +{ + memcpy(info->oid, fd->inode->gfid, sizeof(uuid_t)); +} + +static int32_t crypt_create_done(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + crypt_private_t *priv = this->private; + crypt_local_t *local = frame->local; + struct crypt_inode_info *info = local->info; + fd_t *local_fd = local->fd; + dict_t *local_xdata = local->xdata; + inode_t *local_inode = local->inode; + + if (op_ret < 0) { + free_inode_info(info); + goto unwind; + } + op_errno = init_inode_info_tail(info, get_master_cinfo(priv)); + if (op_errno) { + op_ret = -1; + free_inode_info(info); + goto unwind; + } + /* + * FIXME: drop major subversion number + */ + op_ret = inode_ctx_put(local->fd->inode, this, (uint64_t)(long)info); + if (op_ret == -1) { + op_errno = EIO; + free_inode_info(info); + goto unwind; + } + unwind: + free_format(local); + STACK_UNWIND_STRICT(create, + frame, + op_ret, + op_errno, + local_fd, + local_inode, + &local->buf, + &local->prebuf, + &local->postbuf, + local_xdata); + fd_unref(local_fd); + inode_unref(local_inode); + if (local_xdata) + dict_unref(local_xdata); + return 0; +} + +static int crypt_create_tail(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + struct gf_flock lock = {0, }; + crypt_local_t *local = frame->local; + fd_t *local_fd = local->fd; + dict_t *local_xdata = local->xdata; + inode_t *local_inode = local->inode; + + dict_unref(local->xattr); + + if (op_ret < 0) + goto error; + + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; + + STACK_WIND(frame, + crypt_create_done, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + local->fd, + F_SETLKW, + &lock, + NULL); + return 0; + error: + free_inode_info(local->info); + free_format(local); + + STACK_UNWIND_STRICT(create, + frame, + op_ret, + op_errno, + local_fd, + local_inode, + &local->buf, + &local->prebuf, + &local->postbuf, + local_xdata); + + fd_unref(local_fd); + inode_unref(local_inode); + if (local_xdata) + dict_unref(local_xdata); + return 0; +} + +static int32_t crypt_create_finodelk_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + struct crypt_inode_info *info = local->info; + + if (op_ret < 0) + goto error; + + STACK_WIND(frame, + crypt_create_tail, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, + local->xattr, /* CRYPTO_FORMAT_PREFIX */ + 0, + NULL); + return 0; + error: + free_inode_info(info); + free_format(local); + fd_unref(local->fd); + dict_unref(local->xattr); + if (local->xdata) + dict_unref(local->xdata); + + STACK_UNWIND_STRICT(create, + frame, + op_ret, + op_errno, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL); + return 0; +} + +/* + * Create and store crypt-specific format on disk; + * Populate cache with private inode info + */ +static int32_t crypt_create_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent, + dict_t *xdata) +{ + struct gf_flock lock = {0, }; + crypt_local_t *local = frame->local; + struct crypt_inode_info *info = local->info; + + if (op_ret < 0) + goto error; + if (xdata) + local->xdata = dict_ref(xdata); + local->inode = inode_ref(inode); + local->buf = *buf; + local->prebuf = *preparent; + local->postbuf = *postparent; + + lock.l_len = 0; + lock.l_start = 0; + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + + STACK_WIND(frame, + crypt_create_finodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + local->fd, + F_SETLKW, + &lock, + NULL); + return 0; + error: + free_inode_info(info); + free_format(local); + fd_unref(local->fd); + dict_unref(local->xattr); + + STACK_UNWIND_STRICT(create, + frame, + op_ret, + op_errno, + NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; +} + +static int32_t crypt_create(call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + mode_t umask, + fd_t *fd, + dict_t *xdata) +{ + int ret; + data_t *data; + crypt_local_t *local; + crypt_private_t *priv; + struct master_cipher_info *master; + struct crypt_inode_info *info; + + priv = this->private; + master = get_master_cinfo(priv); + + if (master_alg_atomic(master)) { + /* + * We can't open O_WRONLY, because we + * need to do read-modify-write. + */ + if ((flags & O_ACCMODE) == O_WRONLY) + flags = (flags & ~O_ACCMODE) | O_RDWR; + /* + * Make sure that out translated offsets + * and counts won't be ignored + */ + flags &= ~O_APPEND; + } + local = crypt_alloc_local(frame, this, GF_FOP_CREATE); + if (!local) { + ret = ENOMEM; + goto error; + } + data = dict_get(xdata, "gfid-req"); + if (!data) { + ret = EINVAL; + gf_log("crypt", GF_LOG_WARNING, "gfid not found"); + goto error; + } + if (data->len != sizeof(uuid_t)) { + ret = EINVAL; + gf_log("crypt", GF_LOG_WARNING, + "bad gfid size (%d), should be %d", + (int)data->len, (int)sizeof(uuid_t)); + goto error; + } + info = alloc_inode_info(local, loc); + if (!info){ + ret = ENOMEM; + goto error; + } + /* + * NOTE: + * format has to be created BEFORE + * proceeding to the untrusted server + */ + ret = alloc_format_create(local); + if (ret) { + free_inode_info(info); + goto error; + } + init_inode_info_create(info, master, data); + + ret = create_format(local->format, + loc, + info, + master); + if (ret) { + free_inode_info(info); + goto error; + } + local->xattr = dict_new(); + if (!local->xattr) { + free_inode_info(info); + free_format(local); + goto error; + } + ret = dict_set_static_bin(local->xattr, + CRYPTO_FORMAT_PREFIX, + local->format, + new_format_size()); + if (ret) { + dict_unref(local->xattr); + free_inode_info(info); + free_format(local); + goto error; + } + ret = dict_set(local->xattr, FSIZE_XATTR_PREFIX, data_from_uint64(0)); + if (ret) { + dict_unref(local->xattr); + free_inode_info(info); + free_format(local); + goto error; + } + local->fd = fd_ref(fd); + + STACK_WIND(frame, + crypt_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, + flags, + mode, + umask, + fd, + xdata); + return 0; + error: + gf_log("crypt", GF_LOG_WARNING, "can not create file"); + STACK_UNWIND_STRICT(create, + frame, + -1, + ret, + NULL, NULL, NULL, + NULL, NULL, NULL); + return 0; +} + +/* + * FIXME: this should depends on the version of format string + */ +static int32_t filter_crypt_xattr(dict_t *dict, + char *key, data_t *value, void *data) +{ + dict_del(dict, key); + return 0; +} + +static int32_t crypt_fsetxattr(call_frame_t *frame, + xlator_t *this, + fd_t *fd, + dict_t *dict, + int32_t flags, dict_t *xdata) +{ + dict_foreach_fnmatch(dict, "trusted.glusterfs.crypt*", + filter_crypt_xattr, NULL); + STACK_WIND(frame, + default_fsetxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + fd, + dict, + flags, + xdata); + return 0; +} + +/* + * TBD: verify file metadata before wind + */ +static int32_t crypt_setxattr(call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags, dict_t *xdata) +{ + dict_foreach_fnmatch(dict, "trusted.glusterfs.crypt*", + filter_crypt_xattr, NULL); + STACK_WIND(frame, + default_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, + dict, + flags, + xdata); + return 0; +} + +/* + * called as flush_cbk() + */ +static int32_t linkop_end(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + linkop_unwind_handler_t unwind_fn; + unwind_fn = linkop_unwind_dispatch(local->fop); + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret < 0 && + op_errno == ENOENT && + local->loc->inode->ia_type == IA_IFLNK) { + local->op_ret = 0; + local->op_errno = 0; + } + unwind_fn(frame); + return 0; +} + +/* + * unpin inode on the server + */ +static int32_t link_flush(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + if (op_ret < 0) + goto error; + if (local->xdata) { + dict_unref(local->xdata); + local->xdata = NULL; + } + if (xdata) + local->xdata = dict_ref(xdata); + local->inode = inode_ref(inode); + local->buf = *buf; + local->prebuf = *preparent; + local->postbuf = *postparent; + + STACK_WIND(frame, + linkop_end, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + local->fd, + NULL); + return 0; + error: + local->op_ret = -1; + local->op_errno = op_errno; + link_unwind(frame); + return 0; +} + +void link_unwind(call_frame_t *frame) +{ + crypt_local_t *local = frame->local; + dict_t *xdata; + dict_t *xattr; + inode_t *inode; + + if (!local) { + STACK_UNWIND_STRICT(link, + frame, + -1, + ENOMEM, + NULL, + NULL, + NULL, + NULL, + NULL); + return; + } + xdata = local->xdata; + xattr = local->xattr; + inode = local->inode; + + if (local->loc){ + loc_wipe(local->loc); + GF_FREE(local->loc); + } + if (local->newloc) { + loc_wipe(local->newloc); + GF_FREE(local->newloc); + } + if (local->fd) + fd_unref(local->fd); + if (local->format) + GF_FREE(local->format); + + STACK_UNWIND_STRICT(link, + frame, + local->op_ret, + local->op_errno, + inode, + &local->buf, + &local->prebuf, + &local->postbuf, + xdata); + if (xdata) + dict_unref(xdata); + if (xattr) + dict_unref(xattr); + if (inode) + inode_unref(inode); +} + +void link_wind(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + + STACK_WIND(frame, + link_flush, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, + local->loc, + local->newloc, + local->xdata); +} + +/* + * unlink() + */ +static int32_t unlink_flush(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + if (op_ret < 0) + goto error; + local->prebuf = *preparent; + local->postbuf = *postparent; + if (local->xdata) { + dict_unref(local->xdata); + local->xdata = NULL; + } + if (xdata) + local->xdata = dict_ref(xdata); + + STACK_WIND(frame, + linkop_end, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + local->fd, + NULL); + return 0; + error: + local->op_ret = -1; + local->op_errno = op_errno; + unlink_unwind(frame); + return 0; +} + +void unlink_unwind(call_frame_t *frame) +{ + crypt_local_t *local = frame->local; + dict_t *xdata; + dict_t *xattr; + + if (!local) { + STACK_UNWIND_STRICT(unlink, + frame, + -1, + ENOMEM, + NULL, + NULL, + NULL); + return; + } + xdata = local->xdata; + xattr = local->xattr; + if (local->loc){ + loc_wipe(local->loc); + GF_FREE(local->loc); + } + if (local->fd) + fd_unref(local->fd); + if (local->format) + GF_FREE(local->format); + + STACK_UNWIND_STRICT(unlink, + frame, + local->op_ret, + local->op_errno, + &local->prebuf, + &local->postbuf, + xdata); + if (xdata) + dict_unref(xdata); + if (xattr) + dict_unref(xattr); +} + +void unlink_wind(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + + STACK_WIND(frame, + unlink_flush, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + local->loc, + local->flags, + local->xdata); +} + +void rename_unwind(call_frame_t *frame) +{ + crypt_local_t *local = frame->local; + dict_t *xdata; + dict_t *xattr; + struct iatt *prenewparent; + struct iatt *postnewparent; + + if (!local) { + STACK_UNWIND_STRICT(rename, + frame, + -1, + ENOMEM, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL); + return; + } + xdata = local->xdata; + xattr = local->xattr; + prenewparent = local->prenewparent; + postnewparent = local->postnewparent; + + if (local->loc){ + loc_wipe(local->loc); + GF_FREE(local->loc); + } + if (local->newloc){ + loc_wipe(local->newloc); + GF_FREE(local->newloc); + } + if (local->fd) + fd_unref(local->fd); + if (local->format) + GF_FREE(local->format); + + STACK_UNWIND_STRICT(rename, + frame, + local->op_ret, + local->op_errno, + &local->buf, + &local->prebuf, + &local->postbuf, + prenewparent, + postnewparent, + xdata); + if (xdata) + dict_unref(xdata); + if (xattr) + dict_unref(xattr); + if (prenewparent) + GF_FREE(prenewparent); + if (postnewparent) + GF_FREE(postnewparent); +} + +/* + * called as flush_cbk() + */ +static int32_t rename_end(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + local->op_ret = op_ret; + local->op_errno = op_errno; + + rename_unwind(frame); + return 0; +} + +static int32_t rename_flush(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *buf, + struct iatt *preoldparent, + struct iatt *postoldparent, + struct iatt *prenewparent, + struct iatt *postnewparent, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + if (op_ret < 0) + goto error; + dict_unref(local->xdata); + local->xdata = NULL; + if (xdata) + local->xdata = dict_ref(xdata); + + local->buf = *buf; + local->prebuf = *preoldparent; + local->postbuf = *postoldparent; + if (prenewparent) { + local->prenewparent = GF_CALLOC(1, sizeof(*prenewparent), + gf_crypt_mt_iatt); + if (!local->prenewparent) { + op_errno = ENOMEM; + goto error; + } + *local->prenewparent = *prenewparent; + } + if (postnewparent) { + local->postnewparent = GF_CALLOC(1, sizeof(*postnewparent), + gf_crypt_mt_iatt); + if (!local->postnewparent) { + op_errno = ENOMEM; + goto error; + } + *local->postnewparent = *postnewparent; + } + STACK_WIND(frame, + rename_end, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + local->fd, + NULL); + return 0; + error: + local->op_ret = -1; + local->op_errno = op_errno; + rename_unwind(frame); + return 0; +} + +void rename_wind(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + + STACK_WIND(frame, + rename_flush, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, + local->loc, + local->newloc, + local->xdata); +} + +static int32_t __do_linkop(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + crypt_local_t *local = frame->local; + linkop_wind_handler_t wind_fn; + linkop_unwind_handler_t unwind_fn; + + wind_fn = linkop_wind_dispatch(local->fop); + unwind_fn = linkop_unwind_dispatch(local->fop); + + local->op_ret = op_ret; + local->op_errno = op_errno; + + if (op_ret >= 0) + wind_fn(frame, this); + else { + gf_log(this->name, GF_LOG_WARNING, "mtd unlock failed (%d)", + op_errno); + unwind_fn(frame); + } + return 0; +} + +static int32_t do_linkop(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + struct gf_flock lock = {0, }; + crypt_local_t *local = frame->local; + linkop_unwind_handler_t unwind_fn; + + unwind_fn = linkop_unwind_dispatch(local->fop); + local->op_ret = op_ret; + local->op_errno = op_errno; + + if(op_ret < 0) + goto error; + + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; + + STACK_WIND(frame, + __do_linkop, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + local->fd, + F_SETLKW, + &lock, + NULL); + return 0; + error: + unwind_fn(frame); + return 0; +} + +/* + * Update the metadata string (against the new pathname); + * submit the result + */ +static int32_t linkop_begin(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + dict_t *xdata) +{ + gf_boolean_t upload_info; + crypt_local_t *local = frame->local; + crypt_private_t *priv = this->private; + struct crypt_inode_info *info; + data_t *old_mtd; + uint32_t new_mtd_size; + uint64_t value = 0; + void (*unwind_fn)(call_frame_t *frame); + void (*wind_fn)(call_frame_t *frame, xlator_t *this); + mtd_op_t mop; + + wind_fn = linkop_wind_dispatch(local->fop); + unwind_fn = linkop_unwind_dispatch(local->fop); + mop = linkop_mtdop_dispatch(local->fop); + + if (local->fd->inode->ia_type == IA_IFLNK) + goto wind; + if (op_ret < 0) + /* + * verification failed + */ + goto error; + + old_mtd = dict_get(xdata, CRYPTO_FORMAT_PREFIX); + if (!old_mtd) { + op_errno = EIO; + gf_log (this->name, GF_LOG_DEBUG, + "Metadata string wasn't found"); + goto error; + } + new_mtd_size = format_size(mop, old_mtd->len); + op_errno = alloc_format(local, new_mtd_size); + if (op_errno) + goto error; + /* + * check for cached info + */ + op_ret = inode_ctx_get(fd->inode, this, &value); + if (op_ret != -1) { + info = (struct crypt_inode_info *)(long)value; + if (info == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "Inode info was not found"); + op_errno = EINVAL; + goto error; + } + /* + * info was found in the cache + */ + local->info = info; + upload_info = _gf_false; + } + else { + /* + * info wasn't found in the cache; + */ + info = alloc_inode_info(local, local->loc); + if (!info) + goto error; + init_inode_info_head(info, fd); + local->info = info; + upload_info = _gf_true; + } + op_errno = open_format((unsigned char *)old_mtd->data, + old_mtd->len, + local->loc, + info, + get_master_cinfo(priv), + local, + upload_info); + if (op_errno) + goto error; + if (upload_info == _gf_true) { + op_errno = init_inode_info_tail(info, + get_master_cinfo(priv)); + if (op_errno) + goto error; + op_errno = inode_ctx_put(fd->inode, this, + (uint64_t)(long)(info)); + if (op_errno == -1) { + op_errno = EIO; + goto error; + } + } + /* + * update the format string (append/update/cup a MAC) + */ + op_errno = update_format(local->format, + (unsigned char *)old_mtd->data, + old_mtd->len, + local->mac_idx, + mop, + local->newloc, + info, + get_master_cinfo(priv), + local); + if (op_errno) + goto error; + /* + * store the new format string on the server + */ + if (new_mtd_size) { + op_errno = dict_set_static_bin(local->xattr, + CRYPTO_FORMAT_PREFIX, + local->format, + new_mtd_size); + if (op_errno) + goto error; + } + STACK_WIND(frame, + do_linkop, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + local->loc, + local->xattr, + 0, + NULL); + return 0; + wind: + wind_fn(frame, this); + return 0; + error: + local->op_ret = -1; + local->op_errno = op_errno; + unwind_fn(frame); + return 0; +} + +static int32_t linkop_grab_local(call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc, + int flags, dict_t *xdata, + glusterfs_fop_t op) +{ + int32_t ret = ENOMEM; + fd_t *fd; + crypt_local_t *local; + + local = crypt_alloc_local(frame, this, op); + if (!local) + goto error; + if (xdata) + local->xdata = dict_ref(xdata); + + fd = fd_create(oldloc->inode, frame->root->pid); + if (!fd) { + gf_log(this->name, GF_LOG_ERROR, "Can not create fd"); + goto error; + } + local->fd = fd; + local->flags = flags; + local->loc = GF_CALLOC(1, sizeof(*oldloc), gf_crypt_mt_loc); + if (!local->loc) + goto error; + memset(local->loc, 0, sizeof(*local->loc)); + ret = loc_copy(local->loc, oldloc); + if (ret) { + GF_FREE(local->loc); + local->loc = NULL; + goto error; + } + if (newloc) { + local->newloc = GF_CALLOC(1, sizeof(*newloc), gf_crypt_mt_loc); + if (!local->newloc) { + GF_FREE(local->loc); + loc_wipe(local->loc); + goto error; + } + memset(local->newloc, 0, sizeof(*local->newloc)); + ret = loc_copy(local->newloc, newloc); + if (ret) { + GF_FREE(local->loc); + loc_wipe(local->loc); + GF_FREE(local->newloc); + goto error; + } + } + local->xattr = dict_new(); + if (!local->xattr) { + gf_log(this->name, GF_LOG_ERROR, "Can not create dict"); + ret = ENOMEM; + goto error; + } + return 0; + error: + if (local->xdata) + dict_unref(local->xdata); + if (local->fd) + fd_unref(local->fd); + local->fd = 0; + local->loc = NULL; + local->newloc = NULL; + + local->op_ret = -1; + local->op_errno = ret; + + return ret; +} + +/* + * read and verify locked metadata against the old pathname (via open); + * update the metadata string in accordance with the new pathname; + * submit modified metadata; + * wind; + */ +static int32_t linkop(call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc, + int flags, + dict_t *xdata, + glusterfs_fop_t op) +{ + int32_t ret; + dict_t *dict; + crypt_local_t *local; + void (*unwind_fn)(call_frame_t *frame); + + unwind_fn = linkop_unwind_dispatch(op); + + ret = linkop_grab_local(frame, this, oldloc, newloc, flags, xdata, op); + local = frame->local; + if (ret) + goto error; + dict = dict_new(); + if (!dict) { + gf_log(this->name, GF_LOG_ERROR, "Can not create dict"); + ret = ENOMEM; + goto error; + } + /* + * Set a message to crypt_open() that we need + * locked metadata string. + * All link operations (link, unlink, rename) + * need write lock + */ + msgflags_set_mtd_wlock(&local->msgflags); + ret = dict_set_static_bin(dict, + MSGFLAGS_PREFIX, + &local->msgflags, + sizeof(local->msgflags)); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Can not set dict"); + dict_unref(dict); + goto error; + } + /* + * verify metadata against the old pathname + * and retrieve locked metadata string + */ + STACK_WIND(frame, + linkop_begin, + this, + this->fops->open, /* crypt_open() */ + oldloc, + O_RDWR, + local->fd, + dict); + dict_unref(dict); + return 0; + error: + local->op_ret = -1; + local->op_errno = ret; + unwind_fn(frame); + return 0; +} + +static int32_t crypt_link(call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + return linkop(frame, this, oldloc, newloc, 0, xdata, GF_FOP_LINK); +} + +static int32_t crypt_unlink(call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, dict_t *xdata) +{ + return linkop(frame, this, loc, NULL, flags, xdata, GF_FOP_UNLINK); +} + +static int32_t crypt_rename(call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + return linkop(frame, this, oldloc, newloc, 0, xdata, GF_FOP_RENAME); +} + +static void put_one_call_open(call_frame_t *frame) +{ + crypt_local_t *local = frame->local; + if (put_one_call(local)) { + fd_t *fd = local->fd; + loc_t *loc = local->loc; + dict_t *xdata = local->xdata; + + STACK_UNWIND_STRICT(open, + frame, + local->op_ret, + local->op_errno, + fd, + xdata); + fd_unref(fd); + if (xdata) + dict_unref(xdata); + loc_wipe(loc); + GF_FREE(loc); + } +} + +static int32_t __crypt_readv_done(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + crypt_local_t *local = frame->local; + fd_t *local_fd = local->fd; + dict_t *local_xdata = local->xdata; + /* read deals with data configs only */ + struct iovec *avec = local->data_conf.avec; + char **pool = local->data_conf.pool; + int blocks_in_pool = local->data_conf.blocks_in_pool; + struct iobref *iobref = local->iobref; + struct iobref *iobref_data = local->iobref_data; + + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "readv unlock failed (%d)", op_errno); + if (local->op_ret >= 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } + dump_plain_text(local, avec); + + gf_log("crypt", GF_LOG_DEBUG, + "readv: ret_to_user: %d, iovec len: %d, ia_size: %llu", + (int)(local->rw_count > 0 ? local->rw_count : local->op_ret), + (int)(local->rw_count > 0 ? iovec_get_size(avec, local->data_conf.acount) : 0), + (unsigned long long)local->buf.ia_size); + + STACK_UNWIND_STRICT(readv, + frame, + local->rw_count > 0 ? local->rw_count : local->op_ret, + local->op_errno, + avec, + avec ? local->data_conf.acount : 0, + &local->buf, + local->iobref, + local_xdata); + + free_avec(avec, pool, blocks_in_pool); + fd_unref(local_fd); + if (local_xdata) + dict_unref(local_xdata); + if (iobref) + iobref_unref(iobref); + if (iobref_data) + iobref_unref(iobref_data); + return 0; +} + +static void crypt_readv_done(call_frame_t *frame, xlator_t *this) +{ + if (parent_is_crypt_xlator(frame, this)) + /* + * don't unlock (it will be done by the parent) + */ + __crypt_readv_done(frame, NULL, this, 0, 0, NULL); + else { + crypt_local_t *local = frame->local; + struct gf_flock lock = {0, }; + + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; + + STACK_WIND(frame, + __crypt_readv_done, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + local->fd, + F_SETLKW, + &lock, + NULL); + } +} + +static void put_one_call_readv(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + if (put_one_call(local)) + crypt_readv_done(frame, this); +} + +static int32_t __crypt_writev_done(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + crypt_local_t *local = frame->local; + fd_t *local_fd = local->fd; + dict_t *local_xdata = local->xdata; + int32_t ret_to_user; + + if (local->xattr) + dict_unref(local->xattr); + /* + * Calculate amout of butes to be returned + * to user. We need to subtract paddings that + * have been written as a part of atom. + */ + /* + * subtract head padding + */ + if (local->rw_count == 0) + /* + * Nothing has been written, it must be an error + */ + ret_to_user = local->op_ret; + else if (local->rw_count <= local->data_conf.off_in_head) { + gf_log("crypt", GF_LOG_WARNING, "Incomplete write"); + ret_to_user = 0; + } + else + ret_to_user = local->rw_count - + local->data_conf.off_in_head; + /* + * subtract tail padding + */ + if (ret_to_user > local->data_conf.orig_size) + ret_to_user = local->data_conf.orig_size; + + if (local->iobref) + iobref_unref(local->iobref); + if (local->iobref_data) + iobref_unref(local->iobref_data); + free_avec_data(local); + free_avec_hole(local); + + gf_log("crypt", GF_LOG_DEBUG, + "writev: ret_to_user: %d", ret_to_user); + + STACK_UNWIND_STRICT(writev, + frame, + ret_to_user, + local->op_errno, + &local->prebuf, + &local->postbuf, + local_xdata); + fd_unref(local_fd); + if (local_xdata) + dict_unref(local_xdata); + return 0; +} + +static int32_t crypt_writev_done(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + if (op_ret < 0) + gf_log("crypt", GF_LOG_WARNING, "can not update file size"); + + if (parent_is_crypt_xlator(frame, this)) + /* + * don't unlock (it will be done by the parent) + */ + __crypt_writev_done(frame, NULL, this, 0, 0, NULL); + else { + struct gf_flock lock = {0, }; + + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; + + STACK_WIND(frame, + __crypt_writev_done, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + local->fd, + F_SETLKW, + &lock, + NULL); + } + return 0; +} + +static void put_one_call_writev(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + if (put_one_call(local)) { + if (local->update_disk_file_size) { + int32_t ret; + /* + * update file size, unlock the file and unwind + */ + ret = dict_set(local->xattr, + FSIZE_XATTR_PREFIX, + data_from_uint64(local->cur_file_size)); + if (ret) { + gf_log("crypt", GF_LOG_WARNING, + "can not set key to update file size"); + crypt_writev_done(frame, NULL, + this, 0, 0, NULL); + return; + } + gf_log("crypt", GF_LOG_DEBUG, + "Updating disk file size to %llu", + (unsigned long long)local->cur_file_size); + STACK_WIND(frame, + crypt_writev_done, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, + local->xattr, /* CRYPTO_FORMAT_PREFIX */ + 0, + NULL); + } + else + crypt_writev_done(frame, NULL, this, 0, 0, NULL); + } +} + +static int32_t __crypt_ftruncate_done(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + crypt_local_t *local = frame->local; + fd_t *local_fd = local->fd; + dict_t *local_xdata = local->xdata; + char *iobase = local->vec.iov_base; + + if (op_ret < 0) { + gf_log(this->name, GF_LOG_WARNING, + "ftruncate unlock failed (%d)", op_errno); + if (local->op_ret >= 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } + if (local->iobref_data) + iobref_unref(local->iobref_data); + free_avec_data(local); + free_avec_hole(local); + + gf_log("crypt", GF_LOG_DEBUG, + "ftruncate, return to user: presize=%llu, postsize=%llu", + (unsigned long long)local->prebuf.ia_size, + (unsigned long long)local->postbuf.ia_size); + + STACK_UNWIND_STRICT(ftruncate, + frame, + local->op_ret < 0 ? -1 : 0, + local->op_errno, + &local->prebuf, + &local->postbuf, + local_xdata); + fd_unref(local_fd); + if (local_xdata) + dict_unref(local_xdata); + if (iobase) + GF_FREE(iobase); + return 0; +} + +static int32_t crypt_ftruncate_done(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xdata) +{ + crypt_local_t *local = frame->local; + struct gf_flock lock = {0, }; + + dict_unref(local->xattr); + if (op_ret < 0) + gf_log("crypt", GF_LOG_WARNING, "can not update file size"); + + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; + + STACK_WIND(frame, + __crypt_ftruncate_done, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + this->name, + local->fd, + F_SETLKW, + &lock, + NULL); + return 0; +} + +static void put_one_call_ftruncate(call_frame_t *frame, xlator_t *this) +{ + crypt_local_t *local = frame->local; + if (put_one_call(local)) { + if (local->update_disk_file_size) { + int32_t ret; + /* + * update file size, unlock the file and unwind + */ + ret = dict_set(local->xattr, + FSIZE_XATTR_PREFIX, + data_from_uint64(local->cur_file_size)); + if (ret) { + gf_log("crypt", GF_LOG_WARNING, + "can not set key to update file size"); + crypt_ftruncate_done(frame, NULL, + this, 0, 0, NULL); + return; + } + gf_log("crypt", GF_LOG_DEBUG, + "Updating disk file size to %llu", + (unsigned long long)local->cur_file_size); + STACK_WIND(frame, + crypt_ftruncate_done, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, + local->xattr, /* CRYPTO_FORMAT_PREFIX */ + 0, + NULL); + } + else + crypt_ftruncate_done(frame, NULL, this, 0, 0, NULL); + } +} + +/* + * load regular file size for some FOPs + */ +static int32_t load_file_size(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict, + dict_t *xdata) +{ + data_t *data; + crypt_local_t *local = frame->local; + + dict_t *local_xdata = local->xdata; + inode_t *local_inode = local->inode; + + if (op_ret < 0) + goto unwind; + /* + * load regular file size + */ + data = dict_get(dict, FSIZE_XATTR_PREFIX); + if (!data) { + if (local->xdata) + dict_unref(local->xdata); + gf_log("crypt", GF_LOG_WARNING, "Regular file size not found"); + op_ret = -1; + op_errno = EIO; + goto unwind; + } + local->buf.ia_size = data_to_uint64(data); + + gf_log(this->name, GF_LOG_DEBUG, + "FOP %d: Translate regular file to %llu", + local->fop, + (unsigned long long)local->buf.ia_size); + unwind: + if (local->fd) + fd_unref(local->fd); + if (local->loc) { + loc_wipe(local->loc); + GF_FREE(local->loc); + } + switch (local->fop) { + case GF_FOP_FSTAT: + STACK_UNWIND_STRICT(fstat, + frame, + op_ret, + op_errno, + op_ret >= 0 ? &local->buf : NULL, + local->xdata); + break; + case GF_FOP_STAT: + STACK_UNWIND_STRICT(stat, + frame, + op_ret, + op_errno, + op_ret >= 0 ? &local->buf : NULL, + local->xdata); + break; + case GF_FOP_LOOKUP: + STACK_UNWIND_STRICT(lookup, + frame, + op_ret, + op_errno, + op_ret >= 0 ? local->inode : NULL, + op_ret >= 0 ? &local->buf : NULL, + local->xdata, + op_ret >= 0 ? &local->postbuf : NULL); + break; + case GF_FOP_READ: + STACK_UNWIND_STRICT(readv, + frame, + op_ret, + op_errno, + NULL, + 0, + op_ret >= 0 ? &local->buf : NULL, + NULL, + NULL); + break; + default: + gf_log(this->name, GF_LOG_WARNING, + "Improper file operation %d", local->fop); + } + if (local_xdata) + dict_unref(local_xdata); + if (local_inode) + inode_unref(local_inode); + return 0; +} + +static int32_t crypt_stat_common_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *buf, dict_t *xdata) +{ + crypt_local_t *local = frame->local; + + if (op_ret < 0) + goto unwind; + if (!IA_ISREG(buf->ia_type)) + goto unwind; + + local->buf = *buf; + if (xdata) + local->xdata = dict_ref(xdata); + + switch (local->fop) { + case GF_FOP_FSTAT: + STACK_WIND(frame, + load_file_size, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + local->fd, + FSIZE_XATTR_PREFIX, + NULL); + break; + case GF_FOP_STAT: + STACK_WIND(frame, + load_file_size, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + local->loc, + FSIZE_XATTR_PREFIX, + NULL); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "Improper file operation %d", local->fop); + } + return 0; + unwind: + if (local->fd) + fd_unref(local->fd); + if (local->loc) { + loc_wipe(local->loc); + GF_FREE(local->loc); + } + switch (local->fop) { + case GF_FOP_FSTAT: + STACK_UNWIND_STRICT(fstat, + frame, + op_ret, + op_errno, + op_ret >= 0 ? buf : NULL, + op_ret >= 0 ? xdata : NULL); + break; + case GF_FOP_STAT: + STACK_UNWIND_STRICT(stat, + frame, + op_ret, + op_errno, + op_ret >= 0 ? buf : NULL, + op_ret >= 0 ? xdata : NULL); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "Improper file operation %d", local->fop); + } + return 0; +} + +static int32_t crypt_fstat(call_frame_t *frame, + xlator_t *this, + fd_t *fd, dict_t *xdata) +{ + crypt_local_t *local; + + local = crypt_alloc_local(frame, this, GF_FOP_FSTAT); + if (!local) + goto error; + local->fd = fd_ref(fd); + STACK_WIND(frame, + crypt_stat_common_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd, + xdata); + return 0; + error: + STACK_UNWIND_STRICT(fstat, + frame, + -1, + ENOMEM, + NULL, + NULL); + return 0; +} + +static int32_t crypt_stat(call_frame_t *frame, + xlator_t *this, + loc_t *loc, dict_t *xdata) +{ + int32_t ret; + crypt_local_t *local; + + local = crypt_alloc_local(frame, this, GF_FOP_STAT); + if (!local) + goto error; + local->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc); + if (!local->loc) + goto error; + memset(local->loc, 0, sizeof(*local->loc)); + ret = loc_copy(local->loc, loc); + if (ret) { + GF_FREE(local->loc); + goto error; + } + STACK_WIND(frame, + crypt_stat_common_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc, + xdata); + return 0; + error: + STACK_UNWIND_STRICT(stat, + frame, + -1, + ENOMEM, + NULL, + NULL); + return 0; +} + +static int32_t crypt_lookup_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + crypt_local_t *local = frame->local; + + if (op_ret < 0) + goto unwind; + if (!IA_ISREG(buf->ia_type)) + goto unwind; + + local->inode = inode_ref(inode); + local->buf = *buf; + local->postbuf = *postparent; + if (xdata) + local->xdata = dict_ref(xdata); + uuid_copy(local->loc->gfid, buf->ia_gfid); + + STACK_WIND(frame, + load_file_size, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + local->loc, + FSIZE_XATTR_PREFIX, + NULL); + return 0; + unwind: + loc_wipe(local->loc); + GF_FREE(local->loc); + STACK_UNWIND_STRICT(lookup, + frame, + op_ret, + op_errno, + inode, + buf, + xdata, + postparent); + return 0; +} + +static int32_t crypt_lookup(call_frame_t *frame, + xlator_t *this, + loc_t *loc, dict_t *xdata) +{ + int32_t ret; + crypt_local_t *local; + + local = crypt_alloc_local(frame, this, GF_FOP_LOOKUP); + if (!local) + goto error; + local->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc); + if (!local->loc) + goto error; + memset(local->loc, 0, sizeof(*local->loc)); + ret = loc_copy(local->loc, loc); + if (ret) { + GF_FREE(local->loc); + goto error; + } + gf_log(this->name, GF_LOG_DEBUG, "Lookup %s", loc->path); + STACK_WIND(frame, + crypt_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, + xdata); + return 0; + error: + STACK_UNWIND_STRICT(lookup, + frame, + -1, + ENOMEM, + NULL, + NULL, + NULL, + NULL); + return 0; +} + +/* + * for every regular directory entry find its real file size + * and update stat's buf properly + */ +static int32_t crypt_readdirp_cbk(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + + if (op_ret < 0) + goto unwind; + + list_for_each_entry (entry, (&entries->list), list) { + data_t *data; + + if (!IA_ISREG(entry->d_stat.ia_type)) + continue; + data = dict_get(entry->dict, FSIZE_XATTR_PREFIX); + if (!data){ + gf_log("crypt", GF_LOG_WARNING, + "Regular file size of direntry not found"); + op_errno = EIO; + op_ret = -1; + break; + } + entry->d_stat.ia_size = data_to_uint64(data); + } + unwind: + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +/* + * ->readdirp() fills in-core inodes, so we need to set proper + * file sizes for all directory entries of the parent @fd. + * Actual updates take place in ->crypt_readdirp_cbk() + */ +static int32_t crypt_readdirp(call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, + dict_t *xdata) +{ + int32_t ret = ENOMEM; + + if (!xdata) { + xdata = dict_new(); + if (!xdata) + goto error; + } + else + dict_ref(xdata); + /* + * make sure that we'll have real file sizes at ->readdirp_cbk() + */ + ret = dict_set(xdata, FSIZE_XATTR_PREFIX, data_from_uint64(0)); + if (ret) { + dict_unref(xdata); + goto error; + } + STACK_WIND(frame, + crypt_readdirp_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, + fd, + size, + offset, + xdata); + dict_unref(xdata); + return 0; + error: + STACK_UNWIND_STRICT(readdirp, frame, -1, ret, NULL, NULL); + return 0; +} + +static int32_t crypt_access(call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask, dict_t *xdata) +{ + gf_log(this->name, GF_LOG_WARNING, + "NFS mounts of encrypted volumes are unsupported"); + STACK_UNWIND_STRICT(access, frame, -1, EPERM, NULL); + return 0; +} + +int32_t master_set_block_size (xlator_t *this, crypt_private_t *priv, + dict_t *options) +{ + uint64_t block_size = 0; + struct master_cipher_info *master = get_master_cinfo(priv); + + if (options != NULL) + GF_OPTION_RECONF("block-size", block_size, options, + size, error); + else + GF_OPTION_INIT("block-size", block_size, size, error); + + switch (block_size) { + case 512: + master->m_block_bits = 9; + break; + case 1024: + master->m_block_bits = 10; + break; + case 2048: + master->m_block_bits = 11; + break; + case 4096: + master->m_block_bits = 12; + break; + default: + gf_log("crypt", GF_LOG_ERROR, + "FATAL: unsupported block size %llu", + (unsigned long long)block_size); + goto error; + } + return 0; + error: + return -1; +} + +int32_t master_set_alg(xlator_t *this, crypt_private_t *priv) +{ + struct master_cipher_info *master = get_master_cinfo(priv); + master->m_alg = AES_CIPHER_ALG; + return 0; +} + +int32_t master_set_mode(xlator_t *this, crypt_private_t *priv) +{ + struct master_cipher_info *master = get_master_cinfo(priv); + master->m_mode = XTS_CIPHER_MODE; + return 0; +} + +/* + * set key size in bits to the master info + * Pre-conditions: cipher mode in the master info is uptodate. + */ +static int master_set_data_key_size (xlator_t *this, crypt_private_t *priv, + dict_t *options) +{ + int32_t ret; + uint64_t key_size = 0; + struct master_cipher_info *master = get_master_cinfo(priv); + + if (options != NULL) + GF_OPTION_RECONF("data-key-size", key_size, options, + size, error); + else + GF_OPTION_INIT("data-key-size", key_size, size, error); + + ret = data_cipher_algs[master->m_alg][master->m_mode].check_key(key_size); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, + "FATAL: wrong bin key size %llu for alg %d mode %d", + (unsigned long long)key_size, + (int)master->m_alg, + (int)master->m_mode); + goto error; + } + master->m_dkey_size = key_size; + return 0; + error: + return -1; +} + +static int is_hex(char *s) { + return ('0' <= *s && *s <= '9') || ('a' <= *s && *s <= 'f'); +} + +static int parse_hex_buf(xlator_t *this, char *src, unsigned char *dst, + int hex_size) +{ + int i; + int hex_byte = 0; + + for (i = 0; i < (hex_size / 2); i++) { + if (!is_hex(src + i*2) || !is_hex(src + i*2 + 1)) { + gf_log("crypt", GF_LOG_ERROR, + "FATAL: not hex symbol in key"); + return -1; + } + if (sscanf(src + i*2, "%2x", &hex_byte) != 1) { + gf_log("crypt", GF_LOG_ERROR, + "FATAL: can not parse hex key"); + return -1; + } + dst[i] = hex_byte & 0xff; + } + return 0; +} + +/* + * Parse options; + * install master volume key + */ +int32_t master_set_master_vol_key(xlator_t *this, crypt_private_t *priv) +{ + int32_t ret; + FILE *file = NULL; + + int32_t key_size; + char *opt_key_file_pathname = NULL; + + unsigned char bin_buf[MASTER_VOL_KEY_SIZE]; + char hex_buf[2 * MASTER_VOL_KEY_SIZE]; + + struct master_cipher_info *master = get_master_cinfo(priv); + /* + * extract master key passed via option + */ + GF_OPTION_INIT("master-key", opt_key_file_pathname, path, bad_key); + + if (!opt_key_file_pathname) { + gf_log(this->name, GF_LOG_ERROR, "FATAL: missing master key"); + return -1; + } + gf_log(this->name, GF_LOG_DEBUG, "handling file key %s", + opt_key_file_pathname); + + file = fopen(opt_key_file_pathname, "r"); + if (file == NULL) { + gf_log(this->name, GF_LOG_ERROR, + "FATAL: can not open file with master key"); + return -1; + } + /* + * extract hex key + */ + key_size = fread(hex_buf, 1, sizeof(hex_buf), file); + if (key_size < sizeof(hex_buf)) { + gf_log(this->name, GF_LOG_ERROR, + "FATAL: master key is too short"); + goto bad_key; + } + ret = parse_hex_buf(this, hex_buf, bin_buf, key_size); + if (ret) + goto bad_key; + memcpy(master->m_key, bin_buf, MASTER_VOL_KEY_SIZE); + memset(hex_buf, 0, sizeof(hex_buf)); + fclose(file); + + memset(bin_buf, 0, sizeof(bin_buf)); + return 0; + bad_key: + gf_log(this->name, GF_LOG_ERROR, "FATAL: bad master key"); + if (file) + fclose(file); + memset(bin_buf, 0, sizeof(bin_buf)); + return -1; +} + +/* + * Derive volume key for object-id authentication + */ +int32_t master_set_nmtd_vol_key(xlator_t *this, crypt_private_t *priv) +{ + return get_nmtd_vol_key(get_master_cinfo(priv)); +} + +int32_t crypt_init_xlator(xlator_t *this) +{ + int32_t ret; + crypt_private_t *priv = this->private; + + ret = master_set_alg(this, priv); + if (ret) + return ret; + ret = master_set_mode(this, priv); + if (ret) + return ret; + ret = master_set_block_size(this, priv, NULL); + if (ret) + return ret; + ret = master_set_data_key_size(this, priv, NULL); + if (ret) + return ret; + ret = master_set_master_vol_key(this, priv); + if (ret) + return ret; + return master_set_nmtd_vol_key(this, priv); +} + +static int32_t crypt_alloc_private(xlator_t *this) +{ + this->private = GF_CALLOC(1, sizeof(crypt_private_t), gf_crypt_mt_priv); + if (!this->private) { + gf_log("crypt", GF_LOG_ERROR, + "Can not allocate memory for private data"); + return ENOMEM; + } + return 0; +} + +static void crypt_free_private(xlator_t *this) +{ + crypt_private_t *priv = this->private; + if (priv) { + memset(priv, 0, sizeof(*priv)); + GF_FREE(priv); + } +} + +int32_t reconfigure (xlator_t *this, dict_t *options) +{ + int32_t ret = -1; + crypt_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("crypt", this, error); + GF_VALIDATE_OR_GOTO (this->name, this->private, error); + GF_VALIDATE_OR_GOTO (this->name, options, error); + + priv = this->private; + + ret = master_set_block_size(this, priv, options); + if (ret) { + gf_log("this->name", GF_LOG_ERROR, + "Failed to reconfure block size"); + goto error; + } + ret = master_set_data_key_size(this, priv, options); + if (ret) { + gf_log("this->name", GF_LOG_ERROR, + "Failed to reconfure data key size"); + goto error; + } + return 0; + error: + return ret; +} + +int32_t init(xlator_t *this) +{ + int32_t ret; + + if (!this->children || this->children->next) { + gf_log ("crypt", GF_LOG_ERROR, + "FATAL: crypt should have exactly one child"); + return EINVAL; + } + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + ret = crypt_alloc_private(this); + if (ret) + return ret; + ret = crypt_init_xlator(this); + if (ret) + goto error; + this->local_pool = mem_pool_new(crypt_local_t, 64); + if (!this->local_pool) { + gf_log(this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + ret = ENOMEM; + goto error; + } + gf_log ("crypt", GF_LOG_INFO, "crypt xlator loaded"); + return 0; + error: + crypt_free_private(this); + return ret; +} + +void fini (xlator_t *this) +{ + crypt_free_private(this); +} + +struct xlator_fops fops = { + .readv = crypt_readv, + .writev = crypt_writev, + .truncate = crypt_truncate, + .ftruncate = crypt_ftruncate, + .setxattr = crypt_setxattr, + .fsetxattr = crypt_fsetxattr, + .link = crypt_link, + .unlink = crypt_unlink, + .rename = crypt_rename, + .open = crypt_open, + .create = crypt_create, + .stat = crypt_stat, + .fstat = crypt_fstat, + .lookup = crypt_lookup, + .readdirp = crypt_readdirp, + .access = crypt_access +}; + +struct xlator_cbks cbks = { + .forget = crypt_forget +}; + +struct volume_options options[] = { + { .key = {"master-key"}, + .type = GF_OPTION_TYPE_PATH, + .description = "Pathname of regular file which contains master volume key" + }, + { .key = {"data-key-size"}, + .type = GF_OPTION_TYPE_SIZET, + .description = "Data key size (bits)", + .min = 256, + .max = 512, + .default_value = "256", + }, + { .key = {"block-size"}, + .type = GF_OPTION_TYPE_SIZET, + .description = "Atom size (bits)", + .min = 512, + .max = 4096, + .default_value = "4096" + }, + { .key = {NULL} }, +}; + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ diff --git a/xlators/encryption/crypt/src/crypt.h b/xlators/encryption/crypt/src/crypt.h new file mode 100644 index 000000000..01a8542ab --- /dev/null +++ b/xlators/encryption/crypt/src/crypt.h @@ -0,0 +1,899 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CRYPT_H__ +#define __CRYPT_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <openssl/aes.h> +#include <openssl/evp.h> +#include <openssl/sha.h> +#include <openssl/hmac.h> +#include <openssl/cmac.h> +#include <openssl/modes.h> +#include "crypt-mem-types.h" + +#define CRYPT_XLATOR_ID (0) + +#define MAX_IOVEC_BITS (3) +#define MAX_IOVEC (1 << MAX_IOVEC_BITS) +#define KEY_FACTOR_BITS (6) + +#define DEBUG_CRYPT (0) +#define TRIVIAL_TFM (0) + +#define CRYPT_MIN_BLOCK_BITS (9) +#define CRYPT_MAX_BLOCK_BITS (12) + +#define MASTER_VOL_KEY_SIZE (32) +#define NMTD_VOL_KEY_SIZE (16) + +struct crypt_key { + uint32_t len; + const char *label; +}; + +/* + * Add new key types to the end of this + * enumeration but before LAST_KEY_TYPE + */ +typedef enum { + MASTER_VOL_KEY, + NMTD_VOL_KEY, + NMTD_LINK_KEY, + EMTD_FILE_KEY, + DATA_FILE_KEY_256, + DATA_FILE_KEY_512, + LAST_KEY_TYPE +}crypt_key_type; + +struct kderive_context { + const unsigned char *pkey;/* parent key */ + uint32_t pkey_len; /* parent key size, bits */ + uint32_t ckey_len; /* child key size, bits */ + unsigned char *fid; /* fixed input data, NIST 800-108, 5.1 */ + uint32_t fid_len; /* fid len, bytes */ + unsigned char *out; /* contains child keying material */ + uint32_t out_len; /* out len, bytes */ +}; + +typedef enum { + DATA_ATOM, + HOLE_ATOM, + LAST_DATA_TYPE +}atom_data_type; + +typedef enum { + HEAD_ATOM, + TAIL_ATOM, + FULL_ATOM, + LAST_LOCALITY_TYPE +}atom_locality_type; + +typedef enum { + MTD_CREATE, + MTD_APPEND, + MTD_OVERWRITE, + MTD_CUT, + MTD_LAST_OP +} mtd_op_t; + +struct xts128_context { + void *key1, *key2; + block128_f block1,block2; +}; + +struct object_cipher_info { + cipher_alg_t o_alg; + cipher_mode_t o_mode; + uint32_t o_block_bits; + uint32_t o_dkey_size; /* raw data key size in bits */ + union { + struct { + unsigned char ivec[16]; + AES_KEY dkey[2]; + AES_KEY tkey; /* key used for tweaking */ + XTS128_CONTEXT xts; + } aes_xts; + } u; +}; + +struct master_cipher_info { + /* + * attributes inherited by newly created regular files + */ + cipher_alg_t m_alg; + cipher_mode_t m_mode; + uint32_t m_block_bits; + uint32_t m_dkey_size; /* raw key size in bits */ + /* + * master key + */ + unsigned char m_key[MASTER_VOL_KEY_SIZE]; + /* + * volume key for oid authentication + */ + unsigned char m_nmtd_key[NMTD_VOL_KEY_SIZE]; +}; + +/* +* This info is not changed during file's life + */ +struct crypt_inode_info { +#if DEBUG_CRYPT + loc_t *loc; /* pathname that the file has been + opened, or created with */ +#endif + uint16_t nr_minor; + uuid_t oid; + struct object_cipher_info cinfo; +}; + +/* + * this should locate in secure memory + */ +typedef struct { + struct master_cipher_info master; +} crypt_private_t; + +static inline struct master_cipher_info *get_master_cinfo(crypt_private_t *priv) +{ + return &priv->master; +} + +static inline struct object_cipher_info *get_object_cinfo(struct crypt_inode_info + *info) +{ + return &info->cinfo; +} + +/* + * this describes layouts and properties + * of atoms in an aligned vector + */ +struct avec_config { + uint32_t atom_size; + atom_data_type type; + size_t orig_size; + off_t orig_offset; + size_t expanded_size; + off_t aligned_offset; + + uint32_t off_in_head; + uint32_t off_in_tail; + uint32_t gap_in_tail; + uint32_t nr_full_blocks; + + struct iovec *avec; /* aligned vector */ + uint32_t acount; /* number of avec components. The same + * as number of occupied logical blocks */ + char **pool; + uint32_t blocks_in_pool; + uint32_t cursor; /* makes sense only for ordered writes, + * so there is no races on this counter. + * + * Cursor is per-config object, we don't + * reset cursor for atoms of different + * localities (head, tail, full) + */ +}; + + +typedef struct { + glusterfs_fop_t fop; /* code of FOP this local info built for */ + fd_t *fd; + inode_t *inode; + loc_t *loc; + int32_t mac_idx; + loc_t *newloc; + int32_t flags; + int32_t wbflags; + struct crypt_inode_info *info; + struct iobref *iobref; + struct iobref *iobref_data; + off_t offset; + + uint64_t old_file_size; /* per FOP, retrieved under lock held */ + uint64_t cur_file_size; /* per iteration, before issuing IOs */ + uint64_t new_file_size; /* per iteration, after issuing IOs */ + + uint64_t io_offset; /* offset of IOs issued per iteration */ + uint64_t io_offset_nopad; /* offset of user's data in the atom */ + uint32_t io_size; /* size of IOs issued per iteration */ + uint32_t io_size_nopad; /* size of user's data in the IOs */ + uint32_t eof_padding_size; /* size od EOF padding in the IOs */ + + gf_lock_t call_lock; /* protect nr_calls from many cbks */ + int32_t nr_calls; + + atom_data_type active_setup; /* which setup (hole or date) + is currently active */ + /* data setup */ + struct avec_config data_conf; + + /* hole setup */ + int hole_conv_in_proggress; + gf_lock_t hole_lock; /* protect hole config from many cbks */ + int hole_handled; + struct avec_config hole_conf; + struct iatt buf; + struct iatt prebuf; + struct iatt postbuf; + struct iatt *prenewparent; + struct iatt *postnewparent; + int32_t op_ret; + int32_t op_errno; + int32_t rw_count; /* total read or written */ + gf_lock_t rw_count_lock; /* protect the counter above */ + unsigned char *format; /* for create, update format string */ + uint32_t format_size; + uint32_t msgflags; /* messages for crypt_open() */ + dict_t *xdata; + dict_t *xattr; + struct iovec vec; /* contains last file's atom for + read-prune-write sequence */ + gf_boolean_t custom_mtd; + /* + * the next 3 fields are used by readdir and friends + */ + gf_dirent_t *de; /* directory entry */ + char *de_path; /* pathname of directory entry */ + uint32_t de_prefix_len; /* lenght of the parent's pathname */ + gf_dirent_t *entries; + + uint32_t update_disk_file_size:1; +} crypt_local_t; + +/* This represents a (read)modify-write atom */ +struct rmw_atom { + atom_locality_type locality; + /* + * read-modify-write sequence of the atom + */ + int32_t (*rmw)(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vec, + int32_t count, + struct iatt *stbuf, + struct iobref *iobref, + dict_t *xdata); + /* + * offset of the logical block in a file + */ + loff_t (*offset_at)(call_frame_t *frame, + struct object_cipher_info *object); + /* + * IO offset in an atom + */ + uint32_t (*offset_in)(call_frame_t *frame, + struct object_cipher_info *object); + /* + * number of bytes of plain text of this atom that user + * wants to read/write. + * It can be smaller than atom_size in the case of head + * or tail atoms. + */ + uint32_t (*io_size_nopad)(call_frame_t *frame, + struct object_cipher_info *object); + /* + * which iovec represents the atom + */ + struct iovec *(*get_iovec)(call_frame_t *frame, uint32_t count); + /* + * how many bytes of partial block should be uptodated by + * reading from disk. + * This is used to perform a read component of RMW (read-modify-write). + */ + uint32_t (*count_to_uptodate)(call_frame_t *frame, struct object_cipher_info *object); + struct avec_config *(*get_config)(call_frame_t *frame); +}; + +struct data_cipher_alg { + gf_boolean_t atomic; /* true means that algorithm requires + to pad data before cipher transform */ + gf_boolean_t should_pad; /* true means that algorithm requires + to pad the end of file with extra-data */ + uint32_t blkbits; /* blksize = 1 << blkbits */ + /* + * any preliminary sanity checks goes here + */ + int32_t (*init)(void); + /* + * set alg-mode specific inode info + */ + int32_t (*set_private)(struct crypt_inode_info *info, + struct master_cipher_info *master); + /* + * check alg-mode specific data key + */ + int32_t (*check_key)(uint32_t key_size); + void (*set_iv)(off_t offset, struct object_cipher_info *object); + int32_t (*encrypt)(const unsigned char *from, unsigned char *to, + size_t length, off_t offset, const int enc, + struct object_cipher_info *object); +}; + +/* + * version-dependent metadata loader + */ +struct crypt_mtd_loader { + /* + * return core format size + */ + size_t (*format_size)(mtd_op_t op, size_t old_size); + /* + * pack version-specific metadata of an object + * at ->create() + */ + int32_t (*create_format)(unsigned char *wire, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master); + /* + * extract version-specific metadata of an object + * at ->open() time + */ + int32_t (*open_format)(unsigned char *wire, + int32_t len, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local, + gf_boolean_t load_info); + int32_t (*update_format)(unsigned char *new, + unsigned char *old, + size_t old_len, + int32_t mac_idx, + mtd_op_t op, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local); +}; + +typedef int32_t (*end_writeback_handler_t)(call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf, + dict_t *xdata); +typedef void (*linkop_wind_handler_t)(call_frame_t *frame, xlator_t *this); +typedef void (*linkop_unwind_handler_t)(call_frame_t *frame); + + +/* Declarations */ + +/* keys.c */ +extern struct crypt_key crypt_keys[LAST_KEY_TYPE]; +int32_t get_nmtd_vol_key(struct master_cipher_info *master); +int32_t get_nmtd_link_key(loc_t *loc, + struct master_cipher_info *master, + unsigned char *result); +int32_t get_emtd_file_key(struct crypt_inode_info *info, + struct master_cipher_info *master, + unsigned char *result); +int32_t get_data_file_key(struct crypt_inode_info *info, + struct master_cipher_info *master, + uint32_t keysize, + unsigned char *key); +/* data.c */ +extern struct data_cipher_alg data_cipher_algs[LAST_CIPHER_ALG][LAST_CIPHER_MODE]; +void encrypt_aligned_iov(struct object_cipher_info *object, + struct iovec *vec, + int count, + off_t off); +void decrypt_aligned_iov(struct object_cipher_info *object, + struct iovec *vec, + int count, + off_t off); +int32_t align_iov_by_atoms(xlator_t *this, + crypt_local_t *local, + struct object_cipher_info *object, + struct iovec *vec /* input vector */, + int32_t count /* number of vec components */, + struct iovec *avec /* aligned vector */, + char **blocks /* pool of blocks */, + uint32_t *blocks_allocated, + struct avec_config *conf); +int32_t set_config_avec_data(xlator_t *this, + crypt_local_t *local, + struct avec_config *conf, + struct object_cipher_info *object, + struct iovec *vec, + int32_t vec_count); +int32_t set_config_avec_hole(xlator_t *this, + crypt_local_t *local, + struct avec_config *conf, + struct object_cipher_info *object, + glusterfs_fop_t fop); +void set_gap_at_end(call_frame_t *frame, struct object_cipher_info *object, + struct avec_config *conf, atom_data_type dtype); +void set_config_offsets(call_frame_t *frame, + xlator_t *this, + uint64_t offset, + uint64_t count, + atom_data_type dtype, + int32_t setup_gap_in_tail); + +/* metadata.c */ +extern struct crypt_mtd_loader mtd_loaders [LAST_MTD_LOADER]; + +int32_t alloc_format(crypt_local_t *local, size_t size); +int32_t alloc_format_create(crypt_local_t *local); +void free_format(crypt_local_t *local); +size_t format_size(mtd_op_t op, size_t old_size); +size_t new_format_size(void); +int32_t open_format(unsigned char *str, int32_t len, loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, crypt_local_t *local, + gf_boolean_t load_info); +int32_t update_format(unsigned char *new, unsigned char *old, + size_t old_len, int32_t mac_idx, mtd_op_t op, loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local); +int32_t create_format(unsigned char *wire, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master); + +/* atom.c */ +struct rmw_atom *atom_by_types(atom_data_type data, + atom_locality_type locality); +void submit_partial(call_frame_t *frame, + xlator_t *this, + fd_t *fd, + atom_locality_type ltype); +void submit_full(call_frame_t *frame, xlator_t *this); + +/* crypt.c */ + +end_writeback_handler_t dispatch_end_writeback(glusterfs_fop_t fop); +static size_t iovec_get_size(struct iovec *vec, uint32_t count); +void set_local_io_params_writev(call_frame_t *frame, + struct object_cipher_info *object, + struct rmw_atom *atom, off_t io_offset, + uint32_t io_size); +void link_wind(call_frame_t *frame, xlator_t *this); +void unlink_wind(call_frame_t *frame, xlator_t *this); +void link_unwind(call_frame_t *frame); +void unlink_unwind(call_frame_t *frame); +void rename_wind(call_frame_t *frame, xlator_t *this); +void rename_unwind(call_frame_t *frame); + +/* Inline functions */ + +static inline size_t iovec_get_size(struct iovec *vec, uint32_t count) +{ + int i; + size_t size = 0; + for (i = 0; i < count; i++) + size += vec[i].iov_len; + return size; +} + +static inline int32_t crypt_xlator_id(void) +{ + return CRYPT_XLATOR_ID; +} + +static inline mtd_loader_id current_mtd_loader(void) +{ + return MTD_LOADER_V1; +} + +static inline uint32_t master_key_size (void) +{ + return crypt_keys[MASTER_VOL_KEY].len >> 3; +} + +static inline uint32_t nmtd_vol_key_size (void) +{ + return crypt_keys[NMTD_VOL_KEY].len >> 3; +} + +static inline uint32_t alg_mode_blkbits(cipher_alg_t alg, + cipher_mode_t mode) +{ + return data_cipher_algs[alg][mode].blkbits; +} + +static inline uint32_t alg_mode_blksize(cipher_alg_t alg, + cipher_mode_t mode) +{ + return 1 << alg_mode_blkbits(alg, mode); +} + +static inline gf_boolean_t alg_mode_atomic(cipher_alg_t alg, + cipher_mode_t mode) +{ + return data_cipher_algs[alg][mode].atomic; +} + +static inline gf_boolean_t alg_mode_should_pad(cipher_alg_t alg, + cipher_mode_t mode) +{ + return data_cipher_algs[alg][mode].should_pad; +} + +static inline uint32_t master_alg_blksize(struct master_cipher_info *mr) +{ + return alg_mode_blksize(mr->m_alg, mr->m_mode); +} + +static inline uint32_t master_alg_blkbits(struct master_cipher_info *mr) +{ + return alg_mode_blkbits(mr->m_alg, mr->m_mode); +} + +static inline gf_boolean_t master_alg_atomic(struct master_cipher_info *mr) +{ + return alg_mode_atomic(mr->m_alg, mr->m_mode); +} + +static inline gf_boolean_t master_alg_should_pad(struct master_cipher_info *mr) +{ + return alg_mode_should_pad(mr->m_alg, mr->m_mode); +} + +static inline uint32_t object_alg_blksize(struct object_cipher_info *ob) +{ + return alg_mode_blksize(ob->o_alg, ob->o_mode); +} + +static inline uint32_t object_alg_blkbits(struct object_cipher_info *ob) +{ + return alg_mode_blkbits(ob->o_alg, ob->o_mode); +} + +static inline gf_boolean_t object_alg_atomic(struct object_cipher_info *ob) +{ + return alg_mode_atomic(ob->o_alg, ob->o_mode); +} + +static inline gf_boolean_t object_alg_should_pad(struct object_cipher_info *ob) +{ + return alg_mode_should_pad(ob->o_alg, ob->o_mode); +} + +static inline uint32_t aes_raw_key_size(struct master_cipher_info *master) +{ + return master->m_dkey_size >> 3; +} + +static inline struct avec_config *get_hole_conf(call_frame_t *frame) +{ + return &(((crypt_local_t *)frame->local)->hole_conf); +} + +static inline struct avec_config *get_data_conf(call_frame_t *frame) +{ + return &(((crypt_local_t *)frame->local)->data_conf); +} + +static inline int32_t get_atom_bits (struct object_cipher_info *object) +{ + return object->o_block_bits; +} + +static inline int32_t get_atom_size (struct object_cipher_info *object) +{ + return 1 << get_atom_bits(object); +} + +static inline int32_t has_head_block(struct avec_config *conf) +{ + return conf->off_in_head || + (conf->acount == 1 && conf->off_in_tail); +} + +static inline int32_t has_tail_block(struct avec_config *conf) +{ + return conf->off_in_tail && conf->acount > 1; +} + +static inline int32_t has_full_blocks(struct avec_config *conf) +{ + return conf->nr_full_blocks; +} + +static inline int32_t should_submit_head_block(struct avec_config *conf) +{ + return has_head_block(conf) && (conf->cursor == 0); +} + +static inline int32_t should_submit_tail_block(struct avec_config *conf) +{ + return has_tail_block(conf) && (conf->cursor == conf->acount - 1); +} + +static inline int32_t should_submit_full_block(struct avec_config *conf) +{ + uint32_t start = has_head_block(conf) ? 1 : 0; + + return has_full_blocks(conf) && + conf->cursor >= start && + conf->cursor < start + conf->nr_full_blocks; +} + +#if DEBUG_CRYPT +static inline void crypt_check_input_len(size_t len, + struct object_cipher_info *object) +{ + if (object_alg_should_pad(object) && (len & (object_alg_blksize(object) - 1))) + gf_log ("crypt", GF_LOG_DEBUG, "bad input len: %d", (int)len); +} + +static inline void check_head_block(struct avec_config *conf) +{ + if (!has_head_block(conf)) + gf_log("crypt", GF_LOG_DEBUG, "not a head atom"); +} + +static inline void check_tail_block(struct avec_config *conf) +{ + if (!has_tail_block(conf)) + gf_log("crypt", GF_LOG_DEBUG, "not a tail atom"); +} + +static inline void check_full_block(struct avec_config *conf) +{ + if (!has_full_blocks(conf)) + gf_log("crypt", GF_LOG_DEBUG, "not a full atom"); +} + +static inline void check_cursor_head(struct avec_config *conf) +{ + if (!has_head_block(conf)) + gf_log("crypt", + GF_LOG_DEBUG, "Illegal call of head atom method"); + else if (conf->cursor != 0) + gf_log("crypt", + GF_LOG_DEBUG, "Cursor (%d) is not at head atom", + conf->cursor); +} + +static inline void check_cursor_full(struct avec_config *conf) +{ + if (!has_full_blocks(conf)) + gf_log("crypt", + GF_LOG_DEBUG, "Illegal call of full atom method"); + if (has_head_block(conf) && (conf->cursor == 0)) + gf_log("crypt", + GF_LOG_DEBUG, "Cursor is not at full atom"); +} + +/* + * FIXME: use avec->iov_len to check setup + */ +static inline int data_local_invariant(crypt_local_t *local) +{ + return 0; +} + +#else +#define crypt_check_input_len(len, object) noop +#define check_head_block(conf) noop +#define check_tail_block(conf) noop +#define check_full_block(conf) noop +#define check_cursor_head(conf) noop +#define check_cursor_full(conf) noop + +#endif /* DEBUG_CRYPT */ + +static inline struct avec_config *conf_by_type(call_frame_t *frame, + atom_data_type dtype) +{ + struct avec_config *conf = NULL; + + switch (dtype) { + case HOLE_ATOM: + conf = get_hole_conf(frame); + break; + case DATA_ATOM: + conf = get_data_conf(frame); + break; + default: + gf_log("crypt", GF_LOG_DEBUG, "bad atom type"); + } + return conf; +} + +static inline uint32_t nr_calls_head(struct avec_config *conf) +{ + return has_head_block(conf) ? 1 : 0; +} + +static inline uint32_t nr_calls_tail(struct avec_config *conf) +{ + return has_tail_block(conf) ? 1 : 0; +} + +static inline uint32_t nr_calls_full(struct avec_config *conf) +{ + switch(conf->type) { + case HOLE_ATOM: + return has_full_blocks(conf); + case DATA_ATOM: + return has_full_blocks(conf) ? + logical_blocks_occupied(0, + conf->nr_full_blocks, + MAX_IOVEC_BITS) : 0; + default: + gf_log("crypt", GF_LOG_DEBUG, "bad atom data type"); + return 0; + } +} + +static inline uint32_t nr_calls(struct avec_config *conf) +{ + return nr_calls_head(conf) + nr_calls_tail(conf) + nr_calls_full(conf); +} + +static inline uint32_t nr_calls_data(call_frame_t *frame) +{ + return nr_calls(get_data_conf(frame)); +} + +static inline uint32_t nr_calls_hole(call_frame_t *frame) +{ + return nr_calls(get_hole_conf(frame)); +} + +static inline void get_one_call_nolock(call_frame_t *frame) +{ + crypt_local_t *local = frame->local; + + ++local->nr_calls; + + //gf_log("crypt", GF_LOG_DEBUG, "get %d calls", 1); +} + +static inline void get_one_call(call_frame_t *frame) +{ + crypt_local_t *local = frame->local; + + LOCK(&local->call_lock); + get_one_call_nolock(frame); + UNLOCK(&local->call_lock); +} + +static inline void get_nr_calls_nolock(call_frame_t *frame, int32_t nr) +{ + crypt_local_t *local = frame->local; + + local->nr_calls += nr; + + //gf_log("crypt", GF_LOG_DEBUG, "get %d calls", nr); +} + +static inline void get_nr_calls(call_frame_t *frame, int32_t nr) +{ + crypt_local_t *local = frame->local; + + LOCK(&local->call_lock); + get_nr_calls_nolock(frame, nr); + UNLOCK(&local->call_lock); +} + +static inline int put_one_call(crypt_local_t *local) +{ + uint32_t last = 0; + + LOCK(&local->call_lock); + if (--local->nr_calls == 0) + last = 1; + + //gf_log("crypt", GF_LOG_DEBUG, "put %d calls", 1); + + UNLOCK(&local->call_lock); + return last; +} + +static inline int is_appended_write(call_frame_t *frame) +{ + crypt_local_t *local = frame->local; + struct avec_config *conf = get_data_conf(frame); + + return conf->orig_offset + conf->orig_size > local->old_file_size; +} + +static inline int is_ordered_mode(call_frame_t *frame) +{ +#if 0 + crypt_local_t *local = frame->local; + return local->fop == GF_FOP_FTRUNCATE || + (local->fop == GF_FOP_WRITE && is_appended_write(frame)); +#endif + return 1; +} + +static inline int32_t hole_conv_completed(crypt_local_t *local) +{ + struct avec_config *conf = &local->hole_conf; + return conf->cursor == conf->acount; +} + +static inline int32_t data_write_in_progress(crypt_local_t *local) +{ + return local->active_setup == DATA_ATOM; +} + +static inline int32_t parent_is_crypt_xlator(call_frame_t *frame, + xlator_t *this) +{ + return frame->parent->this == this; +} + +static inline linkop_wind_handler_t linkop_wind_dispatch(glusterfs_fop_t fop) +{ + switch(fop){ + case GF_FOP_LINK: + return link_wind; + case GF_FOP_UNLINK: + return unlink_wind; + case GF_FOP_RENAME: + return rename_wind; + default: + gf_log("crypt", GF_LOG_ERROR, "Bad link operation %d", fop); + return NULL; + } +} + +static inline linkop_unwind_handler_t linkop_unwind_dispatch(glusterfs_fop_t fop) +{ + switch(fop){ + case GF_FOP_LINK: + return link_unwind; + case GF_FOP_UNLINK: + return unlink_unwind; + case GF_FOP_RENAME: + return rename_unwind; + default: + gf_log("crypt", GF_LOG_ERROR, "Bad link operation %d", fop); + return NULL; + } +} + +static inline mtd_op_t linkop_mtdop_dispatch(glusterfs_fop_t fop) +{ + switch (fop) { + case GF_FOP_LINK: + return MTD_APPEND; + case GF_FOP_UNLINK: + return MTD_CUT; + case GF_FOP_RENAME: + return MTD_OVERWRITE; + default: + gf_log("crypt", GF_LOG_WARNING, "Bad link operation %d", fop); + return MTD_LAST_OP; + } +} + +#endif /* __CRYPT_H__ */ + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ diff --git a/xlators/encryption/crypt/src/data.c b/xlators/encryption/crypt/src/data.c new file mode 100644 index 000000000..762fa554a --- /dev/null +++ b/xlators/encryption/crypt/src/data.c @@ -0,0 +1,769 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "defaults.h" +#include "crypt-common.h" +#include "crypt.h" + +static void set_iv_aes_xts(off_t offset, struct object_cipher_info *object) +{ + unsigned char *ivec; + + ivec = object->u.aes_xts.ivec; + + /* convert the tweak into a little-endian byte + * array (IEEE P1619/D16, May 2007, section 5.1) + */ + + *((uint64_t *)ivec) = htole64(offset); + + /* ivec is padded with zeroes */ +} + +static int32_t aes_set_keys_common(unsigned char *raw_key, uint32_t key_size, + AES_KEY *keys) +{ + int32_t ret; + + ret = AES_set_encrypt_key(raw_key, + key_size, + &keys[AES_ENCRYPT]); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, "Set encrypt key failed"); + return ret; + } + ret = AES_set_decrypt_key(raw_key, + key_size, + &keys[AES_DECRYPT]); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, "Set decrypt key failed"); + return ret; + } + return 0; +} + +/* + * set private cipher info for xts mode + */ +static int32_t set_private_aes_xts(struct crypt_inode_info *info, + struct master_cipher_info *master) +{ + int ret; + struct object_cipher_info *object = get_object_cinfo(info); + unsigned char *data_key; + uint32_t subkey_size; + + /* init tweak value */ + memset(object->u.aes_xts.ivec, 0, 16); + + data_key = GF_CALLOC(1, object->o_dkey_size, gf_crypt_mt_key); + if (!data_key) + return ENOMEM; + + /* + * retrieve data keying meterial + */ + ret = get_data_file_key(info, master, object->o_dkey_size, data_key); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, "Failed to retrieve data key"); + GF_FREE(data_key); + return ret; + } + /* + * parse compound xts key + */ + subkey_size = object->o_dkey_size >> 4; /* (xts-key-size-in-bytes / 2) */ + /* + * install key for data encryption + */ + ret = aes_set_keys_common(data_key, + subkey_size << 3, object->u.aes_xts.dkey); + if (ret) { + GF_FREE(data_key); + return ret; + } + /* + * set up key used to encrypt tweaks + */ + ret = AES_set_encrypt_key(data_key + subkey_size, + object->o_dkey_size / 2, + &object->u.aes_xts.tkey); + if (ret < 0) + gf_log("crypt", GF_LOG_ERROR, "Set tweak key failed"); + + GF_FREE(data_key); + return ret; +} + +static int32_t aes_xts_init(void) +{ + cassert(AES_BLOCK_SIZE == (1 << AES_BLOCK_BITS)); + return 0; +} + +static int32_t check_key_aes_xts(uint32_t keysize) +{ + switch(keysize) { + case 256: + case 512: + return 0; + default: + break; + } + return -1; +} + +static int32_t encrypt_aes_xts(const unsigned char *from, + unsigned char *to, size_t length, + off_t offset, const int enc, + struct object_cipher_info *object) +{ + XTS128_CONTEXT ctx; + if (enc) { + ctx.key1 = &object->u.aes_xts.dkey[AES_ENCRYPT]; + ctx.block1 = (block128_f)AES_encrypt; + } + else { + ctx.key1 = &object->u.aes_xts.dkey[AES_DECRYPT]; + ctx.block1 = (block128_f)AES_decrypt; + } + ctx.key2 = &object->u.aes_xts.tkey; + ctx.block2 = (block128_f)AES_encrypt; + + return CRYPTO_xts128_encrypt(&ctx, + object->u.aes_xts.ivec, + from, + to, + length, enc); +} + +/* + * Cipher input chunk @from of length @len; + * @to: result of cipher transform; + * @off: offset in a file (must be cblock-aligned); + */ +static void cipher_data(struct object_cipher_info *object, + char *from, + char *to, + off_t off, + size_t len, + const int enc) +{ + crypt_check_input_len(len, object); + +#if TRIVIAL_TFM && DEBUG_CRYPT + return; +#endif + data_cipher_algs[object->o_alg][object->o_mode].set_iv(off, object); + data_cipher_algs[object->o_alg][object->o_mode].encrypt + ((const unsigned char *)from, + (unsigned char *)to, + len, + off, + enc, + object); +} + +#define MAX_CIPHER_CHUNK (1 << 30) + +/* + * Do cipher (encryption/decryption) transform of a + * continuous region of memory. + * + * @len: a number of bytes to transform; + * @buf: data to transform; + * @off: offset in a file, should be block-aligned + * for atomic cipher modes and ksize-aligned + * for other modes). + * @dir: direction of transform (encrypt/decrypt). + */ +static void cipher_region(struct object_cipher_info *object, + char *from, + char *to, + off_t off, + size_t len, + int dir) +{ + while (len > 0) { + size_t to_cipher; + + to_cipher = len; + if (to_cipher > MAX_CIPHER_CHUNK) + to_cipher = MAX_CIPHER_CHUNK; + + /* this will reset IV */ + cipher_data(object, + from, + to, + off, + to_cipher, + dir); + from += to_cipher; + to += to_cipher; + off += to_cipher; + len -= to_cipher; + } +} + +/* + * Do cipher transform (encryption/decryption) of + * plaintext/ciphertext represented by @vec. + * + * Pre-conditions: @vec represents a continuous piece + * of data in a file at offset @off to be ciphered + * (encrypted/decrypted). + * @count is the number of vec's components. All the + * components must be block-aligned, the caller is + * responsible for this. @dir is "direction" of + * transform (encrypt/decrypt). + */ +static void cipher_aligned_iov(struct object_cipher_info *object, + struct iovec *vec, + int count, + off_t off, + int32_t dir) +{ + int i; + int len = 0; + + for (i = 0; i < count; i++) { + cipher_region(object, + vec[i].iov_base, + vec[i].iov_base, + off + len, + vec[i].iov_len, + dir); + len += vec[i].iov_len; + } +} + +void encrypt_aligned_iov(struct object_cipher_info *object, + struct iovec *vec, + int count, + off_t off) +{ + cipher_aligned_iov(object, vec, count, off, 1); +} + +void decrypt_aligned_iov(struct object_cipher_info *object, + struct iovec *vec, + int count, + off_t off) +{ + cipher_aligned_iov(object, vec, count, off, 0); +} + +#if DEBUG_CRYPT +static void compound_stream(struct iovec *vec, int count, char *buf, off_t skip) +{ + int i; + int off = 0; + for (i = 0; i < count; i++) { + memcpy(buf + off, + vec[i].iov_base + skip, + vec[i].iov_len - skip); + + off += (vec[i].iov_len - skip); + skip = 0; + } +} + +static void check_iovecs(struct iovec *vec, int cnt, + struct iovec *avec, int acnt, uint32_t off_in_head) +{ + char *s1, *s2; + uint32_t size, asize; + + size = iovec_get_size(vec, cnt); + asize = iovec_get_size(avec, acnt) - off_in_head; + if (size != asize) { + gf_log("crypt", GF_LOG_DEBUG, "size %d is not eq asize %d", + size, asize); + return; + } + s1 = GF_CALLOC(1, size, gf_crypt_mt_data); + if (!s1) { + gf_log("crypt", GF_LOG_DEBUG, "Can not allocate stream "); + return; + } + s2 = GF_CALLOC(1, asize, gf_crypt_mt_data); + if (!s2) { + GF_FREE(s1); + gf_log("crypt", GF_LOG_DEBUG, "Can not allocate stream "); + return; + } + compound_stream(vec, cnt, s1, 0); + compound_stream(avec, acnt, s2, off_in_head); + if (memcmp(s1, s2, size)) + gf_log("crypt", GF_LOG_DEBUG, "chunks of different data"); + GF_FREE(s1); + GF_FREE(s2); +} + +#else +#define check_iovecs(vec, count, avec, avecn, off) noop +#endif /* DEBUG_CRYPT */ + +static char *data_alloc_block(xlator_t *this, crypt_local_t *local, + int32_t block_size) +{ + struct iobuf *iobuf = NULL; + + iobuf = iobuf_get2(this->ctx->iobuf_pool, block_size); + if (!iobuf) { + gf_log("crypt", GF_LOG_ERROR, + "Failed to get iobuf"); + return NULL; + } + if (!local->iobref_data) { + local->iobref_data = iobref_new(); + if (!local->iobref_data) { + gf_log("crypt", GF_LOG_ERROR, + "Failed to get iobref"); + iobuf_unref(iobuf); + return NULL; + } + } + iobref_add(local->iobref_data, iobuf); + return iobuf->ptr; +} + +/* + * Compound @avec, which represent the same data + * chunk as @vec, but has aligned components of + * specified block size. Alloc blocks, if needed. + * In particular, incomplete head and tail blocks + * must be allocated. + * Put number of allocated blocks to @num_blocks. + * + * Example: + * + * input: data chunk represented by 4 components + * [AB],[BC],[CD],[DE]; + * output: 5 logical blocks (0, 1, 2, 3, 4). + * + * A B C D E + * *-----*+------*-+---*----+--------+-* + * | || | | | | | | + * *-+-----+*------+-*---+----*--------*-+------* + * 0 1 2 3 4 + * + * 0 - incomplete compound (head); + * 1, 2 - full compound; + * 3 - full non-compound (the case of reuse); + * 4 - incomplete non-compound (tail). + */ +int32_t align_iov_by_atoms(xlator_t *this, + crypt_local_t *local, + struct object_cipher_info *object, + struct iovec *vec /* input vector */, + int32_t count /* number of vec components */, + struct iovec *avec /* aligned vector */, + char **blocks /* pool of blocks */, + uint32_t *blocks_allocated, + struct avec_config *conf) +{ + int vecn = 0; /* number of the current component in vec */ + int avecn = 0; /* number of the current component in avec */ + off_t vec_off = 0; /* offset in the current vec component, + * i.e. the number of bytes have already + * been copied */ + int32_t block_size = get_atom_size(object); + size_t to_process; /* number of vec's bytes to copy and(or) re-use */ + int32_t off_in_head = conf->off_in_head; + + to_process = iovec_get_size(vec, count); + + while (to_process > 0) { + if (off_in_head || + vec[vecn].iov_len - vec_off < block_size) { + /* + * less than block_size: + * the case of incomplete (head or tail), + * or compound block + */ + size_t copied = 0; + /* + * populate the pool with a new block + */ + blocks[*blocks_allocated] = data_alloc_block(this, + local, + block_size); + if (!blocks[*blocks_allocated]) + return -ENOMEM; + memset(blocks[*blocks_allocated], 0, off_in_head); + /* + * fill the block with vec components + */ + do { + size_t to_copy; + + to_copy = vec[vecn].iov_len - vec_off; + if (to_copy > block_size - off_in_head) + to_copy = block_size - off_in_head; + + memcpy(blocks[*blocks_allocated] + off_in_head + copied, + vec[vecn].iov_base + vec_off, + to_copy); + + copied += to_copy; + to_process -= to_copy; + + vec_off += to_copy; + if (vec_off == vec[vecn].iov_len) { + /* finished with this vecn */ + vec_off = 0; + vecn++; + } + } while (copied < (block_size - off_in_head) && to_process > 0); + /* + * update avec + */ + avec[avecn].iov_len = off_in_head + copied; + avec[avecn].iov_base = blocks[*blocks_allocated]; + + (*blocks_allocated)++; + off_in_head = 0; + } else { + /* + * the rest of the current vec component + * is not less than block_size, so reuse + * the memory buffer of the component. + */ + size_t to_reuse; + to_reuse = (to_process > block_size ? + block_size : + to_process); + avec[avecn].iov_len = to_reuse; + avec[avecn].iov_base = vec[vecn].iov_base + vec_off; + + vec_off += to_reuse; + if (vec_off == vec[vecn].iov_len) { + /* finished with this vecn */ + vec_off = 0; + vecn++; + } + to_process -= to_reuse; + } + avecn++; + } + check_iovecs(vec, count, avec, avecn, conf->off_in_head); + return 0; +} + +/* + * allocate and setup aligned vector for data submission + * Pre-condition: @conf is set. + */ +int32_t set_config_avec_data(xlator_t *this, + crypt_local_t *local, + struct avec_config *conf, + struct object_cipher_info *object, + struct iovec *vec, + int32_t vec_count) +{ + int32_t ret = ENOMEM; + struct iovec *avec; + char **pool; + uint32_t blocks_in_pool = 0; + + conf->type = DATA_ATOM; + + avec = GF_CALLOC(conf->acount, sizeof(*avec), gf_crypt_mt_iovec); + if (!avec) + return ret; + pool = GF_CALLOC(conf->acount, sizeof(pool), gf_crypt_mt_char); + if (!pool) { + GF_FREE(avec); + return ret; + } + if (!vec) { + /* + * degenerated case: no data + */ + pool[0] = data_alloc_block(this, local, get_atom_size(object)); + if (!pool[0]) + goto free; + blocks_in_pool = 1; + avec->iov_base = pool[0]; + avec->iov_len = conf->off_in_tail; + } + else { + ret = align_iov_by_atoms(this, local, object, vec, vec_count, + avec, pool, &blocks_in_pool, conf); + if (ret) + goto free; + } + conf->avec = avec; + conf->pool = pool; + conf->blocks_in_pool = blocks_in_pool; + return 0; + free: + GF_FREE(avec); + GF_FREE(pool); + return ret; +} + +/* + * allocate and setup aligned vector for hole submission + */ +int32_t set_config_avec_hole(xlator_t *this, + crypt_local_t *local, + struct avec_config *conf, + struct object_cipher_info *object, + glusterfs_fop_t fop) +{ + uint32_t i, idx; + struct iovec *avec; + char **pool; + uint32_t num_blocks; + uint32_t blocks_in_pool = 0; + + conf->type = HOLE_ATOM; + + num_blocks = conf->acount - + (conf->nr_full_blocks ? conf->nr_full_blocks - 1 : 0); + + switch (fop) { + case GF_FOP_WRITE: + /* + * hole goes before data + */ + if (num_blocks == 1 && conf->off_in_tail != 0) + /* + * we won't submit a hole which fits into + * a data atom: this part of hole will be + * submitted with data write + */ + return 0; + break; + case GF_FOP_FTRUNCATE: + /* + * expanding truncate, hole goes after data, + * and will be submited in any case. + */ + break; + default: + gf_log("crypt", GF_LOG_WARNING, + "bad file operation %d", fop); + return 0; + } + avec = GF_CALLOC(num_blocks, sizeof(*avec), gf_crypt_mt_iovec); + if (!avec) + return ENOMEM; + pool = GF_CALLOC(num_blocks, sizeof(pool), gf_crypt_mt_char); + if (!pool) { + GF_FREE(avec); + return ENOMEM; + } + for (i = 0; i < num_blocks; i++) { + pool[i] = data_alloc_block(this, local, get_atom_size(object)); + if (pool[i] == NULL) + goto free; + blocks_in_pool++; + } + if (has_head_block(conf)) { + /* set head block */ + idx = 0; + avec[idx].iov_base = pool[idx]; + avec[idx].iov_len = get_atom_size(object); + memset(avec[idx].iov_base + conf->off_in_head, + 0, + get_atom_size(object) - conf->off_in_head); + } + if (has_tail_block(conf)) { + /* set tail block */ + idx = num_blocks - 1; + avec[idx].iov_base = pool[idx]; + avec[idx].iov_len = get_atom_size(object); + memset(avec[idx].iov_base, 0, conf->off_in_tail); + } + if (has_full_blocks(conf)) { + /* set full block */ + idx = conf->off_in_head ? 1 : 0; + avec[idx].iov_base = pool[idx]; + avec[idx].iov_len = get_atom_size(object); + /* + * since we re-use the buffer, + * zeroes will be set every time + * before encryption, see submit_full() + */ + } + conf->avec = avec; + conf->pool = pool; + conf->blocks_in_pool = blocks_in_pool; + return 0; + free: + GF_FREE(avec); + GF_FREE(pool); + return ENOMEM; +} + +/* A helper for setting up config of partial atoms (which + * participate in read-modify-write sequence). + * + * Calculate and setup precise amount of "extra-bytes" + * that should be uptodated at the end of partial (not + * necessarily tail!) block. + * + * Pre-condition: local->old_file_size is valid! + * @conf contains setup, which is enough for correct calculation + * of has_tail_block(), ->get_offset(). + */ +void set_gap_at_end(call_frame_t *frame, struct object_cipher_info *object, + struct avec_config *conf, atom_data_type dtype) +{ + uint32_t to_block; + crypt_local_t *local = frame->local; + uint64_t old_file_size = local->old_file_size; + struct rmw_atom *partial = atom_by_types(dtype, + has_tail_block(conf) ? + TAIL_ATOM : HEAD_ATOM); + + if (old_file_size <= partial->offset_at(frame, object)) + to_block = 0; + else { + to_block = old_file_size - partial->offset_at(frame, object); + if (to_block > get_atom_size(object)) + to_block = get_atom_size(object); + } + if (to_block > conf->off_in_tail) + conf->gap_in_tail = to_block - conf->off_in_tail; + else + /* + * nothing to uptodate + */ + conf->gap_in_tail = 0; +} + +/* + * fill struct avec_config with offsets layouts + */ +void set_config_offsets(call_frame_t *frame, + xlator_t *this, + uint64_t offset, + uint64_t count, + atom_data_type dtype, + int32_t set_gap) +{ + crypt_local_t *local; + struct object_cipher_info *object; + struct avec_config *conf; + uint32_t resid; + + uint32_t atom_size; + uint32_t atom_bits; + + size_t orig_size; + off_t orig_offset; + size_t expanded_size; + off_t aligned_offset; + + uint32_t off_in_head = 0; + uint32_t off_in_tail = 0; + uint32_t nr_full_blocks; + int32_t size_full_blocks; + + uint32_t acount; /* number of alifned components to write. + * The same as number of occupied logical + * blocks (atoms) + */ + local = frame->local; + object = &local->info->cinfo; + conf = (dtype == DATA_ATOM ? + get_data_conf(frame) : get_hole_conf(frame)); + + orig_offset = offset; + orig_size = count; + + atom_size = get_atom_size(object); + atom_bits = get_atom_bits(object); + + /* + * Round-down the start, + * round-up the end. + */ + resid = offset & (uint64_t)(atom_size - 1); + + if (resid) + off_in_head = resid; + aligned_offset = offset - off_in_head; + expanded_size = orig_size + off_in_head; + + /* calculate tail, + expand size forward */ + resid = (offset + orig_size) & (uint64_t)(atom_size - 1); + + if (resid) { + off_in_tail = resid; + expanded_size += (atom_size - off_in_tail); + } + /* + * calculate number of occupied blocks + */ + acount = expanded_size >> atom_bits; + /* + * calculate number of full blocks + */ + size_full_blocks = expanded_size; + if (off_in_head) + size_full_blocks -= atom_size; + if (off_in_tail && size_full_blocks > 0) + size_full_blocks -= atom_size; + nr_full_blocks = size_full_blocks >> atom_bits; + + conf->atom_size = atom_size; + conf->orig_size = orig_size; + conf->orig_offset = orig_offset; + conf->expanded_size = expanded_size; + conf->aligned_offset = aligned_offset; + + conf->off_in_head = off_in_head; + conf->off_in_tail = off_in_tail; + conf->nr_full_blocks = nr_full_blocks; + conf->acount = acount; + /* + * Finally, calculate precise amount of + * "extra-bytes" that should be uptodated + * at the end. + * Only if RMW is expected. + */ + if (off_in_tail && set_gap) + set_gap_at_end(frame, object, conf, dtype); +} + +struct data_cipher_alg data_cipher_algs[LAST_CIPHER_ALG][LAST_CIPHER_MODE] = { + [AES_CIPHER_ALG][XTS_CIPHER_MODE] = + { .atomic = _gf_true, + .should_pad = _gf_true, + .blkbits = AES_BLOCK_BITS, + .init = aes_xts_init, + .set_private = set_private_aes_xts, + .check_key = check_key_aes_xts, + .set_iv = set_iv_aes_xts, + .encrypt = encrypt_aes_xts + } +}; + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ diff --git a/xlators/encryption/crypt/src/keys.c b/xlators/encryption/crypt/src/keys.c new file mode 100644 index 000000000..4a1d3bb5a --- /dev/null +++ b/xlators/encryption/crypt/src/keys.c @@ -0,0 +1,302 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "defaults.h" +#include "crypt-common.h" +#include "crypt.h" + +/* Key hierarchy + + +----------------+ + | MASTER_VOL_KEY | + +-------+--------+ + | + | + +----------------+----------------+ + | | | + | | | + +-------+------+ +-------+-------+ +------+--------+ + | NMTD_VOL_KEY | | EMTD_FILE_KEY | | DATA_FILE_KEY | + +-------+------+ +---------------+ +---------------+ + | + | + +-------+-------+ + | NMTD_LINK_KEY | + +---------------+ + + */ + +#if DEBUG_CRYPT +static void check_prf_iters(uint32_t num_iters) +{ + if (num_iters == 0) + gf_log ("crypt", GF_LOG_DEBUG, + "bad number of prf iterations : %d", num_iters); +} +#else +#define check_prf_iters(num_iters) noop +#endif /* DEBUG_CRYPT */ + +unsigned char crypt_fake_oid[16] = + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +/* + * derive key in the counter mode using + * sha256-based HMAC as PRF, see + * NIST Special Publication 800-108, 5.1) + */ + +#define PRF_OUTPUT_SIZE SHA256_DIGEST_LENGTH + +static int32_t kderive_init(struct kderive_context *ctx, + const unsigned char *pkey, /* parent key */ + uint32_t pkey_size, /* parent key size */ + const unsigned char *idctx, /* id-context */ + uint32_t idctx_size, + crypt_key_type type /* type of child key */) +{ + unsigned char *pos; + uint32_t llen = strlen(crypt_keys[type].label); + /* + * Compoud the fixed input data for KDF: + * [i]_2 || Label || 0x00 || Id-Context || [L]_2), + * NIST SP 800-108, 5.1 + */ + ctx->fid_len = + sizeof(uint32_t) + + llen + + 1 + + idctx_size + + sizeof(uint32_t); + + ctx->fid = GF_CALLOC(ctx->fid_len, 1, gf_crypt_mt_key); + if (!ctx->fid) + return ENOMEM; + ctx->out_len = round_up(crypt_keys[type].len >> 3, + PRF_OUTPUT_SIZE); + ctx->out = GF_CALLOC(ctx->out_len, 1, gf_crypt_mt_key); + if (!ctx->out) { + GF_FREE(ctx->fid); + return ENOMEM; + } + ctx->pkey = pkey; + ctx->pkey_len = pkey_size; + ctx->ckey_len = crypt_keys[type].len; + + pos = ctx->fid; + + /* counter will be set up in kderive_rfn() */ + pos += sizeof(uint32_t); + + memcpy(pos, crypt_keys[type].label, llen); + pos += llen; + + /* set up zero octet */ + *pos = 0; + pos += 1; + + memcpy(pos, idctx, idctx_size); + pos += idctx_size; + + *((uint32_t *)pos) = htobe32(ctx->ckey_len); + + return 0; +} + +static void kderive_update(struct kderive_context *ctx) +{ + uint32_t i; + HMAC_CTX hctx; + unsigned char *pos = ctx->out; + uint32_t *p_iter = (uint32_t *)ctx->fid; + uint32_t num_iters = ctx->out_len / PRF_OUTPUT_SIZE; + + check_prf_iters(num_iters); + + HMAC_CTX_init(&hctx); + for (i = 0; i < num_iters; i++) { + /* + * update the iteration number in the fid + */ + *p_iter = htobe32(i); + HMAC_Init_ex(&hctx, + ctx->pkey, ctx->pkey_len >> 3, + EVP_sha256(), + NULL); + HMAC_Update(&hctx, ctx->fid, ctx->fid_len); + HMAC_Final(&hctx, pos, NULL); + + pos += PRF_OUTPUT_SIZE; + } + HMAC_CTX_cleanup(&hctx); +} + +static void kderive_final(struct kderive_context *ctx, unsigned char *child) +{ + memcpy(child, ctx->out, ctx->ckey_len >> 3); + GF_FREE(ctx->fid); + GF_FREE(ctx->out); + memset(ctx, 0, sizeof(*ctx)); +} + +/* + * derive per-volume key for object ids aithentication + */ +int32_t get_nmtd_vol_key(struct master_cipher_info *master) +{ + int32_t ret; + struct kderive_context ctx; + + ret = kderive_init(&ctx, + master->m_key, + master_key_size(), + crypt_fake_oid, sizeof(uuid_t), NMTD_VOL_KEY); + if (ret) + return ret; + kderive_update(&ctx); + kderive_final(&ctx, master->m_nmtd_key); + return 0; +} + +/* + * derive per-link key for aithentication of non-encrypted + * meta-data (nmtd) + */ +int32_t get_nmtd_link_key(loc_t *loc, + struct master_cipher_info *master, + unsigned char *result) +{ + int32_t ret; + struct kderive_context ctx; + + ret = kderive_init(&ctx, + master->m_nmtd_key, + nmtd_vol_key_size(), + (const unsigned char *)loc->path, + strlen(loc->path), NMTD_LINK_KEY); + if (ret) + return ret; + kderive_update(&ctx); + kderive_final(&ctx, result); + return 0; +} + +/* + * derive per-file key for encryption and authentication + * of encrypted part of metadata (emtd) + */ +int32_t get_emtd_file_key(struct crypt_inode_info *info, + struct master_cipher_info *master, + unsigned char *result) +{ + int32_t ret; + struct kderive_context ctx; + + ret = kderive_init(&ctx, + master->m_key, + master_key_size(), + info->oid, sizeof(uuid_t), EMTD_FILE_KEY); + if (ret) + return ret; + kderive_update(&ctx); + kderive_final(&ctx, result); + return 0; +} + +static int32_t data_key_type_by_size(uint32_t keysize, crypt_key_type *type) +{ + int32_t ret = 0; + switch (keysize) { + case 256: + *type = DATA_FILE_KEY_256; + break; + case 512: + *type = DATA_FILE_KEY_512; + break; + default: + gf_log("crypt", GF_LOG_ERROR, "Unsupported data key size %d", + keysize); + ret = ENOTSUP; + break; + } + return ret; +} + +/* + * derive per-file key for data encryption + */ +int32_t get_data_file_key(struct crypt_inode_info *info, + struct master_cipher_info *master, + uint32_t keysize, + unsigned char *key) +{ + int32_t ret; + struct kderive_context ctx; + crypt_key_type type; + + ret = data_key_type_by_size(keysize, &type); + if (ret) + return ret; + ret = kderive_init(&ctx, + master->m_key, + master_key_size(), + info->oid, sizeof(uuid_t), type); + if (ret) + return ret; + kderive_update(&ctx); + kderive_final(&ctx, key); + return 0; +} + +/* + * NOTE: Don't change existing keys: it will break compatibility; + */ +struct crypt_key crypt_keys[LAST_KEY_TYPE] = { + [MASTER_VOL_KEY] = + { .len = MASTER_VOL_KEY_SIZE << 3, + .label = "volume-master", + }, + [NMTD_VOL_KEY] = + { .len = NMTD_VOL_KEY_SIZE << 3, + .label = "volume-nmtd-key-generation" + }, + [NMTD_LINK_KEY] = + { .len = 128, + .label = "link-nmtd-authentication" + }, + [EMTD_FILE_KEY] = + { .len = 128, + .label = "file-emtd-encryption-and-auth" + }, + [DATA_FILE_KEY_256] = + { .len = 256, + .label = "file-data-encryption-256" + }, + [DATA_FILE_KEY_512] = + { .len = 512, + .label = "file-data-encryption-512" + } +}; + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ diff --git a/xlators/encryption/crypt/src/metadata.c b/xlators/encryption/crypt/src/metadata.c new file mode 100644 index 000000000..36b14c055 --- /dev/null +++ b/xlators/encryption/crypt/src/metadata.c @@ -0,0 +1,605 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "defaults.h" +#include "crypt-common.h" +#include "crypt.h" +#include "metadata.h" + +int32_t alloc_format(crypt_local_t *local, size_t size) +{ + if (size > 0) { + local->format = GF_CALLOC(1, size, gf_crypt_mt_mtd); + if (!local->format) + return ENOMEM; + } + local->format_size = size; + return 0; +} + +int32_t alloc_format_create(crypt_local_t *local) +{ + return alloc_format(local, new_format_size()); +} + +void free_format(crypt_local_t *local) +{ + GF_FREE(local->format); +} + +/* + * Check compatibility with extracted metadata + */ +static int32_t check_file_metadata(struct crypt_inode_info *info) +{ + struct object_cipher_info *object = &info->cinfo; + + if (info->nr_minor != CRYPT_XLATOR_ID) { + gf_log("crypt", GF_LOG_WARNING, + "unsupported minor subversion %d", info->nr_minor); + return EINVAL; + } + if (object->o_alg > LAST_CIPHER_ALG) { + gf_log("crypt", GF_LOG_WARNING, + "unsupported cipher algorithm %d", + object->o_alg); + return EINVAL; + } + if (object->o_mode > LAST_CIPHER_MODE) { + gf_log("crypt", GF_LOG_WARNING, + "unsupported cipher mode %d", + object->o_mode); + return EINVAL; + } + if (object->o_block_bits < CRYPT_MIN_BLOCK_BITS || + object->o_block_bits > CRYPT_MAX_BLOCK_BITS) { + gf_log("crypt", GF_LOG_WARNING, "unsupported block bits %d", + object->o_block_bits); + return EINVAL; + } + /* TBD: check data key size */ + return 0; +} + +static size_t format_size_v1(mtd_op_t op, size_t old_size) +{ + + switch (op) { + case MTD_CREATE: + return sizeof(struct mtd_format_v1); + case MTD_OVERWRITE: + return old_size; + case MTD_APPEND: + return old_size + NMTD_8_MAC_SIZE; + case MTD_CUT: + if (old_size > sizeof(struct mtd_format_v1)) + return old_size - NMTD_8_MAC_SIZE; + else + return 0; + default: + gf_log("crypt", GF_LOG_WARNING, "Bad mtd operation"); + return 0; + } +} + +/* + * Calculate size of the updated format string. + * Returned zero means that we don't need to update the format string. + */ +size_t format_size(mtd_op_t op, size_t old_size) +{ + size_t versioned; + + versioned = mtd_loaders[current_mtd_loader()].format_size(op, + old_size - sizeof(struct crypt_format)); + if (versioned != 0) + return versioned + sizeof(struct crypt_format); + return 0; +} + +/* + * size of the format string of newly created file (nr_links = 1) + */ +size_t new_format_size(void) +{ + return format_size(MTD_CREATE, 0); +} + +/* + * Calculate per-link MAC by pathname + */ +static int32_t calc_link_mac_v1(struct mtd_format_v1 *fmt, + loc_t *loc, + unsigned char *result, + struct crypt_inode_info *info, + struct master_cipher_info *master) +{ + int32_t ret; + unsigned char nmtd_link_key[16]; + CMAC_CTX *cctx; + size_t len; + + ret = get_nmtd_link_key(loc, master, nmtd_link_key); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, "Can not get nmtd link key"); + return -1; + } + cctx = CMAC_CTX_new(); + if (!cctx) { + gf_log("crypt", GF_LOG_ERROR, "CMAC_CTX_new failed"); + return -1; + } + ret = CMAC_Init(cctx, nmtd_link_key, sizeof(nmtd_link_key), + EVP_aes_128_cbc(), 0); + if (!ret) { + gf_log("crypt", GF_LOG_ERROR, "CMAC_Init failed"); + CMAC_CTX_free(cctx); + return -1; + } + ret = CMAC_Update(cctx, get_NMTD_V1(info), SIZE_OF_NMTD_V1); + if (!ret) { + gf_log("crypt", GF_LOG_ERROR, "CMAC_Update failed"); + CMAC_CTX_free(cctx); + return -1; + } + ret = CMAC_Final(cctx, result, &len); + CMAC_CTX_free(cctx); + if (!ret) { + gf_log("crypt", GF_LOG_ERROR, "CMAC_Final failed"); + return -1; + } + return 0; +} + +/* + * Create per-link MAC of index @idx by pathname + */ +static int32_t create_link_mac_v1(struct mtd_format_v1 *fmt, + uint32_t idx, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master) +{ + int32_t ret; + unsigned char *mac; + unsigned char cmac[16]; + + mac = get_NMTD_V1_MAC(fmt) + idx * SIZE_OF_NMTD_V1_MAC; + + ret = calc_link_mac_v1(fmt, loc, cmac, info, master); + if (ret) + return -1; + memcpy(mac, cmac, SIZE_OF_NMTD_V1_MAC); + return 0; +} + +static int32_t create_format_v1(unsigned char *wire, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master) +{ + int32_t ret; + struct mtd_format_v1 *fmt; + unsigned char mtd_key[16]; + AES_KEY EMTD_KEY; + unsigned char nmtd_link_key[16]; + uint32_t ad; + GCM128_CONTEXT *gctx; + + fmt = (struct mtd_format_v1 *)wire; + + fmt->minor_id = info->nr_minor; + fmt->alg_id = AES_CIPHER_ALG; + fmt->dkey_factor = master->m_dkey_size >> KEY_FACTOR_BITS; + fmt->block_bits = master->m_block_bits; + fmt->mode_id = master->m_mode; + /* + * retrieve keys for the parts of metadata + */ + ret = get_emtd_file_key(info, master, mtd_key); + if (ret) + return ret; + ret = get_nmtd_link_key(loc, master, nmtd_link_key); + if (ret) + return ret; + + AES_set_encrypt_key(mtd_key, sizeof(mtd_key)*8, &EMTD_KEY); + + gctx = CRYPTO_gcm128_new(&EMTD_KEY, (block128_f)AES_encrypt); + + /* TBD: Check return values */ + + CRYPTO_gcm128_setiv(gctx, info->oid, sizeof(uuid_t)); + + ad = htole32(MTD_LOADER_V1); + ret = CRYPTO_gcm128_aad(gctx, (const unsigned char *)&ad, sizeof(ad)); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_aad failed"); + CRYPTO_gcm128_release(gctx); + return ret; + } + ret = CRYPTO_gcm128_encrypt(gctx, + get_EMTD_V1(fmt), + get_EMTD_V1(fmt), + SIZE_OF_EMTD_V1); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_encrypt failed"); + CRYPTO_gcm128_release(gctx); + return ret; + } + /* + * set MAC of encrypted part of metadata + */ + CRYPTO_gcm128_tag(gctx, get_EMTD_V1_MAC(fmt), SIZE_OF_EMTD_V1_MAC); + CRYPTO_gcm128_release(gctx); + /* + * set the first MAC of non-encrypted part of metadata + */ + return create_link_mac_v1(fmt, 0, loc, info, master); +} + +/* + * Called by fops: + * ->create(); + * ->link(); + * + * Pack common and version-specific parts of file's metadata + * Pre-conditions: @info contains valid object-id. + */ +int32_t create_format(unsigned char *wire, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master) +{ + struct crypt_format *fmt = (struct crypt_format *)wire; + + fmt->loader_id = current_mtd_loader(); + + wire += sizeof(struct crypt_format); + return mtd_loaders[current_mtd_loader()].create_format(wire, loc, + info, master); +} + +/* + * Append or overwrite per-link mac of @mac_idx index + * in accordance with the new pathname + */ +int32_t appov_link_mac_v1(unsigned char *new, + unsigned char *old, + uint32_t old_size, + int32_t mac_idx, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local) +{ + memcpy(new, old, old_size); + return create_link_mac_v1((struct mtd_format_v1 *)new, mac_idx, + loc, info, master); +} + +/* + * Cut per-link mac of @mac_idx index + */ +static int32_t cut_link_mac_v1(unsigned char *new, + unsigned char *old, + uint32_t old_size, + int32_t mac_idx, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local) +{ + memcpy(new, + old, + sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * (mac_idx - 1)); + + memcpy(new + sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * (mac_idx - 1), + old + sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * mac_idx, + old_size - (sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * mac_idx)); + return 0; +} + +int32_t update_format_v1(unsigned char *new, + unsigned char *old, + size_t old_len, + int32_t mac_idx, /* of old name */ + mtd_op_t op, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local) +{ + switch (op) { + case MTD_APPEND: + mac_idx = 1 + (old_len - sizeof(struct mtd_format_v1))/8; + case MTD_OVERWRITE: + return appov_link_mac_v1(new, old, old_len, mac_idx, + loc, info, master, local); + case MTD_CUT: + return cut_link_mac_v1(new, old, old_len, mac_idx, + loc, info, master, local); + default: + gf_log("crypt", GF_LOG_ERROR, "Bad mtd operation %d", op); + return -1; + } +} + +/* + * Called by fops: + * + * ->link() + * ->unlink() + * ->rename() + * + */ +int32_t update_format(unsigned char *new, + unsigned char *old, + size_t old_len, + int32_t mac_idx, + mtd_op_t op, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local) +{ + if (!new) + return 0; + memcpy(new, old, sizeof(struct crypt_format)); + + old += sizeof(struct crypt_format); + new += sizeof(struct crypt_format); + old_len -= sizeof(struct crypt_format); + + return mtd_loaders[current_mtd_loader()].update_format(new, old, + old_len, + mac_idx, op, + loc, info, + master, local); +} + +/* + * Perform preliminary checks of found metadata + * Return < 0 on errors; + * Return number of object-id MACs (>= 1) on success + */ +int32_t check_format_v1(uint32_t len, unsigned char *wire) +{ + uint32_t nr_links; + + if (len < sizeof(struct mtd_format_v1)) { + gf_log("crypt", GF_LOG_ERROR, + "v1-loader: bad metadata size %d", len); + goto error; + } + len -= sizeof(struct mtd_format_v1); + if (len % sizeof(nmtd_8_mac_t)) { + gf_log("crypt", GF_LOG_ERROR, + "v1-loader: bad metadata format"); + goto error; + } + nr_links = 1 + len / sizeof(nmtd_8_mac_t); + if (nr_links > _POSIX_LINK_MAX) + goto error; + return nr_links; + error: + return EIO; +} + +/* + * Verify per-link MAC specified by index @idx + * + * return: + * -1 on errors; + * 0 on failed verification; + * 1 on sucessful verification + */ +static int32_t verify_link_mac_v1(struct mtd_format_v1 *fmt, + uint32_t idx /* index of the mac to verify */, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master) +{ + int32_t ret; + unsigned char *mac; + unsigned char cmac[16]; + + mac = get_NMTD_V1_MAC(fmt) + idx * SIZE_OF_NMTD_V1_MAC; + + ret = calc_link_mac_v1(fmt, loc, cmac, info, master); + if (ret) + return -1; + if (memcmp(cmac, mac, SIZE_OF_NMTD_V1_MAC)) + return 0; + return 1; +} + +/* + * Lookup per-link MAC by pathname. + * + * return index of the MAC, if it was found; + * return < 0 on errors, or if the MAC wasn't found + */ +static int32_t lookup_link_mac_v1(struct mtd_format_v1 *fmt, + uint32_t nr_macs, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master) +{ + int32_t ret; + uint32_t idx; + + for (idx = 0; idx < nr_macs; idx++) { + ret = verify_link_mac_v1(fmt, idx, loc, info, master); + if (ret < 0) + return ret; + if (ret > 0) + return idx; + } + return -ENOENT; +} + +/* + * Extract version-specific part of metadata + */ +static int32_t open_format_v1(unsigned char *wire, + int32_t len, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local, + gf_boolean_t load_info) +{ + int32_t ret; + int32_t num_nmtd_macs; + struct mtd_format_v1 *fmt; + unsigned char mtd_key[16]; + AES_KEY EMTD_KEY; + GCM128_CONTEXT *gctx; + uint32_t ad; + emtd_8_mac_t gmac; + struct object_cipher_info *object; + + num_nmtd_macs = check_format_v1(len, wire); + if (num_nmtd_macs <= 0) + return EIO; + fmt = (struct mtd_format_v1 *)wire; + + ret = lookup_link_mac_v1(fmt, num_nmtd_macs, loc, info, master); + if (ret < 0) { + gf_log("crypt", GF_LOG_ERROR, "NMTD verification failed"); + return EINVAL; + } + local->mac_idx = ret; + if (load_info == _gf_false) + /* the case of partial open */ + return 0; + + object = &info->cinfo; + + ret = get_emtd_file_key(info, master, mtd_key); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, "Can not retrieve metadata key"); + return ret; + } + /* + * decrypt encrypted meta-data + */ + ret = AES_set_encrypt_key(mtd_key, sizeof(mtd_key)*8, &EMTD_KEY); + if (ret < 0) { + gf_log("crypt", GF_LOG_ERROR, "Can not set encrypt key"); + return ret; + } + gctx = CRYPTO_gcm128_new(&EMTD_KEY, (block128_f)AES_encrypt); + if (!gctx) { + gf_log("crypt", GF_LOG_ERROR, "Can not alloc gcm context"); + return ENOMEM; + } + CRYPTO_gcm128_setiv(gctx, info->oid, sizeof(uuid_t)); + + ad = htole32(MTD_LOADER_V1); + ret = CRYPTO_gcm128_aad(gctx, (const unsigned char *)&ad, sizeof(ad)); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_aad failed"); + CRYPTO_gcm128_release(gctx); + return ret; + } + ret = CRYPTO_gcm128_decrypt(gctx, + get_EMTD_V1(fmt), + get_EMTD_V1(fmt), + SIZE_OF_EMTD_V1); + if (ret) { + gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_decrypt failed"); + CRYPTO_gcm128_release(gctx); + return ret; + } + /* + * verify metadata + */ + CRYPTO_gcm128_tag(gctx, gmac, sizeof(gmac)); + CRYPTO_gcm128_release(gctx); + if (memcmp(gmac, get_EMTD_V1_MAC(fmt), SIZE_OF_EMTD_V1_MAC)) { + gf_log("crypt", GF_LOG_ERROR, "EMTD verification failed"); + return EINVAL; + } + /* + * load verified metadata to the private part of inode + */ + info->nr_minor = fmt->minor_id; + + object->o_alg = fmt->alg_id; + object->o_dkey_size = fmt->dkey_factor << KEY_FACTOR_BITS; + object->o_block_bits = fmt->block_bits; + object->o_mode = fmt->mode_id; + + return check_file_metadata(info); +} + +/* + * perform metadata authentication against @loc->path; + * extract crypt-specific attribtes and populate @info + * with them (optional) + */ +int32_t open_format(unsigned char *str, + int32_t len, + loc_t *loc, + struct crypt_inode_info *info, + struct master_cipher_info *master, + crypt_local_t *local, + gf_boolean_t load_info) +{ + struct crypt_format *fmt; + if (len < sizeof(*fmt)) { + gf_log("crypt", GF_LOG_ERROR, "Bad core format"); + return EIO; + } + fmt = (struct crypt_format *)str; + + if (fmt->loader_id >= LAST_MTD_LOADER) { + gf_log("crypt", GF_LOG_ERROR, + "Unsupported loader id %d", fmt->loader_id); + return EINVAL; + } + str += sizeof(*fmt); + len -= sizeof(*fmt); + + return mtd_loaders[fmt->loader_id].open_format(str, + len, + loc, + info, + master, + local, + load_info); +} + +struct crypt_mtd_loader mtd_loaders [LAST_MTD_LOADER] = { + [MTD_LOADER_V1] = + {.format_size = format_size_v1, + .create_format = create_format_v1, + .open_format = open_format_v1, + .update_format = update_format_v1 + } +}; + +/* + Local variables: + c-indentation-style: "K&R" + mode-name: "LC" + c-basic-offset: 8 + tab-width: 8 + fill-column: 80 + scroll-step: 1 + End: +*/ diff --git a/xlators/encryption/crypt/src/metadata.h b/xlators/encryption/crypt/src/metadata.h new file mode 100644 index 000000000..a92f149ef --- /dev/null +++ b/xlators/encryption/crypt/src/metadata.h @@ -0,0 +1,74 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __METADATA_H__ +#define __METADATA_H__ + +#define NMTD_8_MAC_SIZE (8) +#define EMTD_8_MAC_SIZE (8) + +typedef uint8_t nmtd_8_mac_t[NMTD_8_MAC_SIZE]; +typedef uint8_t emtd_8_mac_t[EMTD_8_MAC_SIZE] ; + +/* + * Version "v1" of file's metadata. + * Metadata of this version has 4 components: + * + * 1) EMTD (Encrypted part of MeTaData); + * 2) NMTD (Non-encrypted part of MeTaData); + * 3) EMTD_MAC; (EMTD Message Authentication Code); + * 4) Array of per-link NMTD MACs (for every (hard)link it includes + * exactly one MAC) + */ +struct mtd_format_v1 { + /* EMTD, encrypted part of meta-data */ + uint8_t alg_id; /* cipher algorithm id (only AES for now) */ + uint8_t mode_id; /* cipher mode id; (only XTS for now) */ + uint8_t block_bits; /* encoded block size */ + uint8_t minor_id; /* client translator id */ + uint8_t dkey_factor; /* encoded size of the data key */ + /* MACs */ + emtd_8_mac_t gmac; /* MAC of the encrypted meta-data, 8 bytes */ + nmtd_8_mac_t omac; /* per-link MACs of the non-encrypted + * meta-data: at least one such MAC is always + * present */ +} __attribute__((packed)); + +/* + * NMTD, the non-encrypted part of metadata of version "v1" + * is file's gfid, which is generated on trusted machines. + */ +#define SIZE_OF_NMTD_V1 (sizeof(uuid_t)) +#define SIZE_OF_EMTD_V1 (offsetof(struct mtd_format_v1, gmac) - \ + offsetof(struct mtd_format_v1, alg_id)) +#define SIZE_OF_NMTD_V1_MAC (NMTD_8_MAC_SIZE) +#define SIZE_OF_EMTD_V1_MAC (EMTD_8_MAC_SIZE) + +static inline unsigned char *get_EMTD_V1(struct mtd_format_v1 *format) +{ + return &format->alg_id; +} + +static inline unsigned char *get_NMTD_V1(struct crypt_inode_info *info) +{ + return info->oid; +} + +static inline unsigned char *get_EMTD_V1_MAC(struct mtd_format_v1 *format) +{ + return format->gmac; +} + +static inline unsigned char *get_NMTD_V1_MAC(struct mtd_format_v1 *format) +{ + return format->omac; +} + +#endif /* __METADATA_H__ */ diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am index 6a73301d7..d2f5ef192 100644 --- a/xlators/features/Makefile.am +++ b/xlators/features/Makefile.am @@ -1,4 +1,4 @@ SUBDIRS = locks quota read-only mac-compat quiesce marker index \ - protect $(GLUPY_SUBDIR) # trash path-converter # filter + protect compress changelog gfid-access $(GLUPY_SUBDIR) qemu-block # trash path-converter # filter CLEANFILES = diff --git a/xlators/features/changelog/Makefile.am b/xlators/features/changelog/Makefile.am new file mode 100644 index 000000000..153bb6850 --- /dev/null +++ b/xlators/features/changelog/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src lib + +CLEANFILES = diff --git a/xlators/storage/bd_map/Makefile.am b/xlators/features/changelog/lib/Makefile.am index a985f42a8..a985f42a8 100644 --- a/xlators/storage/bd_map/Makefile.am +++ b/xlators/features/changelog/lib/Makefile.am diff --git a/xlators/features/changelog/lib/examples/c/get-changes.c b/xlators/features/changelog/lib/examples/c/get-changes.c new file mode 100644 index 000000000..14562585a --- /dev/null +++ b/xlators/features/changelog/lib/examples/c/get-changes.c @@ -0,0 +1,87 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/** + * get set of new changes every 10 seconds (just print the file names) + * + * Compile it using: + * gcc -o getchanges `pkg-config --cflags libgfchangelog` get-changes.c \ + * `pkg-config --libs libgfchangelog` + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/un.h> +#include <limits.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <errno.h> + +#include "changelog.h" + +#define handle_error(fn) \ + printf ("%s (reason: %s)\n", fn, strerror (errno)) + +int +main (int argc, char ** argv) +{ + int i = 0; + int ret = 0; + ssize_t nr_changes = 0; + ssize_t changes = 0; + char fbuf[PATH_MAX] = {0,}; + + /* get changes for brick "/home/vshankar/export/yow/yow-1" */ + ret = gf_changelog_register ("/home/vshankar/export/yow/yow-1", + "/tmp/scratch", "/tmp/change.log", 9, 5); + if (ret) { + handle_error ("register failed"); + goto out; + } + + while (1) { + i = 0; + nr_changes = gf_changelog_scan (); + if (nr_changes < 0) { + handle_error ("scan(): "); + break; + } + + if (nr_changes == 0) + goto next; + + printf ("Got %ld changelog files\n", nr_changes); + + while ( (changes = + gf_changelog_next_change (fbuf, PATH_MAX)) > 0) { + printf ("changelog file [%d]: %s\n", ++i, fbuf); + + /* process changelog */ + /* ... */ + /* ... */ + /* ... */ + /* done processing */ + + ret = gf_changelog_done (fbuf); + if (ret) + handle_error ("gf_changelog_done"); + } + + if (changes == -1) + handle_error ("gf_changelog_next_change"); + + next: + sleep (10); + } + + out: + return ret; +} diff --git a/xlators/features/changelog/lib/examples/python/changes.py b/xlators/features/changelog/lib/examples/python/changes.py new file mode 100644 index 000000000..d21db8eab --- /dev/null +++ b/xlators/features/changelog/lib/examples/python/changes.py @@ -0,0 +1,32 @@ +#!/usr/bin/python + +import os +import sys +import time +import libgfchangelog + +cl = libgfchangelog.Changes() + +def get_changes(brick, scratch_dir, log_file, log_level, interval): + change_list = [] + try: + cl.cl_register(brick, scratch_dir, log_file, log_level) + while True: + cl.cl_scan() + change_list = cl.cl_getchanges() + if change_list: + print change_list + for change in change_list: + print('done with %s' % (change)) + cl.cl_done(change) + time.sleep(interval) + except OSError: + ex = sys.exc_info()[1] + print ex + +if __name__ == '__main__': + if len(sys.argv) != 5: + print("usage: %s <brick> <scratch-dir> <log-file> <fetch-interval>" + % (sys.argv[0])) + sys.exit(1) + get_changes(sys.argv[1], sys.argv[2], sys.argv[3], 9, int(sys.argv[4])) diff --git a/xlators/features/changelog/lib/examples/python/libgfchangelog.py b/xlators/features/changelog/lib/examples/python/libgfchangelog.py new file mode 100644 index 000000000..68ec3baf1 --- /dev/null +++ b/xlators/features/changelog/lib/examples/python/libgfchangelog.py @@ -0,0 +1,64 @@ +import os +from ctypes import * +from ctypes.util import find_library + +class Changes(object): + libgfc = CDLL(find_library("gfchangelog"), use_errno=True) + + @classmethod + def geterrno(cls): + return get_errno() + + @classmethod + def raise_oserr(cls): + errn = cls.geterrno() + raise OSError(errn, os.strerror(errn)) + + @classmethod + def _get_api(cls, call): + return getattr(cls.libgfc, call) + + @classmethod + def cl_register(cls, brick, path, log_file, log_level, retries = 0): + ret = cls._get_api('gf_changelog_register')(brick, path, + log_file, log_level, retries) + if ret == -1: + cls.raise_oserr() + + @classmethod + def cl_scan(cls): + ret = cls._get_api('gf_changelog_scan')() + if ret == -1: + cls.raise_oserr() + + @classmethod + def cl_startfresh(cls): + ret = cls._get_api('gf_changelog_start_fresh')() + if ret == -1: + cls.raise_oserr() + + @classmethod + def cl_getchanges(cls): + """ remove hardcoding for path name length """ + def clsort(f): + return f.split('.')[-1] + changes = [] + buf = create_string_buffer('\0', 4096) + call = cls._get_api('gf_changelog_next_change') + + while True: + ret = call(buf, 4096) + if ret in (0, -1): + break; + changes.append(buf.raw[:ret-1]) + if ret == -1: + cls.raise_oserr() + # cleanup tracker + cls.cl_startfresh() + return sorted(changes, key=clsort) + + @classmethod + def cl_done(cls, clfile): + ret = cls._get_api('gf_changelog_done')(clfile) + if ret == -1: + cls.raise_oserr() diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am new file mode 100644 index 000000000..fbaaea628 --- /dev/null +++ b/xlators/features/changelog/lib/src/Makefile.am @@ -0,0 +1,37 @@ +libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \ + -DDATADIR=\"$(localstatedir)\" + +libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -fpic \ + -I../../../src/ -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/xlators/features/changelog/src \ + -DDATADIR=\"$(localstatedir)\" + +libgfchangelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(GF_GLUSTERFS_LIBS) + +libgfchangelog_la_LDFLAGS = $(GF_LDFLAGS) + +libgfchangelogdir = $(includedir)/glusterfs/gfchangelog +lib_LTLIBRARIES = libgfchangelog.la + +CONTRIB_BUILDDIR = $(top_builddir)/contrib + +libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-process.c \ + gf-changelog-helpers.c $(CONTRIBDIR)/uuid/clear.c \ + $(CONTRIBDIR)/uuid/copy.c $(CONTRIBDIR)/uuid/gen_uuid.c \ + $(CONTRIBDIR)/uuid/pack.c $(CONTRIBDIR)/uuid/parse.c \ + $(CONTRIBDIR)/uuid/unparse.c $(CONTRIBDIR)/uuid/uuid_time.c \ + $(CONTRIBDIR)/uuid/compare.c $(CONTRIBDIR)/uuid/isnull.c \ + $(CONTRIBDIR)/uuid/unpack.c + +noinst_HEADERS = gf-changelog-helpers.h $(CONTRIBDIR)/uuid/uuidd.h \ + $(CONTRIBDIR)/uuid/uuid.h $(CONTRIBDIR)/uuid/uuidP.h \ + $(CONTRIB_BUILDDIR)/uuid/uuid_types.h + +libgfchangelog_HEADERS = changelog.h + +CLEANFILES = +CONFIG_CLEAN_FILES = $(CONTRIB_BUILDDIR)/uuid/uuid_types.h + +$(top_builddir)/libglusterfs/src/libglusterfs.la: + $(MAKE) -C $(top_builddir)/libglusterfs/src/ all diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/changelog.h new file mode 100644 index 000000000..5cddfb583 --- /dev/null +++ b/xlators/features/changelog/lib/src/changelog.h @@ -0,0 +1,31 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _GF_CHANGELOG_H +#define _GF_CHANGELOG_H + +/* API set */ + +int +gf_changelog_register (char *brick_path, char *scratch_dir, + char *log_file, int log_levl, int max_reconnects); +ssize_t +gf_changelog_scan (); + +int +gf_changelog_start_fresh (); + +ssize_t +gf_changelog_next_change (char *bufptr, size_t maxlen); + +int +gf_changelog_done (char *file); + +#endif diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.c b/xlators/features/changelog/lib/src/gf-changelog-helpers.c new file mode 100644 index 000000000..1eef8bf04 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.c @@ -0,0 +1,180 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-mem-types.h" +#include "gf-changelog-helpers.h" + +ssize_t gf_changelog_read_path (int fd, char *buffer, size_t bufsize) +{ + return read (fd, buffer, bufsize); +} + +size_t +gf_changelog_write (int fd, char *buffer, size_t len) +{ + ssize_t size = 0; + size_t writen = 0; + + while (writen < len) { + size = write (fd, + buffer + writen, len - writen); + if (size <= 0) + break; + + writen += size; + } + + return writen; +} + +void +gf_rfc3986_encode (unsigned char *s, char *enc, char *estr) +{ + for (; *s; s++) { + if (estr[*s]) + sprintf(enc, "%c", estr[*s]); + else + sprintf(enc, "%%%02X", *s); + while (*++enc); + } +} + +/** + * thread safe version of readline with buffering + * (taken from Unix Network Programming Volume I, W.R. Stevens) + * + * This is favoured over fgets() as we'd need to ftruncate() + * (see gf_changelog_scan() API) to record new changelog files. + * stream open functions does have a truncate like api (although + * that can be done via @fflush(fp), @ftruncate(fd) and @fseek(fp), + * but this involves mixing POSIX file descriptors and stream FILE *). + * + * NOTE: This implmentation still does work with more than one fd's + * used to perform gf_readline(). For this very reason it's not + * made a part of libglusterfs. + */ + +static pthread_key_t rl_key; +static pthread_once_t rl_once = PTHREAD_ONCE_INIT; + +static void +readline_destructor (void *ptr) +{ + GF_FREE (ptr); +} + +static void +readline_once (void) +{ + pthread_key_create (&rl_key, readline_destructor); +} + +static ssize_t +my_read (read_line_t *tsd, int fd, char *ptr) +{ + if (tsd->rl_cnt <= 0) { + if ( (tsd->rl_cnt = read (fd, tsd->rl_buf, MAXLINE)) < 0 ) + return -1; + else if (tsd->rl_cnt == 0) + return 0; + tsd->rl_bufptr = tsd->rl_buf; + } + + tsd->rl_cnt--; + *ptr = *tsd->rl_bufptr++; + return 1; +} + +static int +gf_readline_init_once (read_line_t **tsd) +{ + if (pthread_once (&rl_once, readline_once) != 0) + return -1; + + *tsd = pthread_getspecific (rl_key); + if (*tsd) + goto out; + + *tsd = GF_CALLOC (1, sizeof (**tsd), + gf_changelog_mt_libgfchangelog_rl_t); + if (!*tsd) + return -1; + + if (pthread_setspecific (rl_key, *tsd) != 0) + return -1; + + out: + return 0; +} + +ssize_t +gf_readline (int fd, void *vptr, size_t maxlen) +{ + size_t n = 0; + size_t rc = 0; + char c = ' '; + char *ptr = NULL; + read_line_t *tsd = NULL; + + if (gf_readline_init_once (&tsd)) + return -1; + + ptr = vptr; + for (n = 1; n < maxlen; n++) { + if ( (rc = my_read (tsd, fd, &c)) == 1 ) { + *ptr++ = c; + if (c == '\n') + break; + } else if (rc == 0) { + *ptr = '\0'; + return (n - 1); + } else + return -1; + } + + *ptr = '\0'; + return n; + +} + +off_t +gf_lseek (int fd, off_t offset, int whence) +{ + off_t off = 0; + read_line_t *tsd = NULL; + + if (gf_readline_init_once (&tsd)) + return -1; + + if ( (off = lseek (fd, offset, whence)) == -1) + return -1; + + tsd->rl_cnt = 0; + tsd->rl_bufptr = tsd->rl_buf; + + return off; +} + +int +gf_ftruncate (int fd, off_t length) +{ + read_line_t *tsd = NULL; + + if (gf_readline_init_once (&tsd)) + return -1; + + if (ftruncate (fd, 0)) + return -1; + + tsd->rl_cnt = 0; + tsd->rl_bufptr = tsd->rl_buf; + + return 0; +} diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h new file mode 100644 index 000000000..3aa6ed7b8 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h @@ -0,0 +1,97 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _GF_CHANGELOG_HELPERS_H +#define _GF_CHANGELOG_HELPERS_H + +#include <unistd.h> +#include <dirent.h> +#include <limits.h> +#include <pthread.h> + +#include <xlator.h> + +#define GF_CHANGELOG_TRACKER "tracker" + +#define GF_CHANGELOG_CURRENT_DIR ".current" +#define GF_CHANGELOG_PROCESSED_DIR ".processed" +#define GF_CHANGELOG_PROCESSING_DIR ".processing" + +#ifndef MAXLINE +#define MAXLINE 4096 +#endif + +#define GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, len) do { \ + memcpy (ascii + off, ptr, len); \ + off += len; \ + } while (0) + +typedef struct read_line { + int rl_cnt; + char *rl_bufptr; + char rl_buf[MAXLINE]; +} read_line_t; + +typedef struct gf_changelog { + xlator_t *this; + + /* 'processing' directory stream */ + DIR *gfc_dir; + + /* fd to the tracker file */ + int gfc_fd; + + /* connection retries */ + int gfc_connretries; + + char gfc_sockpath[PATH_MAX]; + + char gfc_brickpath[PATH_MAX]; + + /* socket for recieving notifications */ + int gfc_sockfd; + + char *gfc_working_dir; + + /* RFC 3986 string encoding */ + char rfc3986[256]; + + char gfc_current_dir[PATH_MAX]; + char gfc_processed_dir[PATH_MAX]; + char gfc_processing_dir[PATH_MAX]; + + pthread_t gfc_changelog_processor; +} gf_changelog_t; + +int +gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc); + +void * +gf_changelog_process (void *data); + +ssize_t +gf_changelog_read_path (int fd, char *buffer, size_t bufsize); + +void +gf_rfc3986_encode (unsigned char *s, char *enc, char *estr); + +size_t +gf_changelog_write (int fd, char *buffer, size_t len); + +ssize_t +gf_readline (int fd, void *vptr, size_t maxlen); + +int +gf_ftruncate (int fd, off_t length); + +off_t +gf_lseek (int fd, off_t offset, int whence); + +#endif diff --git a/xlators/features/changelog/lib/src/gf-changelog-process.c b/xlators/features/changelog/lib/src/gf-changelog-process.c new file mode 100644 index 000000000..df7204931 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog-process.c @@ -0,0 +1,571 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <unistd.h> +#include <pthread.h> + +#include "uuid.h" +#include "globals.h" +#include "glusterfs.h" + +#include "gf-changelog-helpers.h" + +/* from the changelog translator */ +#include "changelog-misc.h" + +extern int byebye; + +/** + * number of gfid records after fop number + */ +int nr_gfids[] = { + [GF_FOP_MKNOD] = 1, + [GF_FOP_MKDIR] = 1, + [GF_FOP_UNLINK] = 1, + [GF_FOP_RMDIR] = 1, + [GF_FOP_SYMLINK] = 1, + [GF_FOP_RENAME] = 2, + [GF_FOP_LINK] = 1, + [GF_FOP_CREATE] = 1, +}; + +static char * +binary_to_ascii (uuid_t uuid) +{ + return uuid_utoa (uuid); +} + +static char * +conv_noop (char *ptr) { return ptr; } + +#define VERIFY_SEPARATOR(ptr, plen, perr) \ + { \ + if (*(ptr + plen) != '\0') { \ + perr = 1; \ + break; \ + } \ + } + +#define MOVER_MOVE(mover, nleft, bytes) \ + { \ + mover += bytes; \ + nleft -= bytes; \ + } \ + +#define PARSE_GFID(mov, ptr, le, fn, perr) \ + { \ + VERIFY_SEPARATOR (mov, le, perr); \ + ptr = fn (mov); \ + if (!ptr) { \ + perr = 1; \ + break; \ + } \ + } + +#define FILL_AND_MOVE(pt, buf, of, mo, nl, le) \ + { \ + GF_CHANGELOG_FILL_BUFFER (pt, buf, of, strlen (pt)); \ + MOVER_MOVE (mo, nl, le); \ + } + + +#define PARSE_GFID_MOVE(ptr, uuid, mover, nleft, perr) \ + { \ + memcpy (uuid, mover, sizeof (uuid_t)); \ + ptr = binary_to_ascii (uuid); \ + if (!ptr) { \ + perr = 1; \ + break; \ + } \ + MOVER_MOVE (mover, nleft, sizeof (uuid_t)); \ + } \ + +#define LINE_BUFSIZE 3*PATH_MAX /* enough buffer for extra chars too */ + +/** + * using mmap() makes parsing easy. fgets() cannot be used here as + * the binary gfid could contain a line-feed (0x0A), in that case fgets() + * would read an incomplete line and parsing would fail. using POSIX fds + * would result is additional code to maintain state in case of partial + * reads of data (where multiple entries do not fit extirely in the buffer). + * + * mmap() gives the flexibility of pointing to an offset in the file + * without us worrying about reading it in memory (VM does that for us for + * free). + */ + +static int +gf_changelog_parse_binary (xlator_t *this, + gf_changelog_t *gfc, int from_fd, int to_fd, + size_t start_offset, struct stat *stbuf) + +{ + int ret = -1; + off_t off = 0; + off_t nleft = 0; + uuid_t uuid = {0,}; + char *ptr = NULL; + char *bname_start = NULL; + char *bname_end = NULL; + char *mover = NULL; + char *start = NULL; + char current_mover = ' '; + size_t blen = 0; + int parse_err = 0; + char ascii[LINE_BUFSIZE] = {0,}; + + nleft = stbuf->st_size; + + start = (char *) mmap (NULL, nleft, + PROT_READ, MAP_PRIVATE, from_fd, 0); + if (!start) { + gf_log (this->name, GF_LOG_ERROR, + "mmap() error (reason: %s)", strerror (errno)); + goto out; + } + + mover = start; + + MOVER_MOVE (mover, nleft, start_offset); + + while (nleft > 0) { + + off = blen = 0; + ptr = bname_start = bname_end = NULL; + + current_mover = *mover; + + switch (current_mover) { + case 'D': + case 'M': + MOVER_MOVE (mover, nleft, 1); + PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err); + + break; + + case 'E': + MOVER_MOVE (mover, nleft, 1); + PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err); + + bname_start = mover; + if ( (bname_end = strchr (mover, '\n')) == NULL ) { + parse_err = 1; + break; + } + + blen = bname_end - bname_start; + MOVER_MOVE (mover, nleft, blen); + + break; + + default: + parse_err = 1; + } + + if (parse_err) + break; + + GF_CHANGELOG_FILL_BUFFER (¤t_mover, ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER (ptr, ascii, off, strlen (ptr)); + if (blen) + GF_CHANGELOG_FILL_BUFFER (bname_start, + ascii, off, blen); + GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1); + + if (gf_changelog_write (to_fd, ascii, off) != off) { + gf_log (this->name, GF_LOG_ERROR, + "processing binary changelog failed due to " + " error in writing ascii change (reason: %s)", + strerror (errno)); + break; + } + + MOVER_MOVE (mover, nleft, 1); + } + + if ( (nleft == 0) && (!parse_err)) + ret = 0; + + if (munmap (start, stbuf->st_size)) + gf_log (this->name, GF_LOG_ERROR, + "munmap() error (reason: %s)", strerror (errno)); + out: + return ret; +} + +/** + * ascii decoder: + * - separate out one entry from another + * - use fop name rather than fop number + */ +static int +gf_changelog_parse_ascii (xlator_t *this, + gf_changelog_t *gfc, int from_fd, int to_fd, + size_t start_offset, struct stat *stbuf) +{ + int ng = 0; + int ret = -1; + int fop = 0; + int len = 0; + off_t off = 0; + off_t nleft = 0; + char *ptr = NULL; + char *eptr = NULL; + char *start = NULL; + char *mover = NULL; + int parse_err = 0; + char current_mover = ' '; + char ascii[LINE_BUFSIZE] = {0,}; + const char *fopname = NULL; + + nleft = stbuf->st_size; + + start = (char *) mmap (NULL, nleft, + PROT_READ, MAP_PRIVATE, from_fd, 0); + if (!start) { + gf_log (this->name, GF_LOG_ERROR, + "mmap() error (reason: %s)", strerror (errno)); + goto out; + } + + mover = start; + + MOVER_MOVE (mover, nleft, start_offset); + + while (nleft > 0) { + off = 0; + current_mover = *mover; + + GF_CHANGELOG_FILL_BUFFER (¤t_mover, ascii, off, 1); + GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); + + switch (current_mover) { + case 'D': + case 'M': + MOVER_MOVE (mover, nleft, 1); + + /* target gfid */ + PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN, + conv_noop, parse_err); + FILL_AND_MOVE(ptr, ascii, off, + mover, nleft, UUID_CANONICAL_FORM_LEN); + break; + + case 'E': + MOVER_MOVE (mover, nleft, 1); + + /* target gfid */ + PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN, + conv_noop, parse_err); + FILL_AND_MOVE (ptr, ascii, off, + mover, nleft, UUID_CANONICAL_FORM_LEN); + FILL_AND_MOVE (" ", ascii, off, + mover, nleft, 1); + + /* fop */ + len = strlen (mover); + VERIFY_SEPARATOR (mover, len, parse_err); + + fop = atoi (mover); + if ( (fopname = gf_fop_list[fop]) == NULL) { + parse_err = 1; + break; + } + + MOVER_MOVE (mover, nleft, len); + + len = strlen (fopname); + GF_CHANGELOG_FILL_BUFFER (fopname, ascii, off, len); + + /* pargfid + bname */ + ng = nr_gfids[fop]; + while (ng-- > 0) { + MOVER_MOVE (mover, nleft, 1); + len = strlen (mover); + GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1); + + PARSE_GFID (mover, ptr, len, + conv_noop, parse_err); + eptr = calloc (3, strlen (ptr)); + if (!eptr) { + parse_err = 1; + break; + } + + gf_rfc3986_encode ((unsigned char *) ptr, + eptr, gfc->rfc3986); + FILL_AND_MOVE (eptr, ascii, off, + mover, nleft, len); + free (eptr); + } + + break; + default: + parse_err = 1; + } + + if (parse_err) + break; + + GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1); + + if (gf_changelog_write (to_fd, ascii, off) != off) { + gf_log (this->name, GF_LOG_ERROR, + "processing ascii changelog failed due to " + " wrror in writing change (reason: %s)", + strerror (errno)); + break; + } + + MOVER_MOVE (mover, nleft, 1); + + } + + if ( (nleft == 0) && (!parse_err)) + ret = 0; + + if (munmap (start, stbuf->st_size)) + gf_log (this->name, GF_LOG_ERROR, + "munmap() error (reason: %s)", strerror (errno)); + + out: + return ret; +} + +#define COPY_BUFSIZE 8192 +static int +gf_changelog_copy (xlator_t *this, int from_fd, int to_fd) +{ + ssize_t size = 0; + char buffer[COPY_BUFSIZE+1] = {0,}; + + while (1) { + size = read (from_fd, buffer, COPY_BUFSIZE); + if (size <= 0) + break; + + if (gf_changelog_write (to_fd, + buffer, size) != size) { + gf_log (this->name, GF_LOG_ERROR, + "error processing ascii changlog"); + size = -1; + break; + } + } + + return (size < 0 ? -1 : 0); +} + +static int +gf_changelog_decode (xlator_t *this, gf_changelog_t *gfc, int from_fd, + int to_fd, struct stat *stbuf, int *zerob) +{ + int ret = -1; + int encoding = -1; + size_t elen = 0; + char buffer[1024] = {0,}; + + CHANGELOG_GET_ENCODING (from_fd, buffer, 1024, encoding, elen); + if (encoding == -1) /* unknown encoding */ + goto out; + + if (!CHANGELOG_VALID_ENCODING (encoding)) + goto out; + + if (elen == stbuf->st_size) { + *zerob = 1; + goto out; + } + + /** + * start processing after the header + */ + lseek (from_fd, elen, SEEK_SET); + + switch (encoding) { + case CHANGELOG_ENCODE_BINARY: + /** + * this ideally should have been a part of changelog-encoders.c + * (ie. part of the changelog translator). + */ + ret = gf_changelog_parse_binary (this, gfc, from_fd, + to_fd, elen, stbuf); + break; + + case CHANGELOG_ENCODE_ASCII: + ret = gf_changelog_parse_ascii (this, gfc, from_fd, + to_fd, elen, stbuf); + break; + default: + ret = gf_changelog_copy (this, from_fd, to_fd); + } + + out: + return ret; +} + +static int +gf_changelog_consume (xlator_t *this, gf_changelog_t *gfc, char *from_path) +{ + int ret = -1; + int fd1 = 0; + int fd2 = 0; + int zerob = 0; + struct stat stbuf = {0,}; + char dest[PATH_MAX] = {0,}; + char to_path[PATH_MAX] = {0,}; + + ret = stat (from_path, &stbuf); + if (ret || !S_ISREG(stbuf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "stat failed on changelog file: %s", from_path); + goto out; + } + + fd1 = open (from_path, O_RDONLY); + if (fd1 < 0) { + gf_log (this->name, GF_LOG_ERROR, + "cannot open changelog file: %s (reason: %s)", + from_path, strerror (errno)); + goto out; + } + + (void) snprintf (to_path, PATH_MAX, "%s%s", + gfc->gfc_current_dir, basename (from_path)); + (void) snprintf (dest, PATH_MAX, "%s%s", + gfc->gfc_processing_dir, basename (from_path)); + + fd2 = open (to_path, O_CREAT | O_TRUNC | O_RDWR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd2 < 0) { + gf_log (this->name, GF_LOG_ERROR, + "cannot create ascii changelog file %s (reason %s)", + to_path, strerror (errno)); + goto close_fd; + } else { + ret = gf_changelog_decode (this, gfc, fd1, + fd2, &stbuf, &zerob); + + close (fd2); + + if (!ret) { + /* move it to processing on a successfull + decode */ + ret = rename (to_path, dest); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "error moving %s to processing dir" + " (reason: %s)", to_path, + strerror (errno)); + } + + /* remove it from .current if it's an empty file */ + if (zerob) { + ret = unlink (to_path); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "could not unlink %s (reason: %s", + to_path, strerror (errno)); + } + } + + close_fd: + close (fd1); + + out: + return ret; +} + +static char * +gf_changelog_ext_change (xlator_t *this, + gf_changelog_t *gfc, char *path, size_t readlen) +{ + int alo = 0; + int ret = 0; + size_t len = 0; + char *buf = NULL; + + buf = path; + while (len < readlen) { + if (*buf == '\0') { + alo = 1; + gf_log (this->name, GF_LOG_DEBUG, + "processing changelog: %s", path); + ret = gf_changelog_consume (this, gfc, path); + } + + if (ret) + break; + + len++; buf++; + if (alo) { + alo = 0; + path = buf; + } + } + + return (ret) ? NULL : path; +} + +void * +gf_changelog_process (void *data) +{ + ssize_t len = 0; + ssize_t offlen = 0; + xlator_t *this = NULL; + char *sbuf = NULL; + gf_changelog_t *gfc = NULL; + char from_path[PATH_MAX] = {0,}; + + gfc = (gf_changelog_t *) data; + this = gfc->this; + + pthread_detach (pthread_self()); + + for (;;) { + len = gf_changelog_read_path (gfc->gfc_sockfd, + from_path + offlen, + PATH_MAX - offlen); + if (len < 0) + continue; /* ignore it for now */ + + if (len == 0) { /* close() from the changelog translator */ + gf_log (this->name, GF_LOG_INFO, "close from changelog" + " notification translator."); + + if (gfc->gfc_connretries != 1) { + if (!gf_changelog_notification_init(this, gfc)) + continue; + } + + byebye = 1; + break; + } + + len += offlen; + sbuf = gf_changelog_ext_change (this, gfc, from_path, len); + if (!sbuf) { + gf_log (this->name, GF_LOG_ERROR, + "could not extract changelog filename"); + continue; + } + + offlen = 0; + if (sbuf != (from_path + len)) { + offlen = from_path + len - sbuf; + memmove (from_path, sbuf, offlen); + } + } + + gf_log (this->name, GF_LOG_DEBUG, + "byebye (%d) from processing thread...", byebye); + return NULL; +} diff --git a/xlators/features/changelog/lib/src/gf-changelog.c b/xlators/features/changelog/lib/src/gf-changelog.c new file mode 100644 index 000000000..ca8e373e7 --- /dev/null +++ b/xlators/features/changelog/lib/src/gf-changelog.c @@ -0,0 +1,515 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <errno.h> +#include <dirent.h> +#include <stddef.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <string.h> + +#include "globals.h" +#include "glusterfs.h" +#include "logging.h" + +#include "gf-changelog-helpers.h" + +/* from the changelog translator */ +#include "changelog-misc.h" +#include "changelog-mem-types.h" + +int byebye = 0; + +static void +gf_changelog_cleanup (gf_changelog_t *gfc) +{ + /* socket */ + if (gfc->gfc_sockfd != -1) + close (gfc->gfc_sockfd); + /* tracker fd */ + if (gfc->gfc_fd != -1) + close (gfc->gfc_fd); + /* processing dir */ + if (gfc->gfc_dir) + closedir (gfc->gfc_dir); + + if (gfc->gfc_working_dir) + free (gfc->gfc_working_dir); /* allocated by realpath */ +} + +void +__attribute__ ((constructor)) gf_changelog_ctor (void) +{ + glusterfs_ctx_t *ctx = NULL; + + ctx = glusterfs_ctx_new (); + if (!ctx) + return; + + if (glusterfs_globals_init (ctx)) { + free (ctx); + ctx = NULL; + return; + } + + THIS->ctx = ctx; +} + +void +__attribute__ ((destructor)) gf_changelog_dtor (void) +{ + xlator_t *this = NULL; + glusterfs_ctx_t *ctx = NULL; + gf_changelog_t *gfc = NULL; + + this = THIS; + if (!this) + return; + + ctx = this->ctx; + gfc = this->private; + + if (gfc) { + gf_changelog_cleanup (gfc); + GF_FREE (gfc); + } + + if (ctx) { + pthread_mutex_destroy (&ctx->lock); + free (ctx); + ctx = NULL; + } +} + + +static int +gf_changelog_open_dirs (gf_changelog_t *gfc) +{ + int ret = -1; + DIR *dir = NULL; + int tracker_fd = 0; + char tracker_path[PATH_MAX] = {0,}; + + (void) snprintf (gfc->gfc_current_dir, PATH_MAX, + "%s/"GF_CHANGELOG_CURRENT_DIR"/", + gfc->gfc_working_dir); + ret = mkdir_p (gfc->gfc_current_dir, 0600, _gf_false); + if (ret) + goto out; + + (void) snprintf (gfc->gfc_processed_dir, PATH_MAX, + "%s/"GF_CHANGELOG_PROCESSED_DIR"/", + gfc->gfc_working_dir); + ret = mkdir_p (gfc->gfc_processed_dir, 0600, _gf_false); + if (ret) + goto out; + + (void) snprintf (gfc->gfc_processing_dir, PATH_MAX, + "%s/"GF_CHANGELOG_PROCESSING_DIR"/", + gfc->gfc_working_dir); + ret = mkdir_p (gfc->gfc_processing_dir, 0600, _gf_false); + if (ret) + goto out; + + dir = opendir (gfc->gfc_processing_dir); + if (!dir) { + gf_log ("", GF_LOG_ERROR, + "opendir() error [reason: %s]", strerror (errno)); + goto out; + } + + gfc->gfc_dir = dir; + + (void) snprintf (tracker_path, PATH_MAX, + "%s/"GF_CHANGELOG_TRACKER, gfc->gfc_working_dir); + + tracker_fd = open (tracker_path, O_CREAT | O_APPEND | O_RDWR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (tracker_fd < 0) { + closedir (gfc->gfc_dir); + ret = -1; + goto out; + } + + gfc->gfc_fd = tracker_fd; + ret = 0; + out: + return ret; +} + +int +gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc) +{ + int ret = 0; + int len = 0; + int tries = 0; + int sockfd = 0; + struct sockaddr_un remote; + + this = gfc->this; + + if (gfc->gfc_sockfd != -1) { + gf_log (this->name, GF_LOG_INFO, + "Reconnecting..."); + close (gfc->gfc_sockfd); + } + + sockfd = socket (AF_UNIX, SOCK_STREAM, 0); + if (sockfd < 0) { + ret = -1; + goto out; + } + + CHANGELOG_MAKE_SOCKET_PATH (gfc->gfc_brickpath, + gfc->gfc_sockpath, PATH_MAX); + gf_log (this->name, GF_LOG_INFO, + "connecting to changelog socket: %s (brick: %s)", + gfc->gfc_sockpath, gfc->gfc_brickpath); + + remote.sun_family = AF_UNIX; + strcpy (remote.sun_path, gfc->gfc_sockpath); + + len = strlen (remote.sun_path) + sizeof (remote.sun_family); + + while (tries < gfc->gfc_connretries) { + gf_log (this->name, GF_LOG_WARNING, + "connection attempt %d/%d...", + tries + 1, gfc->gfc_connretries); + + /* initiate a connect */ + if (connect (sockfd, (struct sockaddr *) &remote, len) == 0) { + gfc->gfc_sockfd = sockfd; + break; + } + + tries++; + sleep (2); + } + + if (tries == gfc->gfc_connretries) { + gf_log (this->name, GF_LOG_ERROR, + "could not connect to changelog socket!" + " bailing out..."); + ret = -1; + } else + gf_log (this->name, GF_LOG_INFO, + "connection successful"); + + out: + return ret; +} + +int +gf_changelog_done (char *file) +{ + int ret = -1; + char *buffer = NULL; + xlator_t *this = NULL; + gf_changelog_t *gfc = NULL; + char to_path[PATH_MAX] = {0,}; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + gfc = (gf_changelog_t *) this->private; + if (!gfc) + goto out; + + if (!file || !strlen (file)) + goto out; + + /* make sure 'file' is inside ->gfc_working_dir */ + buffer = realpath (file, NULL); + if (!buffer) + goto out; + + if (strncmp (gfc->gfc_working_dir, + buffer, strlen (gfc->gfc_working_dir))) + goto out; + + (void) snprintf (to_path, PATH_MAX, "%s%s", + gfc->gfc_processed_dir, basename (buffer)); + gf_log (this->name, GF_LOG_DEBUG, + "moving %s to processed directory", file); + ret = rename (buffer, to_path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "cannot move %s to %s (reason: %s)", + file, to_path, strerror (errno)); + goto out; + } + + ret = 0; + + out: + if (buffer) + free (buffer); /* allocated by realpath() */ + return ret; +} + +/** + * @API + * for a set of changelogs, start from the begining + */ +int +gf_changelog_start_fresh () +{ + xlator_t *this = NULL; + gf_changelog_t *gfc = NULL; + + this = THIS; + if (!this) + goto out; + + errno = EINVAL; + + gfc = (gf_changelog_t *) this->private; + if (!gfc) + goto out; + + if (gf_ftruncate (gfc->gfc_fd, 0)) + goto out; + + return 0; + + out: + return -1; +} + +/** + * @API + * return the next changelog file entry. zero means all chanelogs + * consumed. + */ +ssize_t +gf_changelog_next_change (char *bufptr, size_t maxlen) +{ + ssize_t size = 0; + int tracker_fd = 0; + xlator_t *this = NULL; + gf_changelog_t *gfc = NULL; + char buffer[PATH_MAX] = {0,}; + + errno = EINVAL; + + this = THIS; + if (!this) + goto out; + + gfc = (gf_changelog_t *) this->private; + if (!gfc) + goto out; + + tracker_fd = gfc->gfc_fd; + + size = gf_readline (tracker_fd, buffer, maxlen); + if (size < 0) + goto out; + if (size == 0) + return 0; + + memcpy (bufptr, buffer, size - 1); + *(buffer + size) = '\0'; + + return size; + + out: + return -1; +} + +/** + * @API + * gf_changelog_scan() - scan and generate a list of change entries + * + * calling this api multiple times (without calling gf_changlog_done()) + * would result new changelogs(s) being refreshed in the tracker file. + * This call also acts as a cancellation point for the consumer. + */ +ssize_t +gf_changelog_scan () +{ + int ret = 0; + int tracker_fd = 0; + size_t len = 0; + size_t off = 0; + xlator_t *this = NULL; + size_t nr_entries = 0; + gf_changelog_t *gfc = NULL; + struct dirent *entryp = NULL; + struct dirent *result = NULL; + char buffer[PATH_MAX] = {0,}; + + this = THIS; + if (!this) + goto out; + + gfc = (gf_changelog_t *) this->private; + if (!gfc) + goto out; + + /** + * do we need to protect 'byebye' with locks? worst, the + * consumer would get notified during next scan(). + */ + if (byebye) { + errno = ECONNREFUSED; + goto out; + } + + errno = EINVAL; + + tracker_fd = gfc->gfc_fd; + + if (gf_ftruncate (tracker_fd, 0)) + goto out; + + len = offsetof(struct dirent, d_name) + + pathconf(gfc->gfc_processing_dir, _PC_NAME_MAX) + 1; + entryp = GF_CALLOC (1, len, + gf_changelog_mt_libgfchangelog_dirent_t); + if (!entryp) + goto out; + + rewinddir (gfc->gfc_dir); + while (1) { + ret = readdir_r (gfc->gfc_dir, entryp, &result); + if (ret || !result) + break; + + if ( !strcmp (basename (entryp->d_name), ".") + || !strcmp (basename (entryp->d_name), "..") ) + continue; + + nr_entries++; + + GF_CHANGELOG_FILL_BUFFER (gfc->gfc_processing_dir, + buffer, off, + strlen (gfc->gfc_processing_dir)); + GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer, + off, strlen (entryp->d_name)); + GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1); + + if (gf_changelog_write (tracker_fd, buffer, off) != off) { + gf_log (this->name, GF_LOG_ERROR, + "error writing changelog filename" + " to tracker file"); + break; + } + off = 0; + } + + GF_FREE (entryp); + + if (!result) { + if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1) + return nr_entries; + } + out: + return -1; +} + +/** + * @API + * gf_changelog_register() - register a client for updates. + */ +int +gf_changelog_register (char *brick_path, char *scratch_dir, + char *log_file, int log_level, int max_reconnects) +{ + int i = 0; + int ret = -1; + int errn = 0; + xlator_t *this = NULL; + gf_changelog_t *gfc = NULL; + + this = THIS; + if (!this->ctx) + goto out; + + errno = ENOMEM; + + gfc = GF_CALLOC (1, sizeof (*gfc), + gf_changelog_mt_libgfchangelog_t); + if (!gfc) + goto out; + + gfc->this = this; + + gfc->gfc_dir = NULL; + gfc->gfc_fd = gfc->gfc_sockfd = -1; + + gfc->gfc_working_dir = realpath (scratch_dir, NULL); + if (!gfc->gfc_working_dir) { + errn = errno; + goto cleanup; + } + + ret = gf_changelog_open_dirs (gfc); + if (ret) { + errn = errno; + gf_log (this->name, GF_LOG_ERROR, + "could not create entries in scratch dir"); + goto cleanup; + } + + /* passing ident as NULL means to use default ident for syslog */ + if (gf_log_init (this->ctx, log_file, NULL)) + goto cleanup; + + gf_log_set_loglevel ((log_level == -1) ? GF_LOG_INFO : + log_level); + + gfc->gfc_connretries = (max_reconnects <= 0) ? 1 : max_reconnects; + (void) strncpy (gfc->gfc_brickpath, brick_path, PATH_MAX); + + ret = gf_changelog_notification_init (this, gfc); + if (ret) { + errn = errno; + goto cleanup; + } + + ret = gf_thread_create (&gfc->gfc_changelog_processor, + NULL, gf_changelog_process, gfc); + if (ret) { + errn = errno; + gf_log (this->name, GF_LOG_ERROR, + "error creating changelog processor thread" + " new changes won't be recorded!!!"); + goto cleanup; + } + + for (; i < 256; i++) { + gfc->rfc3986[i] = + (isalnum(i) || i == '~' || + i == '-' || i == '.' || i == '_') ? i : 0; + } + + ret = 0; + this->private = gfc; + + goto out; + + cleanup: + gf_changelog_cleanup (gfc); + GF_FREE (gfc); + this->private = NULL; + errno = errn; + + out: + return ret; +} diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am new file mode 100644 index 000000000..e85031ad4 --- /dev/null +++ b/xlators/features/changelog/src/Makefile.am @@ -0,0 +1,19 @@ +xlator_LTLIBRARIES = changelog.la + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \ + changelog-misc.h changelog-encoders.h changelog-notifier.h + +changelog_la_LDFLAGS = -module -avoidversion + +changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \ + changelog-encoders.c changelog-notifier.c +changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 \ + -D_GNU_SOURCE -D$(GF_HOST_OS) -shared -nostartfiles -DDATADIR=\"$(localstatedir)\" + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/changelog/src/changelog-encoders.c b/xlators/features/changelog/src/changelog-encoders.c new file mode 100644 index 000000000..553eec85c --- /dev/null +++ b/xlators/features/changelog/src/changelog-encoders.c @@ -0,0 +1,176 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "changelog-encoders.h" + +size_t +entry_fn (void *data, char *buffer, gf_boolean_t encode) +{ + char *tmpbuf = NULL; + size_t bufsz = 0; + struct changelog_entry_fields *ce = NULL; + + ce = (struct changelog_entry_fields *) data; + + if (encode) { + tmpbuf = uuid_utoa (ce->cef_uuid); + CHANGELOG_FILL_BUFFER (buffer, bufsz, tmpbuf, strlen (tmpbuf)); + } else { + CHANGELOG_FILL_BUFFER (buffer, bufsz, + ce->cef_uuid, sizeof (uuid_t)); + } + + CHANGELOG_FILL_BUFFER (buffer, bufsz, "/", 1); + CHANGELOG_FILL_BUFFER (buffer, bufsz, + ce->cef_bname, strlen (ce->cef_bname)); + return bufsz; +} + +size_t +fop_fn (void *data, char *buffer, gf_boolean_t encode) +{ + char buf[10] = {0,}; + size_t bufsz = 0; + glusterfs_fop_t fop = 0; + + fop = *(glusterfs_fop_t *) data; + + if (encode) { + (void) snprintf (buf, sizeof (buf), "%d", fop); + CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf)); + } else + CHANGELOG_FILL_BUFFER (buffer, bufsz, &fop, sizeof (fop)); + + return bufsz; +} + +void +entry_free_fn (void *data) +{ + changelog_opt_t *co = data; + + if (!co) + return; + + GF_FREE (co->co_entry.cef_bname); +} + +/** + * try to write all data in one shot + */ + +static inline void +changelog_encode_write_xtra (changelog_log_data_t *cld, + char *buffer, size_t *off, gf_boolean_t encode) +{ + int i = 0; + size_t offset = 0; + void *data = NULL; + changelog_opt_t *co = NULL; + + offset = *off; + + co = (changelog_opt_t *) cld->cld_ptr; + + for (; i < cld->cld_xtra_records; i++, co++) { + CHANGELOG_FILL_BUFFER (buffer, offset, "\0", 1); + + switch (co->co_type) { + case CHANGELOG_OPT_REC_FOP: + data = &co->co_fop; + break; + case CHANGELOG_OPT_REC_ENTRY: + data = &co->co_entry; + break; + } + + if (co->co_convert) + offset += co->co_convert (data, + buffer + offset, encode); + else /* no coversion: write it out as it is */ + CHANGELOG_FILL_BUFFER (buffer, offset, + data, co->co_len); + } + + *off = offset; +} + +int +changelog_encode_ascii (xlator_t *this, changelog_log_data_t *cld) +{ + size_t off = 0; + size_t gfid_len = 0; + char *gfid_str = NULL; + char *buffer = NULL; + changelog_priv_t *priv = NULL; + + priv = this->private; + + gfid_str = uuid_utoa (cld->cld_gfid); + gfid_len = strlen (gfid_str); + + /* extra bytes for decorations */ + buffer = alloca (gfid_len + cld->cld_ptr_len + 10); + CHANGELOG_STORE_ASCII (priv, buffer, + off, gfid_str, gfid_len, cld); + + if (cld->cld_xtra_records) + changelog_encode_write_xtra (cld, buffer, &off, _gf_true); + + CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + + return changelog_write_change (priv, buffer, off); +} + +int +changelog_encode_binary (xlator_t *this, changelog_log_data_t *cld) +{ + size_t off = 0; + char *buffer = NULL; + changelog_priv_t *priv = NULL; + + priv = this->private; + + /* extra bytes for decorations */ + buffer = alloca (sizeof (uuid_t) + cld->cld_ptr_len + 10); + CHANGELOG_STORE_BINARY (priv, buffer, off, cld->cld_gfid, cld); + + if (cld->cld_xtra_records) + changelog_encode_write_xtra (cld, buffer, &off, _gf_false); + + CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); + + return changelog_write_change (priv, buffer, off); +} + +static struct changelog_encoder +cb_encoder[] = { + [CHANGELOG_ENCODE_BINARY] = + { + .encoder = CHANGELOG_ENCODE_BINARY, + .encode = changelog_encode_binary, + }, + [CHANGELOG_ENCODE_ASCII] = + { + .encoder = CHANGELOG_ENCODE_ASCII, + .encode = changelog_encode_ascii, + }, +}; + +void +changelog_encode_change( changelog_priv_t * priv) +{ + priv->ce = &cb_encoder[priv->encode_mode]; +} diff --git a/xlators/features/changelog/src/changelog-encoders.h b/xlators/features/changelog/src/changelog-encoders.h new file mode 100644 index 000000000..a3efbee05 --- /dev/null +++ b/xlators/features/changelog/src/changelog-encoders.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_ENCODERS_H +#define _CHANGELOG_ENCODERS_H + +#include "xlator.h" +#include "defaults.h" + +#include "changelog-helpers.h" + +#define CHANGELOG_STORE_ASCII(priv, buf, off, gfid, gfid_len, cld) do { \ + CHANGELOG_FILL_BUFFER (buffer, off, \ + priv->maps[cld->cld_type], 1); \ + CHANGELOG_FILL_BUFFER (buffer, \ + off, gfid, gfid_len); \ + } while (0) + +#define CHANGELOG_STORE_BINARY(priv, buf, off, gfid, cld) do { \ + CHANGELOG_FILL_BUFFER (buffer, off, \ + priv->maps[cld->cld_type], 1); \ + CHANGELOG_FILL_BUFFER (buffer, \ + off, gfid, sizeof (uuid_t)); \ + } while (0) + +size_t +entry_fn (void *data, char *buffer, gf_boolean_t encode); +size_t +fop_fn (void *data, char *buffer, gf_boolean_t encode); +void +entry_free_fn (void *data); +int +changelog_encode_binary (xlator_t *, changelog_log_data_t *); +int +changelog_encode_ascii (xlator_t *, changelog_log_data_t *); +void +changelog_encode_change(changelog_priv_t *); + +#endif /* _CHANGELOG_ENCODERS_H */ diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c new file mode 100644 index 000000000..7ab0091b5 --- /dev/null +++ b/xlators/features/changelog/src/changelog-helpers.c @@ -0,0 +1,693 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "defaults.h" +#include "logging.h" +#include "iobuf.h" + +#include "changelog-helpers.h" +#include "changelog-mem-types.h" + +#include "changelog-encoders.h" +#include <pthread.h> + +void +changelog_thread_cleanup (xlator_t *this, pthread_t thr_id) +{ + int ret = 0; + void *retval = NULL; + + /* send a cancel request to the thread */ + ret = pthread_cancel (thr_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not cancel thread (reason: %s)", + strerror (errno)); + goto out; + } + + ret = pthread_join (thr_id, &retval); + if (ret || (retval != PTHREAD_CANCELED)) { + gf_log (this->name, GF_LOG_ERROR, + "cancel request not adhered as expected" + " (reason: %s)", strerror (errno)); + } + + out: + return; +} + +inline void * +changelog_get_usable_buffer (changelog_local_t *local) +{ + changelog_log_data_t *cld = NULL; + + cld = &local->cld; + if (!cld->cld_iobuf) + return NULL; + + return cld->cld_iobuf->ptr; +} + +inline void +changelog_set_usable_record_and_length (changelog_local_t *local, + size_t len, int xr) +{ + changelog_log_data_t *cld = NULL; + + cld = &local->cld; + + cld->cld_ptr_len = len; + cld->cld_xtra_records = xr; +} + +void +changelog_local_cleanup (xlator_t *xl, changelog_local_t *local) +{ + int i = 0; + changelog_opt_t *co = NULL; + changelog_log_data_t *cld = NULL; + + if (!local) + return; + + cld = &local->cld; + + /* cleanup dynamic allocation for extra records */ + if (cld->cld_xtra_records) { + co = (changelog_opt_t *) cld->cld_ptr; + for (; i < cld->cld_xtra_records; i++, co++) + if (co->co_free) + co->co_free (co); + } + + CHANGELOG_IOBUF_UNREF (cld->cld_iobuf); + + if (local->inode) + inode_unref (local->inode); + + mem_put (local); +} + +inline int +changelog_write (int fd, char *buffer, size_t len) +{ + ssize_t size = 0; + size_t writen = 0; + + while (writen < len) { + size = write (fd, + buffer + writen, len - writen); + if (size <= 0) + break; + + writen += size; + } + + return (writen != len); +} + +static int +changelog_rollover_changelog (xlator_t *this, + changelog_priv_t *priv, unsigned long ts) +{ + int ret = -1; + int notify = 0; + char *bname = NULL; + char ofile[PATH_MAX] = {0,}; + char nfile[PATH_MAX] = {0,}; + + if (priv->changelog_fd != -1) { + close (priv->changelog_fd); + priv->changelog_fd = -1; + } + + (void) snprintf (ofile, PATH_MAX, + "%s/"CHANGELOG_FILE_NAME, priv->changelog_dir); + (void) snprintf (nfile, PATH_MAX, + "%s/"CHANGELOG_FILE_NAME".%lu", + priv->changelog_dir, ts); + + ret = rename (ofile, nfile); + if (!ret) + notify = 1; + + if (ret && (errno == ENOENT)) { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "error renaming %s -> %s (reason %s)", + ofile, nfile, strerror (errno)); + } + + if (notify) { + bname = basename (nfile); + gf_log (this->name, GF_LOG_DEBUG, "notifying: %s", bname); + ret = changelog_write (priv->wfd, bname, strlen (bname) + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to send file name to notify thread" + " (reason: %s)", strerror (errno)); + } + } + + return ret; +} + +int +changelog_open (xlator_t *this, + changelog_priv_t *priv) +{ + int fd = 0; + int ret = -1; + int flags = 0; + char buffer[1024] = {0,}; + char changelog_path[PATH_MAX] = {0,}; + + (void) snprintf (changelog_path, PATH_MAX, + "%s/"CHANGELOG_FILE_NAME, + priv->changelog_dir); + + flags |= (O_CREAT | O_RDWR); + if (priv->fsync_interval == 0) + flags |= O_SYNC; + + fd = open (changelog_path, flags, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd < 0) { + gf_log (this->name, GF_LOG_ERROR, + "unable to open/create changelog file %s" + " (reason: %s). change-logging will be" + " inactive", changelog_path, strerror (errno)); + goto out; + } + + priv->changelog_fd = fd; + + (void) snprintf (buffer, 1024, CHANGELOG_HEADER, + CHANGELOG_VERSION_MAJOR, + CHANGELOG_VERSION_MINOR, + priv->ce->encoder); + ret = changelog_write_change (priv, buffer, strlen (buffer)); + if (ret) { + close (priv->changelog_fd); + priv->changelog_fd = -1; + goto out; + } + + ret = 0; + + out: + return ret; +} + +int +changelog_start_next_change (xlator_t *this, + changelog_priv_t *priv, + unsigned long ts, gf_boolean_t finale) +{ + int ret = -1; + + ret = changelog_rollover_changelog (this, priv, ts); + + if (!ret && !finale) + ret = changelog_open (this, priv); + + return ret; +} + +/** + * return the length of entry + */ +inline size_t +changelog_entry_length () +{ + return sizeof (changelog_log_data_t); +} + +int +changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last) +{ + struct timeval tv = {0,}; + + cld->cld_type = CHANGELOG_TYPE_ROLLOVER; + + if (gettimeofday (&tv, NULL)) + return -1; + + cld->cld_roll_time = (unsigned long) tv.tv_sec; + cld->cld_finale = is_last; + return 0; +} + +int +changelog_write_change (changelog_priv_t *priv, char *buffer, size_t len) +{ + return changelog_write (priv->changelog_fd, buffer, len); +} + +inline int +changelog_handle_change (xlator_t *this, + changelog_priv_t *priv, changelog_log_data_t *cld) +{ + int ret = 0; + + if (CHANGELOG_TYPE_IS_ROLLOVER (cld->cld_type)) { + changelog_encode_change(priv); + ret = changelog_start_next_change (this, priv, + cld->cld_roll_time, + cld->cld_finale); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Problem rolling over changelog(s)"); + goto out; + } + + /** + * case when there is reconfigure done (disabling changelog) and there + * are still fops that have updates in prgress. + */ + if (priv->changelog_fd == -1) + return 0; + + if (CHANGELOG_TYPE_IS_FSYNC (cld->cld_type)) { + ret = fsync (priv->changelog_fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "fsync failed (reason: %s)", + strerror (errno)); + } + goto out; + } + + ret = priv->ce->encode (this, cld); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "error writing changelog to disk"); + } + + out: + return ret; +} + +changelog_local_t * +changelog_local_init (xlator_t *this, inode_t *inode, + uuid_t gfid, int xtra_records, + gf_boolean_t update_flag) +{ + changelog_local_t *local = NULL; + struct iobuf *iobuf = NULL; + + /** + * We relax the presence of inode if @update_flag is true. + * The caller (implmentation of the fop) needs to be careful to + * not blindly use local->inode. + */ + if (!update_flag && !inode) { + gf_log_callingfn (this->name, GF_LOG_WARNING, + "inode needed for version checking !!!"); + goto out; + } + + if (xtra_records) { + iobuf = iobuf_get2 (this->ctx->iobuf_pool, + xtra_records * CHANGELOG_OPT_RECORD_LEN); + if (!iobuf) + goto out; + } + + local = mem_get0 (this->local_pool); + if (!local) { + CHANGELOG_IOBUF_UNREF (iobuf); + goto out; + } + + local->update_no_check = update_flag; + + uuid_copy (local->cld.cld_gfid, gfid); + + local->cld.cld_iobuf = iobuf; + local->cld.cld_xtra_records = 0; /* set by the caller */ + + if (inode) + local->inode = inode_ref (inode); + + out: + return local; +} + +int +changelog_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_addr = 0; + changelog_inode_ctx_t *ctx = NULL; + + inode_ctx_del (inode, this, &ctx_addr); + if (!ctx_addr) + return 0; + + ctx = (changelog_inode_ctx_t *) (long) ctx_addr; + GF_FREE (ctx); + + return 0; +} + +int +changelog_inject_single_event (xlator_t *this, + changelog_priv_t *priv, + changelog_log_data_t *cld) +{ + return priv->cd.dispatchfn (this, priv, priv->cd.cd_data, cld, NULL); +} + +/** + * TODO: these threads have many thing in common (wake up after + * a certain time etc..). move them into separate routine. + */ +void * +changelog_rollover (void *data) +{ + int ret = 0; + xlator_t *this = NULL; + struct timeval tv = {0,}; + changelog_log_data_t cld = {0,}; + changelog_time_slice_t *slice = NULL; + changelog_priv_t *priv = data; + + this = priv->cr.this; + slice = &priv->slice; + + while (1) { + tv.tv_sec = priv->rollover_time; + tv.tv_usec = 0; + + ret = select (0, NULL, NULL, NULL, &tv); + if (ret) + continue; + + ret = changelog_fill_rollover_data (&cld, _gf_false); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to fill rollover data"); + continue; + } + + LOCK (&priv->lock); + { + ret = changelog_inject_single_event (this, priv, &cld); + if (!ret) + SLICE_VERSION_UPDATE (slice); + } + UNLOCK (&priv->lock); + } + + return NULL; +} + +void * +changelog_fsync_thread (void *data) +{ + int ret = 0; + xlator_t *this = NULL; + struct timeval tv = {0,}; + changelog_log_data_t cld = {0,}; + changelog_priv_t *priv = data; + + this = priv->cf.this; + cld.cld_type = CHANGELOG_TYPE_FSYNC; + + while (1) { + tv.tv_sec = priv->fsync_interval; + tv.tv_usec = 0; + + ret = select (0, NULL, NULL, NULL, &tv); + if (ret) + continue; + + ret = changelog_inject_single_event (this, priv, &cld); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "failed to inject fsync event"); + } + + return NULL; +} + +/* macros for inode/changelog version checks */ + +#define INODE_VERSION_UPDATE(priv, inode, iver, slice, type) do { \ + LOCK (&inode->lock); \ + { \ + LOCK (&priv->lock); \ + { \ + *iver = slice->changelog_version[type]; \ + } \ + UNLOCK (&priv->lock); \ + } \ + UNLOCK (&inode->lock); \ + } while (0) + +#define INODE_VERSION_EQUALS_SLICE(priv, ver, slice, type, upd) do { \ + LOCK (&priv->lock); \ + { \ + upd = (ver == slice->changelog_version[type]) \ + ? _gf_false : _gf_true; \ + } \ + UNLOCK (&priv->lock); \ + } while (0) + +static int +__changelog_inode_ctx_set (xlator_t *this, + inode_t *inode, changelog_inode_ctx_t *ctx) +{ + uint64_t ctx_addr = (uint64_t) ctx; + return __inode_ctx_set (inode, this, &ctx_addr); +} + +/** + * one shot routine to get the address and the value of a inode version + * for a particular type. + */ +static changelog_inode_ctx_t * +__changelog_inode_ctx_get (xlator_t *this, + inode_t *inode, unsigned long **iver, + unsigned long *version, changelog_log_type type) +{ + int ret = 0; + uint64_t ctx_addr = 0; + changelog_inode_ctx_t *ctx = NULL; + + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (changelog_inode_ctx_t *) (long)ctx_addr; + goto out; + } + + ctx = GF_CALLOC (1, sizeof (*ctx), gf_changelog_mt_inode_ctx_t); + if (!ctx) + goto out; + + ret = __changelog_inode_ctx_set (this, inode, ctx); + if (ret) { + GF_FREE (ctx); + ctx = NULL; + } + + out: + if (ctx && iver && version) { + *iver = CHANGELOG_INODE_VERSION_TYPE (ctx, type); + *version = **iver; + } + + return ctx; +} + +static changelog_inode_ctx_t * +changelog_inode_ctx_get (xlator_t *this, + inode_t *inode, unsigned long **iver, + unsigned long *version, changelog_log_type type) +{ + changelog_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __changelog_inode_ctx_get (this, + inode, iver, version, type); + } + UNLOCK (&inode->lock); + + return ctx; +} + +/** + * This is the main update routine. Locking has been made granular so as to + * maximize parallelism of fops - I'll try to explain it below using execution + * timelines. + * + * Basically, the contention is between multiple execution threads of this + * routine and the roll-over thread. So, instead of having a big lock, we hold + * granular locks: inode->lock and priv->lock. Now I'll explain what happens + * when there is an update and a roll-over at just about the same time. + * NOTE: + * - the dispatcher itself synchronizes updates via it's own lock + * - the slice version in incremented by the roll-over thread + * + * Case 1: When the rollover thread wins before the inode version can be + * compared with the slice version. + * + * [updater] | [rollover] + * | + * | <SLICE: 1, 1, 1> + * <changelog_update> | + * <changelog_inode_ctx_get> | + * <CTX: 1, 1, 1> | + * | <dispatch-rollover-event> + * | LOCK (&priv->lock) + * | <SLICE_VERSION_UPDATE> + * | <SLICE: 2, 2, 2> + * | UNLOCK (&priv->lock) + * | + * LOCK (&priv->lock) | + * <INODE_VERSION_EQUALS_SLICE> | + * I: 1 <-> S: 2 | + * update: true | + * UNLOCK (&priv->lock) | + * | + * <if update == true> | + * <dispath-update-event> | + * <INODE_VERSION_UPDATE> | + * LOCK (&inode->lock) | + * LOCK (&priv->lock) | + * <CTX: 2, 1, 1> | + * UNLOCK (&priv->lock) | + * UNLOCK (&inode->lock) | + * + * Therefore, the change gets recorded in the next change (no lost change). If + * the slice version was ahead of the inode version (say I:1, S: 2), then + * anyway the comparison would result in a update (I: 1, S: 3). + * + * If the rollover time is too less, then there is another contention when the + * updater tries to bring up inode version to the slice version (this is also + * the case when the roll-over thread wakes up during INODE_VERSION_UPDATE. + * + * <CTX: 1, 1, 1> | <SLICE: 2, 2, 2> + * | + * | + * <dispath-update-event> | + * <INODE_VERSION_UPDATE> | + * LOCK (&inode->lock) | + * LOCK (&priv->lock) | + * <CTX: 2, 1, 1> | + * UNLOCK (&priv->lock) | + * UNLOCK (&inode->lock) | + * | <dispatch-rollover-event> + * | LOCK (&priv->lock) + * | <SLICE_VERSION_UPDATE> + * | <SLICE: 3, 3, 3> + * | UNLOCK (&priv->lock) + * + * + * Case 2: When the fop thread wins + * + * [updater] | [rollover] + * | + * | <SLICE: 1, 1, 1> + * <changelog_update> | + * <changelog_inode_ctx_get> | + * <CTX: 0, 0, 0> | + * | + * LOCK (&priv->lock) | + * <INODE_VERSION_EQUALS_SLICE> | + * I: 0 <-> S: 1 | + * update: true | + * UNLOCK (&priv->lock) | + * | <dispatch-rollover-event> + * | LOCK (&priv->lock) + * | <SLICE_VERSION_UPDATE> + * | <SLICE: 2, 2, 2> + * | UNLOCK (&priv->lock) + * <if update == true> | + * <dispath-update-event> | + * <INODE_VERSION_UPDATE> | + * LOCK (&inode->lock) | + * LOCK (&priv->lock) | + * <CTX: 2, 0, 0> | + * UNLOCK (&priv->lock) | + * UNLOCK (&inode->lock) | + * + * Here again, if the inode version was equal to the slice version (I: 1, S: 1) + * then there is no need to record an update (as the equality of the two version + * signifies an update was recorded in the current time slice). + */ +inline void +changelog_update (xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, changelog_log_type type) +{ + int ret = 0; + unsigned long *iver = NULL; + unsigned long version = 0; + inode_t *inode = NULL; + changelog_time_slice_t *slice = NULL; + changelog_inode_ctx_t *ctx = NULL; + changelog_log_data_t *cld_0 = NULL; + changelog_log_data_t *cld_1 = NULL; + changelog_local_t *next_local = NULL; + gf_boolean_t need_upd = _gf_true; + + slice = &priv->slice; + + /** + * for fops that do not require inode version checking + */ + if (local->update_no_check) + goto update; + + inode = local->inode; + + ctx = changelog_inode_ctx_get (this, + inode, &iver, &version, type); + if (!ctx) + goto update; + + INODE_VERSION_EQUALS_SLICE (priv, version, slice, type, need_upd); + + update: + if (need_upd) { + cld_0 = &local->cld; + cld_0->cld_type = type; + + if ( (next_local = local->prev_entry) != NULL ) { + cld_1 = &next_local->cld; + cld_1->cld_type = type; + } + + ret = priv->cd.dispatchfn (this, priv, + priv->cd.cd_data, cld_0, cld_1); + + /** + * update after the dispatcher has successfully done + * it's job. + */ + if (!local->update_no_check && iver && !ret) + INODE_VERSION_UPDATE (priv, inode, iver, slice, type); + } + + return; +} diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h new file mode 100644 index 000000000..ad79636b0 --- /dev/null +++ b/xlators/features/changelog/src/changelog-helpers.h @@ -0,0 +1,395 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_HELPERS_H +#define _CHANGELOG_HELPERS_H + +#include "locking.h" +#include "timer.h" +#include "pthread.h" +#include "iobuf.h" + +#include "changelog-misc.h" + +/** + * the changelog entry + */ +typedef struct changelog_log_data { + /* rollover related */ + unsigned long cld_roll_time; + + /* reopen changelog? */ + gf_boolean_t cld_finale; + + changelog_log_type cld_type; + + /** + * sincd gfid is _always_ a necessity, it's not a part + * of the iobuf. by doing this we do not add any overhead + * for data and metadata related fops. + */ + uuid_t cld_gfid; + + /** + * iobufs are used for optionals records: pargfid, path, + * write offsets etc.. It's the fop implementers job + * to allocate (iobuf_get() in the fop) and get unref'ed + * in the callback (CHANGELOG_STACK_UNWIND). + */ + struct iobuf *cld_iobuf; + +#define cld_ptr cld_iobuf->ptr + + /** + * after allocation you can point this to the length of + * usable data, but make sure it does not exceed the + * the size of the requested iobuf. + */ + size_t cld_iobuf_len; + +#define cld_ptr_len cld_iobuf_len + + /** + * number of optional records + */ + int cld_xtra_records; +} changelog_log_data_t; + +/** + * holder for dispatch function and private data + */ + +typedef struct changelog_priv changelog_priv_t; + +typedef struct changelog_dispatcher { + void *cd_data; + int (*dispatchfn) (xlator_t *, changelog_priv_t *, void *, + changelog_log_data_t *, changelog_log_data_t *); +} changelog_dispatcher_t; + +struct changelog_bootstrap { + changelog_mode_t mode; + int (*ctor) (xlator_t *, changelog_dispatcher_t *); + int (*dtor) (xlator_t *, changelog_dispatcher_t *); +}; + +struct changelog_encoder { + changelog_encoder_t encoder; + int (*encode) (xlator_t *, changelog_log_data_t *); +}; + + +/* xlator private */ + +typedef struct changelog_time_slice { + /** + * just in case we need nanosecond granularity some day. + * field is unused as of now (maybe we'd need it later). + */ + struct timeval tv_start; + + /** + * version of changelog file, incremented each time changes + * rollover. + */ + unsigned long changelog_version[CHANGELOG_MAX_TYPE]; +} changelog_time_slice_t; + +typedef struct changelog_rollover { + /* rollover thread */ + pthread_t rollover_th; + + xlator_t *this; +} changelog_rollover_t; + +typedef struct changelog_fsync { + /* fsync() thread */ + pthread_t fsync_th; + + xlator_t *this; +} changelog_fsync_t; + +# define CHANGELOG_MAX_CLIENTS 5 +typedef struct changelog_notify { + /* reader end of the pipe */ + int rfd; + + /* notifier thread */ + pthread_t notify_th; + + /* unique socket path */ + char sockpath[PATH_MAX]; + + int socket_fd; + + /** + * simple array of accept()'ed fds. Not scalable at all + * for large number of clients, but it's okay as we have + * a ahrd limit in this version (@CHANGELOG_MAX_CLIENTS). + */ + int client_fd[CHANGELOG_MAX_CLIENTS]; + + xlator_t *this; +} changelog_notify_t; + +struct changelog_priv { + gf_boolean_t active; + + /* to generate unique socket file per brick */ + char *changelog_brick; + + /* logging directory */ + char *changelog_dir; + + /* one file for all changelog types */ + int changelog_fd; + + gf_lock_t lock; + + /* writen end of the pipe */ + int wfd; + + /* rollover time */ + int32_t rollover_time; + + /* fsync() interval */ + int32_t fsync_interval; + + /* changelog type maps */ + const char *maps[CHANGELOG_MAX_TYPE]; + + /* time slicer */ + changelog_time_slice_t slice; + + /* context of the updater */ + changelog_dispatcher_t cd; + + /* context of the rollover thread */ + changelog_rollover_t cr; + + /* context of fsync thread */ + changelog_fsync_t cf; + + /* context of the notifier thread */ + changelog_notify_t cn; + + /* operation mode */ + changelog_mode_t op_mode; + + /* bootstrap routine for 'current' logger */ + struct changelog_bootstrap *cb; + + /* encoder mode */ + changelog_encoder_t encode_mode; + + /* encoder */ + struct changelog_encoder *ce; +}; + +struct changelog_local { + inode_t *inode; + gf_boolean_t update_no_check; + + changelog_log_data_t cld; + + /** + * ->prev_entry is used in cases when there needs to be + * additional changelog entry for the parent (eg. rename) + * It's analogous to ->next in single linked list world, + * but we call it as ->prev_entry... ha ha ha + */ + struct changelog_local *prev_entry; +}; + +typedef struct changelog_local changelog_local_t; + +/* inode version is stored in inode ctx */ +typedef struct changelog_inode_ctx { + unsigned long iversion[CHANGELOG_MAX_TYPE]; +} changelog_inode_ctx_t; + +#define CHANGELOG_INODE_VERSION_TYPE(ctx, type) &(ctx->iversion[type]) + +/** + * Optional Records: + * fops that need to save additional information request a array of + * @changelog_opt_t struct. The array is allocated via @iobufs. + */ +typedef enum { + CHANGELOG_OPT_REC_FOP, + CHANGELOG_OPT_REC_ENTRY, +} changelog_optional_rec_type_t; + +struct changelog_entry_fields { + uuid_t cef_uuid; + char *cef_bname; +}; + +typedef struct { + /** + * @co_covert can be used to do post-processing of the record before + * it's persisted to the CHANGELOG. If this is NULL, then the record + * is persisted as per it's in memory format. + */ + size_t (*co_convert) (void *data, char *buffer, gf_boolean_t encode); + + /* release routines */ + void (*co_free) (void *data); + + /* type of the field */ + changelog_optional_rec_type_t co_type; + + /** + * sizeof of the 'valid' field in the union. This field is not used if + * @co_convert is specified. + */ + size_t co_len; + + union { + glusterfs_fop_t co_fop; + struct changelog_entry_fields co_entry; + }; +} changelog_opt_t; + +#define CHANGELOG_OPT_RECORD_LEN sizeof (changelog_opt_t) + +/** + * helpers routines + */ + +void +changelog_thread_cleanup (xlator_t *this, pthread_t thr_id); +inline void * +changelog_get_usable_buffer (changelog_local_t *local); +inline void +changelog_set_usable_record_and_length (changelog_local_t *local, + size_t len, int xr); +void +changelog_local_cleanup (xlator_t *xl, changelog_local_t *local); +changelog_local_t * +changelog_local_init (xlator_t *this, inode_t *inode, uuid_t gfid, + int xtra_records, gf_boolean_t update_flag); +int +changelog_start_next_change (xlator_t *this, + changelog_priv_t *priv, + unsigned long ts, gf_boolean_t finale); +int +changelog_open (xlator_t *this, changelog_priv_t *priv); +int +changelog_fill_rollover_data (changelog_log_data_t *cld, gf_boolean_t is_last); +int +changelog_inject_single_event (xlator_t *this, + changelog_priv_t *priv, + changelog_log_data_t *cld); +inline size_t +changelog_entry_length (); +inline int +changelog_write (int fd, char *buffer, size_t len); +int +changelog_write_change (changelog_priv_t *priv, char *buffer, size_t len); +inline int +changelog_handle_change (xlator_t *this, + changelog_priv_t *priv, changelog_log_data_t *cld); +inline void +changelog_update (xlator_t *this, changelog_priv_t *priv, + changelog_local_t *local, changelog_log_type type); +void * +changelog_rollover (void *data); +void * +changelog_fsync_thread (void *data); +int +changelog_forget (xlator_t *this, inode_t *inode); + +/* macros */ + +#define CHANGELOG_STACK_UNWIND(fop, frame, params ...) do { \ + changelog_local_t *__local = NULL; \ + xlator_t *__xl = NULL; \ + if (frame) { \ + __local = frame->local; \ + __xl = frame->this; \ + frame->local = NULL; \ + } \ + STACK_UNWIND_STRICT (fop, frame, params); \ + changelog_local_cleanup (__xl, __local); \ + if (__local && __local->prev_entry) \ + changelog_local_cleanup (__xl, \ + __local->prev_entry); \ + } while (0) + +#define CHANGELOG_IOBUF_REF(iobuf) do { \ + if (iobuf) \ + iobuf_ref (iobuf); \ + } while (0) + +#define CHANGELOG_IOBUF_UNREF(iobuf) do { \ + if (iobuf) \ + iobuf_unref (iobuf); \ + } while (0) + +#define CHANGELOG_FILL_BUFFER(buffer, off, val, len) do { \ + memcpy (buffer + off, val, len); \ + off += len; \ + } while (0) + +#define SLICE_VERSION_UPDATE(slice) do { \ + int i = 0; \ + for (; i < CHANGELOG_MAX_TYPE; i++) { \ + slice->changelog_version[i]++; \ + } \ + } while (0) + +#define CHANGLOG_FILL_FOP_NUMBER(co, fop, converter, xlen) do { \ + co->co_convert = converter; \ + co->co_free = NULL; \ + co->co_type = CHANGELOG_OPT_REC_FOP; \ + co->co_fop = fop; \ + xlen += sizeof (fop); \ + } while (0) + +#define CHANGELOG_FILL_ENTRY(co, pargfid, bname, \ + converter, freefn, xlen, label) \ + do { \ + co->co_convert = converter; \ + co->co_free = freefn; \ + co->co_type = CHANGELOG_OPT_REC_ENTRY; \ + uuid_copy (co->co_entry.cef_uuid, pargfid); \ + co->co_entry.cef_bname = gf_strdup(bname); \ + if (!co->co_entry.cef_bname) \ + goto label; \ + xlen += (UUID_CANONICAL_FORM_LEN + strlen (bname)); \ + } while (0) + +#define CHANGELOG_INIT(this, local, inode, gfid, xrec) \ + local = changelog_local_init (this, inode, gfid, xrec, _gf_false) + +#define CHANGELOG_INIT_NOCHECK(this, local, inode, gfid, xrec) \ + local = changelog_local_init (this, inode, gfid, xrec, _gf_true) + +#define CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, label) do { \ + if (!priv->active) \ + goto label; \ + /* ignore rebalance process's activity. */ \ + if (frame->root->pid == GF_CLIENT_PID_DEFRAG) \ + goto label; \ + } while (0) + +/* ignore internal fops */ +#define CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(dict, label) do { \ + if (dict && dict_get (dict, GLUSTERFS_INTERNAL_FOP_KEY)) \ + goto label; \ + } while (0) + +#define CHANGELOG_COND_GOTO(priv, cond, label) do { \ + if (!priv->active || cond) \ + goto label; \ + } while (0) + +#endif /* _CHANGELOG_HELPERS_H */ diff --git a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h new file mode 100644 index 000000000..d72464eab --- /dev/null +++ b/xlators/features/changelog/src/changelog-mem-types.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_MEM_TYPES_H +#define _CHANGELOG_MEM_TYPES_H + +#include "mem-types.h" + +enum gf_changelog_mem_types { + gf_changelog_mt_priv_t = gf_common_mt_end + 1, + gf_changelog_mt_str_t = gf_common_mt_end + 2, + gf_changelog_mt_batch_t = gf_common_mt_end + 3, + gf_changelog_mt_rt_t = gf_common_mt_end + 4, + gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5, + gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 6, + gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 7, + gf_changelog_mt_libgfchangelog_dirent_t = gf_common_mt_end + 8, + gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 9, + gf_changelog_mt_end +}; + +#endif diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h new file mode 100644 index 000000000..0712a3771 --- /dev/null +++ b/xlators/features/changelog/src/changelog-misc.h @@ -0,0 +1,101 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_MISC_H +#define _CHANGELOG_MISC_H + +#include "glusterfs.h" +#include "common-utils.h" + +#define CHANGELOG_MAX_TYPE 3 +#define CHANGELOG_FILE_NAME "CHANGELOG" + +#define CHANGELOG_VERSION_MAJOR 1 +#define CHANGELOG_VERSION_MINOR 0 + +#define CHANGELOG_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY"/changelog-%s.sock" + +/** + * header starts with the version and the format of the changelog. + * 'version' not much of a use now. + */ +#define CHANGELOG_HEADER \ + "GlusterFS Changelog | version: v%d.%d | encoding : %d\n" + +#define CHANGELOG_MAKE_SOCKET_PATH(brick_path, sockpath, len) do { \ + char md5_sum[MD5_DIGEST_LENGTH*2+1] = {0,}; \ + md5_wrapper((unsigned char *) brick_path, \ + strlen(brick_path), \ + md5_sum); \ + (void) snprintf (sockpath, len, \ + CHANGELOG_UNIX_SOCK, md5_sum); \ + } while (0) + +/** + * ... used by libgfchangelog. + */ +#define CHANGELOG_GET_ENCODING(fd, buffer, len, enc, enc_len) do { \ + FILE *fp; \ + int fd_dup, maj, min; \ + \ + enc = -1; \ + fd_dup = dup (fd); \ + \ + if (fd_dup != -1) { \ + fp = fdopen (fd_dup, "r"); \ + if (fp) { \ + if (fgets (buffer, len, fp)) { \ + elen = strlen (buffer); \ + sscanf (buffer, \ + CHANGELOG_HEADER, \ + &maj, &min, &enc); \ + } \ + fclose (fp); \ + } else { \ + close (fd_dup); \ + } \ + } \ + } while (0) + +/** + * everything after 'CHANGELOG_TYPE_ENTRY' are internal types + * (ie. none of the fops trigger this type of event), hence + * CHANGELOG_MAX_TYPE = 3 + */ +typedef enum { + CHANGELOG_TYPE_DATA = 0, + CHANGELOG_TYPE_METADATA, + CHANGELOG_TYPE_ENTRY, + CHANGELOG_TYPE_ROLLOVER, + CHANGELOG_TYPE_FSYNC, +} changelog_log_type; + +/* operation modes - RT for now */ +typedef enum { + CHANGELOG_MODE_RT = 0, +} changelog_mode_t; + +/* encoder types */ + +typedef enum { + CHANGELOG_ENCODE_MIN = 0, + CHANGELOG_ENCODE_BINARY, + CHANGELOG_ENCODE_ASCII, + CHANGELOG_ENCODE_MAX, +} changelog_encoder_t; + +#define CHANGELOG_VALID_ENCODING(enc) \ + (enc > CHANGELOG_ENCODE_MIN && enc < CHANGELOG_ENCODE_MAX) + +#define CHANGELOG_TYPE_IS_ENTRY(type) (type == CHANGELOG_TYPE_ENTRY) +#define CHANGELOG_TYPE_IS_ROLLOVER(type) (type == CHANGELOG_TYPE_ROLLOVER) +#define CHANGELOG_TYPE_IS_FSYNC(type) (type == CHANGELOG_TYPE_FSYNC) + +#endif /* _CHANGELOG_MISC_H */ diff --git a/xlators/features/changelog/src/changelog-notifier.c b/xlators/features/changelog/src/changelog-notifier.c new file mode 100644 index 000000000..1f8b31253 --- /dev/null +++ b/xlators/features/changelog/src/changelog-notifier.c @@ -0,0 +1,314 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "changelog-notifier.h" + +#include <pthread.h> + +inline static void +changelog_notify_clear_fd (changelog_notify_t *cn, int i) +{ + cn->client_fd[i] = -1; +} + +inline static void +changelog_notify_save_fd (changelog_notify_t *cn, int i, int fd) +{ + cn->client_fd[i] = fd; +} + +static int +changelog_notify_insert_fd (xlator_t *this, changelog_notify_t *cn, int fd) +{ + int i = 0; + int ret = 0; + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + if (cn->client_fd[i] == -1) + break; + } + + if (i == CHANGELOG_MAX_CLIENTS) { + /** + * this case should not be hit as listen() would limit + * the number of completely established connections. + */ + gf_log (this->name, GF_LOG_WARNING, + "hit max client limit (%d)", CHANGELOG_MAX_CLIENTS); + ret = -1; + } + else + changelog_notify_save_fd (cn, i, fd); + + return ret; +} + +static void +changelog_notify_fill_rset (changelog_notify_t *cn, fd_set *rset, int *maxfd) +{ + int i = 0; + + FD_ZERO (rset); + + FD_SET (cn->socket_fd, rset); + *maxfd = cn->socket_fd; + + FD_SET (cn->rfd, rset); + *maxfd = max (*maxfd, cn->rfd); + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + if (cn->client_fd[i] != -1) { + FD_SET (cn->client_fd[i], rset); + *maxfd = max (*maxfd, cn->client_fd[i]); + } + } + + *maxfd = *maxfd + 1; +} + +static int +changelog_notify_client (changelog_notify_t *cn, char *path, ssize_t len) +{ + int i = 0; + int ret = 0; + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + if (cn->client_fd[i] == -1) + continue; + + if (changelog_write (cn->client_fd[i], + path, len)) { + ret = -1; + + close (cn->client_fd[i]); + changelog_notify_clear_fd (cn, i); + } + } + + return ret; +} + +static void +changelog_notifier_init (changelog_notify_t *cn) +{ + int i = 0; + + cn->socket_fd = -1; + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + changelog_notify_clear_fd (cn, i); + } +} + +static void +changelog_close_client_conn (changelog_notify_t *cn) +{ + int i = 0; + + for (; i < CHANGELOG_MAX_CLIENTS; i++) { + if (cn->client_fd[i] == -1) + continue; + + close (cn->client_fd[i]); + changelog_notify_clear_fd (cn, i); + } +} + +static void +changelog_notifier_cleanup (void *arg) +{ + changelog_notify_t *cn = NULL; + + cn = (changelog_notify_t *) arg; + + changelog_close_client_conn (cn); + + if (cn->socket_fd != -1) + close (cn->socket_fd); + + if (cn->rfd) + close (cn->rfd); + + if (unlink (cn->sockpath)) + gf_log ("", GF_LOG_WARNING, + "could not unlink changelog socket file" + " %s (reason: %s", cn->sockpath, strerror (errno)); +} + +void * +changelog_notifier (void *data) +{ + int i = 0; + int fd = 0; + int max_fd = 0; + int len = 0; + ssize_t readlen = 0; + xlator_t *this = NULL; + changelog_priv_t *priv = NULL; + changelog_notify_t *cn = NULL; + struct sockaddr_un local = {0,}; + char path[PATH_MAX] = {0,}; + char abspath[PATH_MAX] = {0,}; + + char buffer; + fd_set rset; + + priv = (changelog_priv_t *) data; + + cn = &priv->cn; + this = cn->this; + + pthread_cleanup_push (changelog_notifier_cleanup, cn); + + changelog_notifier_init (cn); + + cn->socket_fd = socket (AF_UNIX, SOCK_STREAM, 0); + if (cn->socket_fd < 0) { + gf_log (this->name, GF_LOG_ERROR, + "changelog socket error (reason: %s)", + strerror (errno)); + goto out; + } + + CHANGELOG_MAKE_SOCKET_PATH (priv->changelog_brick, + cn->sockpath, PATH_MAX); + if (unlink (cn->sockpath) < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, + "Could not unlink changelog socket file (%s)" + " (reason: %s)", + CHANGELOG_UNIX_SOCK, strerror (errno)); + goto cleanup; + } + } + + local.sun_family = AF_UNIX; + strcpy (local.sun_path, cn->sockpath); + + len = strlen (local.sun_path) + sizeof (local.sun_family); + + /* bind to the unix domain socket */ + if (bind (cn->socket_fd, (struct sockaddr *) &local, len) < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Could not bind to changelog socket (reason: %s)", + strerror (errno)); + goto cleanup; + } + + /* listen for incoming connections */ + if (listen (cn->socket_fd, CHANGELOG_MAX_CLIENTS) < 0) { + gf_log (this->name, GF_LOG_ERROR, + "listen() error on changelog socket (reason: %s)", + strerror (errno)); + goto cleanup; + } + + /** + * simple select() on all to-be-read file descriptors. This method + * though old school works pretty well when you have a handfull of + * fd's to be watched (clients). + * + * Future TODO: move this to epoll based notification facility if + * number of clients increase. + */ + for (;;) { + changelog_notify_fill_rset (cn, &rset, &max_fd); + + if (select (max_fd, &rset, NULL, NULL, NULL) < 0) { + gf_log (this->name, GF_LOG_ERROR, + "select() returned -1 (reason: %s)", + strerror (errno)); + sleep (2); + continue; + } + + if (FD_ISSET (cn->socket_fd, &rset)) { + fd = accept (cn->socket_fd, NULL, NULL); + if (fd < 0) { + gf_log (this->name, GF_LOG_ERROR, + "accept error on changelog socket" + " (reason: %s)", strerror (errno)); + } else if (changelog_notify_insert_fd (this, cn, fd)) { + gf_log (this->name, GF_LOG_ERROR, + "hit max client limit"); + } + } + + if (FD_ISSET (cn->rfd, &rset)) { + /** + * read changelog filename and notify all connected + * clients. + */ + readlen = 0; + while (readlen < PATH_MAX) { + len = read (cn->rfd, &path[readlen++], 1); + if (len == -1) { + break; + } + + if (len == 0) { + gf_log (this->name, GF_LOG_ERROR, + "rollover thread sent EOF" + " on pipe - possibly a crash."); + /* be blunt and close all connections */ + pthread_exit(NULL); + } + + if (path[readlen - 1] == '\0') + break; + } + + /* should we close all client connections here too? */ + if (len < 0 || readlen == PATH_MAX) { + gf_log (this->name, GF_LOG_ERROR, + "Could not get pathname from rollover" + " thread or pathname too long"); + goto process_rest; + } + + (void) snprintf (abspath, PATH_MAX, + "%s/%s", priv->changelog_dir, path); + if (changelog_notify_client (cn, abspath, + strlen (abspath) + 1)) + gf_log (this->name, GF_LOG_ERROR, + "could not notify some clients with new" + " changelogs"); + } + + process_rest: + for (i = 0; i < CHANGELOG_MAX_CLIENTS; i++) { + if ( (fd = cn->client_fd[i]) == -1 ) + continue; + + if (FD_ISSET (fd, &rset)) { + /** + * the only data we accept from the client is a + * disconnect. Anything else is treated as bogus + * and is silently discarded (also warned!!!). + */ + if ( (readlen = read (fd, &buffer, 1)) <= 0 ) { + close (fd); + changelog_notify_clear_fd (cn, i); + } else { + /* silently discard data and log */ + gf_log (this->name, GF_LOG_WARNING, + "misbehaving changelog client"); + } + } + } + + } + + cleanup:; + pthread_cleanup_pop (1); + + out: + return NULL; +} diff --git a/xlators/features/marker/utils/src/procdiggy.h b/xlators/features/changelog/src/changelog-notifier.h index 56dfc4eb2..55e728356 100644 --- a/xlators/features/marker/utils/src/procdiggy.h +++ b/xlators/features/changelog/src/changelog-notifier.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -7,14 +7,13 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ -#ifdef __NetBSD__ -#include <sys/syslimits.h> -#endif /* __NetBSD__ */ -#define PROC "/proc" +#ifndef _CHANGELOG_NOTIFIER_H +#define _CHANGELOG_NOTIFIER_H -pid_t pidinfo (pid_t pid, char **name); +#include "changelog-helpers.h" -int prociter (int (*proch) (pid_t pid, pid_t ppid, char *name, void *data), - void *data); +void * +changelog_notifier (void *data); +#endif diff --git a/xlators/features/changelog/src/changelog-rt.c b/xlators/features/changelog/src/changelog-rt.c new file mode 100644 index 000000000..c147f68ca --- /dev/null +++ b/xlators/features/changelog/src/changelog-rt.c @@ -0,0 +1,72 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "defaults.h" +#include "logging.h" + +#include "changelog-rt.h" +#include "changelog-mem-types.h" + +int +changelog_rt_init (xlator_t *this, changelog_dispatcher_t *cd) +{ + changelog_rt_t *crt = NULL; + + crt = GF_CALLOC (1, sizeof (*crt), + gf_changelog_mt_rt_t); + if (!crt) + return -1; + + LOCK_INIT (&crt->lock); + + cd->cd_data = crt; + cd->dispatchfn = &changelog_rt_enqueue; + + return 0; +} + +int +changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd) +{ + changelog_rt_t *crt = NULL; + + crt = cd->cd_data; + + LOCK_DESTROY (&crt->lock); + GF_FREE (crt); + + return 0; +} + +int +changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, + changelog_log_data_t *cld_0, changelog_log_data_t *cld_1) +{ + int ret = 0; + changelog_rt_t *crt = NULL; + + crt = (changelog_rt_t *) cbatch; + + LOCK (&crt->lock); + { + ret = changelog_handle_change (this, priv, cld_0); + if (!ret && cld_1) + ret = changelog_handle_change (this, priv, cld_1); + } + UNLOCK (&crt->lock); + + return ret; +} diff --git a/xlators/features/changelog/src/changelog-rt.h b/xlators/features/changelog/src/changelog-rt.h new file mode 100644 index 000000000..1fc2bbc5b --- /dev/null +++ b/xlators/features/changelog/src/changelog-rt.h @@ -0,0 +1,33 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CHANGELOG_RT_H +#define _CHANGELOG_RT_H + +#include "locking.h" +#include "timer.h" +#include "pthread.h" + +#include "changelog-helpers.h" + +/* unused as of now - may be you would need it later */ +typedef struct changelog_rt { + gf_lock_t lock; +} changelog_rt_t; + +int +changelog_rt_init (xlator_t *this, changelog_dispatcher_t *cd); +int +changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd); +int +changelog_rt_enqueue (xlator_t *this, changelog_priv_t *priv, void *cbatch, + changelog_log_data_t *cld_0, changelog_log_data_t *cld_1); + +#endif /* _CHANGELOG_RT_H */ diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c new file mode 100644 index 000000000..cea0e8c70 --- /dev/null +++ b/xlators/features/changelog/src/changelog.c @@ -0,0 +1,1477 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "defaults.h" +#include "logging.h" +#include "iobuf.h" + +#include "changelog-rt.h" + +#include "changelog-encoders.h" +#include "changelog-mem-types.h" + +#include <pthread.h> + +#include "changelog-notifier.h" + +static struct changelog_bootstrap +cb_bootstrap[] = { + { + .mode = CHANGELOG_MODE_RT, + .ctor = changelog_rt_init, + .dtor = changelog_rt_fini, + }, +}; + +/* Entry operations - TYPE III */ + +/** + * entry operations do not undergo inode version checking. + */ + +/* {{{ */ + +/* rmdir */ + +int32_t +changelog_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (rmdir, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflags, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT_NOCHECK (this, frame->local, + NULL, loc->inode->gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_rmdir_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->rmdir, + loc, xflags, xdata); + return 0; +} + +/* unlink */ + +int32_t +changelog_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (unlink, frame, op_ret, op_errno, + preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflags, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, loc->inode->gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_unlink_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->unlink, + loc, xflags, xdata); + return 0; +} + +/* rename */ + +int32_t +changelog_rename_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *buf, struct iatt *preoldparent, + struct iatt *postoldparent, struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (rename, frame, op_ret, op_errno, + buf, preoldparent, postoldparent, + prenewparent, postnewparent, xdata); + return 0; +} + + +int32_t +changelog_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + /* 3 == fop + oldloc + newloc */ + CHANGELOG_INIT_NOCHECK (this, frame->local, + NULL, oldloc->inode->gfid, 3); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + co++; + CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 3); + + wind: + STACK_WIND (frame, changelog_rename_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->rename, + oldloc, newloc, xdata); + return 0; +} + +/* link */ + +int32_t +changelog_link_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (link, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_link (call_frame_t *frame, + xlator_t *this, loc_t *oldloc, + loc_t *newloc, dict_t *xdata) +{ + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, oldloc->gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_link_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->link, + oldloc, newloc, xdata); + return 0; +} + +/* mkdir */ + +int32_t +changelog_mkdir_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (mkdir, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto wind; + } + uuid_copy (gfid, uuid_req); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_mkdir_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->mkdir, + loc, mode, umask, xdata); + return 0; +} + +/* symlink */ + +int32_t +changelog_symlink_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (symlink, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_symlink (call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc, + mode_t umask, dict_t *xdata) +{ + int ret = -1; + size_t xtra_len = 0; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto wind; + } + uuid_copy (gfid, uuid_req); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_symlink_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->symlink, + linkname, loc, umask, xdata); + return 0; +} + +/* mknod */ + +int32_t +changelog_mknod_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (mknod, frame, op_ret, op_errno, + inode, buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_mknod (call_frame_t *frame, + xlator_t *this, loc_t *loc, + mode_t mode, dev_t dev, mode_t umask, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + size_t xtra_len = 0; + changelog_priv_t *priv = NULL; + changelog_opt_t *co = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto wind; + } + uuid_copy (gfid, uuid_req); + + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_mknod_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->mknod, + loc, mode, dev, umask, xdata); + return 0; +} + +/* creat */ + +int32_t +changelog_create_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct iatt *buf, + struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY); + + unwind: + CHANGELOG_STACK_UNWIND (create, frame, + op_ret, op_errno, fd, inode, + buf, preparent, postparent, xdata); + return 0; +} + +int32_t +changelog_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + mode_t umask, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + uuid_t gfid = {0,}; + void *uuid_req = NULL; + changelog_opt_t *co = NULL; + changelog_priv_t *priv = NULL; + size_t xtra_len = 0; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + ret = dict_get_ptr (xdata, "gfid-req", &uuid_req); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get gfid from dict"); + goto wind; + } + uuid_copy (gfid, uuid_req); + + /* init with two extra records */ + CHANGELOG_INIT_NOCHECK (this, frame->local, NULL, gfid, 2); + if (!frame->local) + goto wind; + + co = changelog_get_usable_buffer (frame->local); + if (!co) + goto wind; + + CHANGLOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len); + + co++; + CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name, + entry_fn, entry_free_fn, xtra_len, wind); + + changelog_set_usable_record_and_length (frame->local, xtra_len, 2); + + wind: + STACK_WIND (frame, changelog_create_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +} + +/* }}} */ + + +/* Metadata modification fops - TYPE II */ + +/* {{{ */ + +/* {f}setattr */ + +int32_t +changelog_fsetattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preop_stbuf, + struct iatt *postop_stbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, + preop_stbuf, postop_stbuf, xdata); + + return 0; + + +} + +int32_t +changelog_fsetattr (call_frame_t *frame, + xlator_t *this, fd_t *fd, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_fsetattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, + fd, stbuf, valid, xdata); + return 0; + + +} + +int32_t +changelog_setattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preop_stbuf, + struct iatt *postop_stbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (setattr, frame, op_ret, op_errno, + preop_stbuf, postop_stbuf, xdata); + + return 0; +} + +int32_t +changelog_setattr (call_frame_t *frame, + xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_setattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; +} + +/* {f}removexattr */ + +int32_t +changelog_fremovexattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +changelog_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_fremovexattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fremovexattr, + fd, name, xdata); + return 0; +} + +int32_t +changelog_removexattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +changelog_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_removexattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->removexattr, + loc, name, xdata); + return 0; +} + +/* {f}setxattr */ + +int32_t +changelog_setxattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +changelog_setxattr (call_frame_t *frame, + xlator_t *this, loc_t *loc, + dict_t *dict, int32_t flags, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_setxattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; +} + +int32_t +changelog_fsetxattr_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA); + + unwind: + CHANGELOG_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +changelog_fsetxattr (call_frame_t *frame, + xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_fsetxattr_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; +} + +/* }}} */ + + +/* Data modification fops - TYPE I */ + +/* {{{ */ + +/* {f}truncate() */ + +int32_t +changelog_truncate_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + + unwind: + CHANGELOG_STACK_UNWIND (truncate, frame, + op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +changelog_truncate (call_frame_t *frame, + xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + loc->inode, loc->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_truncate_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate, + loc, offset, xdata); + return 0; +} + +int32_t +changelog_ftruncate_cbk (call_frame_t *frame, + void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + + unwind: + CHANGELOG_STACK_UNWIND (ftruncate, frame, + op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +changelog_ftruncate (call_frame_t *frame, + xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_ftruncate_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate, + fd, offset, xdata); + return 0; +} + +/* writev() */ + +int32_t +changelog_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, + dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + changelog_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + CHANGELOG_COND_GOTO (priv, ((op_ret <= 0) || !local), unwind); + + changelog_update (this, priv, local, CHANGELOG_TYPE_DATA); + + unwind: + CHANGELOG_STACK_UNWIND (writev, frame, + op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +changelog_writev (call_frame_t *frame, + xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + changelog_priv_t *priv = NULL; + + priv = this->private; + CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind); + + CHANGELOG_INIT (this, frame->local, + fd->inode, fd->inode->gfid, 0); + + wind: + STACK_WIND (frame, changelog_writev_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->writev, fd, vector, + count, offset, flags, iobref, xdata); + return 0; +} + +/* }}} */ + +/** + * The + * - @init () + * - @fini () + * - @reconfigure () + * ... and helper routines + */ + +/** + * needed if there are more operation modes in the future. + */ +static void +changelog_assign_opmode (changelog_priv_t *priv, char *mode) +{ + if ( strncmp (mode, "realtime", 8) == 0 ) { + priv->op_mode = CHANGELOG_MODE_RT; + } +} + +static void +changelog_assign_encoding (changelog_priv_t *priv, char *enc) +{ + if ( strncmp (enc, "binary", 6) == 0 ) { + priv->encode_mode = CHANGELOG_ENCODE_BINARY; + } else if ( strncmp (enc, "ascii", 5) == 0 ) { + priv->encode_mode = CHANGELOG_ENCODE_ASCII; + } +} + +/* cleanup any helper threads that are running */ +static void +changelog_cleanup_helper_threads (xlator_t *this, changelog_priv_t *priv) +{ + if (priv->cr.rollover_th) { + changelog_thread_cleanup (this, priv->cr.rollover_th); + priv->cr.rollover_th = 0; + } + + if (priv->cf.fsync_th) { + changelog_thread_cleanup (this, priv->cf.fsync_th); + priv->cf.fsync_th = 0; + } +} + +/* spawn helper thread; cleaning up in case of errors */ +static int +changelog_spawn_helper_threads (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + + priv->cr.this = this; + ret = gf_thread_create (&priv->cr.rollover_th, + NULL, changelog_rollover, priv); + if (ret) + goto out; + + if (priv->fsync_interval) { + priv->cf.this = this; + ret = gf_thread_create (&priv->cf.fsync_th, + NULL, changelog_fsync_thread, priv); + } + + if (ret) + changelog_cleanup_helper_threads (this, priv); + + out: + return ret; +} + +/* cleanup the notifier thread */ +static int +changelog_cleanup_notifier (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + + if (priv->cn.notify_th) { + changelog_thread_cleanup (this, priv->cn.notify_th); + priv->cn.notify_th = 0; + + ret = close (priv->wfd); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "error closing writer end of notifier pipe" + " (reason: %s)", strerror (errno)); + } + + return ret; +} + +/* spawn the notifier thread - nop if already running */ +static int +changelog_spawn_notifier (xlator_t *this, changelog_priv_t *priv) +{ + int ret = 0; + int flags = 0; + int pipe_fd[2] = {0, 0}; + + if (priv->cn.notify_th) + goto out; /* notifier thread already running */ + + ret = pipe (pipe_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot create pipe (reason: %s)", strerror (errno)); + goto out; + } + + /* writer is non-blocking */ + flags = fcntl (pipe_fd[1], F_GETFL); + flags |= O_NONBLOCK; + + ret = fcntl (pipe_fd[1], F_SETFL, flags); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set O_NONBLOCK flag"); + goto out; + } + + priv->wfd = pipe_fd[1]; + + priv->cn.this = this; + priv->cn.rfd = pipe_fd[0]; + + ret = gf_thread_create (&priv->cn.notify_th, + NULL, changelog_notifier, priv); + + out: + return ret; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_changelog_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +static int +changelog_init (xlator_t *this, changelog_priv_t *priv) +{ + int i = 0; + int ret = -1; + struct timeval tv = {0,}; + changelog_log_data_t cld = {0,}; + + ret = gettimeofday (&tv, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "gettimeofday() failure"); + goto out; + } + + priv->slice.tv_start = tv; + + priv->maps[CHANGELOG_TYPE_DATA] = "D "; + priv->maps[CHANGELOG_TYPE_METADATA] = "M "; + priv->maps[CHANGELOG_TYPE_ENTRY] = "E "; + + for (; i < CHANGELOG_MAX_TYPE; i++) { + /* start with version 1 */ + priv->slice.changelog_version[i] = 1; + } + + if (!priv->active) + return ret; + + /* spawn the notifier thread */ + ret = changelog_spawn_notifier (this, priv); + if (ret) + goto out; + + /** + * start with a fresh changelog file every time. this is done + * in case there was an encoding change. so... things are kept + * simple here. + */ + ret = changelog_fill_rollover_data (&cld, _gf_false); + if (ret) + goto out; + + LOCK (&priv->lock); + { + ret = changelog_inject_single_event (this, priv, &cld); + } + UNLOCK (&priv->lock); + + /* ... and finally spawn the helpers threads */ + ret = changelog_spawn_helper_threads (this, priv); + + out: + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = 0; + char *tmp = NULL; + changelog_priv_t *priv = NULL; + gf_boolean_t active_earlier = _gf_true; + gf_boolean_t active_now = _gf_true; + changelog_time_slice_t *slice = NULL; + changelog_log_data_t cld = {0,}; + + priv = this->private; + if (!priv) + goto out; + + ret = -1; + active_earlier = priv->active; + + /* first stop the rollover and the fsync thread */ + changelog_cleanup_helper_threads (this, priv); + + GF_OPTION_RECONF ("changelog-dir", tmp, options, str, out); + if (!tmp) { + gf_log (this->name, GF_LOG_ERROR, + "\"changelog-dir\" option is not set"); + goto out; + } + + GF_FREE (priv->changelog_dir); + priv->changelog_dir = gf_strdup (tmp); + if (!priv->changelog_dir) + goto out; + + ret = mkdir_p (priv->changelog_dir, 0600, _gf_true); + if (ret) + goto out; + + GF_OPTION_RECONF ("changelog", active_now, options, bool, out); + + /** + * changelog_handle_change() handles changes that could possibly + * have been submit changes before changelog deactivation. + */ + if (!active_now) + priv->active = _gf_false; + + GF_OPTION_RECONF ("op-mode", tmp, options, str, out); + changelog_assign_opmode (priv, tmp); + + tmp = NULL; + + GF_OPTION_RECONF ("encoding", tmp, options, str, out); + changelog_assign_encoding (priv, tmp); + + GF_OPTION_RECONF ("rollover-time", + priv->rollover_time, options, int32, out); + GF_OPTION_RECONF ("fsync-interval", + priv->fsync_interval, options, int32, out); + + if (active_now || active_earlier) { + ret = changelog_fill_rollover_data (&cld, !active_now); + if (ret) + goto out; + + slice = &priv->slice; + + LOCK (&priv->lock); + { + ret = changelog_inject_single_event (this, priv, &cld); + if (!ret && active_now) + SLICE_VERSION_UPDATE (slice); + } + UNLOCK (&priv->lock); + + if (ret) + goto out; + + if (active_now) { + ret = changelog_spawn_notifier (this, priv); + if (!ret) + ret = changelog_spawn_helper_threads (this, + priv); + } else + ret = changelog_cleanup_notifier (this, priv); + } + + out: + if (ret) { + ret = changelog_cleanup_notifier (this, priv); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "changelog reconfigured"); + if (active_now) + priv->active = _gf_true; + } + + return ret; +} + +int32_t +init (xlator_t *this) +{ + int ret = -1; + char *tmp = NULL; + changelog_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("changelog", this, out); + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "translator needs a single subvolume"); + goto out; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_ERROR, + "dangling volume. please check volfile"); + goto out; + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_changelog_mt_priv_t); + if (!priv) + goto out; + + this->local_pool = mem_pool_new (changelog_local_t, 64); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create local memory pool"); + goto out; + } + + LOCK_INIT (&priv->lock); + + GF_OPTION_INIT ("changelog-brick", tmp, str, out); + if (!tmp) { + gf_log (this->name, GF_LOG_ERROR, + "\"changelog-brick\" option is not set"); + goto out; + } + + priv->changelog_brick = gf_strdup (tmp); + if (!priv->changelog_brick) + goto out; + tmp = NULL; + + GF_OPTION_INIT ("changelog-dir", tmp, str, out); + if (!tmp) { + gf_log (this->name, GF_LOG_ERROR, + "\"changelog-dir\" option is not set"); + goto out; + } + + priv->changelog_dir = gf_strdup (tmp); + if (!priv->changelog_dir) + goto out; + tmp = NULL; + + /** + * create the directory even if change-logging would be inactive + * so that consumers can _look_ into it (finding nothing...) + */ + ret = mkdir_p (priv->changelog_dir, 0600, _gf_true); + if (ret) + goto out; + + GF_OPTION_INIT ("changelog", priv->active, bool, out); + + GF_OPTION_INIT ("op-mode", tmp, str, out); + changelog_assign_opmode (priv, tmp); + + tmp = NULL; + + GF_OPTION_INIT ("encoding", tmp, str, out); + changelog_assign_encoding (priv, tmp); + + GF_OPTION_INIT ("rollover-time", priv->rollover_time, int32, out); + + GF_OPTION_INIT ("fsync-interval", priv->fsync_interval, int32, out); + + changelog_encode_change(priv); + + GF_ASSERT (cb_bootstrap[priv->op_mode].mode == priv->op_mode); + priv->cb = &cb_bootstrap[priv->op_mode]; + + /* ... now bootstrap the logger */ + ret = priv->cb->ctor (this, &priv->cd); + if (ret) + goto out; + + priv->changelog_fd = -1; + ret = changelog_init (this, priv); + if (ret) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "changelog translator loaded"); + + out: + if (ret) { + if (this->local_pool) + mem_pool_destroy (this->local_pool); + if (priv->cb) { + ret = priv->cb->dtor (this, &priv->cd); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "error in cleanup during init()"); + } + GF_FREE (priv->changelog_brick); + GF_FREE (priv->changelog_dir); + GF_FREE (priv); + this->private = NULL; + } else + this->private = priv; + + return ret; +} + +void +fini (xlator_t *this) +{ + int ret = -1; + changelog_priv_t *priv = NULL; + + priv = this->private; + + if (priv) { + ret = priv->cb->dtor (this, &priv->cd); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "error in fini"); + mem_pool_destroy (this->local_pool); + GF_FREE (priv->changelog_brick); + GF_FREE (priv->changelog_dir); + GF_FREE (priv); + } + + this->private = NULL; + + return; +} + +struct xlator_fops fops = { + .mknod = changelog_mknod, + .mkdir = changelog_mkdir, + .create = changelog_create, + .symlink = changelog_symlink, + .writev = changelog_writev, + .truncate = changelog_truncate, + .ftruncate = changelog_ftruncate, + .link = changelog_link, + .rename = changelog_rename, + .unlink = changelog_unlink, + .rmdir = changelog_rmdir, + .setattr = changelog_setattr, + .fsetattr = changelog_fsetattr, + .setxattr = changelog_setxattr, + .fsetxattr = changelog_fsetxattr, + .removexattr = changelog_removexattr, + .fremovexattr = changelog_fremovexattr, +}; + +struct xlator_cbks cbks = { + .forget = changelog_forget, +}; + +struct volume_options options[] = { + {.key = {"changelog"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "enable/disable change-logging" + }, + {.key = {"changelog-brick"}, + .type = GF_OPTION_TYPE_PATH, + .description = "brick path to generate unique socket file name." + " should be the export directory of the volume strictly." + }, + {.key = {"changelog-dir"}, + .type = GF_OPTION_TYPE_PATH, + .description = "directory for the changelog files" + }, + {.key = {"op-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "realtime", + .value = {"realtime"}, + .description = "operation mode - futuristic operation modes" + }, + {.key = {"encoding"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "ascii", + .value = {"binary", "ascii"}, + .description = "encoding type for changelogs" + }, + {.key = {"rollover-time"}, + .default_value = "60", + .type = GF_OPTION_TYPE_TIME, + .description = "time to switch to a new changelog file (in seconds)" + }, + {.key = {"fsync-interval"}, + .type = GF_OPTION_TYPE_TIME, + .default_value = "0", + .description = "do not open CHANGELOG file with O_SYNC mode." + " instead perform fsync() at specified intervals" + }, + {.key = {NULL} + }, +}; diff --git a/xlators/features/compress/Makefile.am b/xlators/features/compress/Makefile.am new file mode 100644 index 000000000..a985f42a8 --- /dev/null +++ b/xlators/features/compress/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/compress/src/Makefile.am b/xlators/features/compress/src/Makefile.am new file mode 100644 index 000000000..4a64b52a9 --- /dev/null +++ b/xlators/features/compress/src/Makefile.am @@ -0,0 +1,17 @@ +xlator_LTLIBRARIES = cdc.la + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +noinst_HEADERS = cdc.h cdc-mem-types.h + +cdc_la_LDFLAGS = -module -avoidversion $(LIBZ_LIBS) + +cdc_la_SOURCES = cdc.c cdc-helper.c +cdc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \ +-shared -nostartfiles $(LIBZ_CFLAGS) + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/compress/src/cdc-helper.c b/xlators/features/compress/src/cdc-helper.c new file mode 100644 index 000000000..54432ff45 --- /dev/null +++ b/xlators/features/compress/src/cdc-helper.c @@ -0,0 +1,547 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" + +#include "cdc.h" +#include "cdc-mem-types.h" + +#ifdef HAVE_LIB_Z +#include "zlib.h" +#endif + +#ifdef HAVE_LIB_Z +/* gzip header looks something like this + * (RFC 1950) + * + * +---+---+---+---+---+---+---+---+---+---+ + * |ID1|ID2|CM |FLG| MTIME |XFL|OS | + * +---+---+---+---+---+---+---+---+---+---+ + * + * Data is usually sent without this header i.e + * Data sent = <compressed-data> + trailer(8) + * The trailer contains the checksum. + * + * gzip_header is added only during debugging. + * Refer to the function cdc_dump_iovec_to_disk + */ +static const char gzip_header[10] = + { + '\037', '\213', Z_DEFLATED, 0, + 0, 0, 0, 0, + 0, GF_CDC_OS_ID + }; + +static int32_t +cdc_next_iovec (xlator_t *this, cdc_info_t *ci) +{ + int ret = -1; + + ci->ncount++; + /* check for iovec overflow -- should not happen */ + if (ci->ncount == MAX_IOVEC) { + gf_log (this->name, GF_LOG_ERROR, + "Zlib output buffer overflow" + " ->ncount (%d) | ->MAX_IOVEC (%d)", + ci->ncount, MAX_IOVEC); + goto out; + } + + ret = 0; + + out: + return ret; +} + +static void +cdc_put_long (unsigned char *string, unsigned long x) +{ + string[0] = (unsigned char) (x & 0xff); + string[1] = (unsigned char) ((x & 0xff00) >> 8); + string[2] = (unsigned char) ((x & 0xff0000) >> 16); + string[3] = (unsigned char) ((x & 0xff000000) >> 24); +} + +static unsigned long +cdc_get_long (unsigned char *buf) +{ + return ((unsigned long) buf[0]) + | (((unsigned long) buf[1]) << 8) + | (((unsigned long) buf[2]) << 16) + | (((unsigned long) buf[3]) << 24); +} + +static int32_t +cdc_init_gzip_trailer (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci) +{ + int ret = -1; + char *buf = NULL; + + ret = cdc_next_iovec (this, ci); + if (ret) + goto out; + + buf = CURR_VEC(ci).iov_base = + (char *) GF_CALLOC (1, GF_CDC_VALIDATION_SIZE, + gf_cdc_mt_gzip_trailer_t); + + if (!CURR_VEC(ci).iov_base) + goto out; + + CURR_VEC(ci).iov_len = GF_CDC_VALIDATION_SIZE; + + cdc_put_long ((unsigned char *)&buf[0], ci->crc); + cdc_put_long ((unsigned char *)&buf[4], ci->stream.total_in); + + ret = 0; + + out: + return ret; +} + +static int32_t +cdc_alloc_iobuf_and_init_vec (xlator_t *this, + cdc_priv_t *priv, cdc_info_t *ci, + int size) +{ + int ret = -1; + int alloc_len = 0; + struct iobuf *iobuf = NULL; + + ret = cdc_next_iovec (this, ci); + if (ret) + goto out; + + alloc_len = size ? size : ci->buffer_size; + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, alloc_len); + if (!iobuf) + goto out; + + ret = iobref_add (ci->iobref, iobuf); + if (ret) + goto out; + + /* Initialize this iovec */ + CURR_VEC(ci).iov_base = iobuf->ptr; + CURR_VEC(ci).iov_len = alloc_len; + + ret = 0; + + out: + return ret; +} + +static void +cdc_init_zlib_output_stream (cdc_priv_t *priv, cdc_info_t *ci, int size) +{ + ci->stream.next_out = (unsigned char *) CURR_VEC(ci).iov_base; + ci->stream.avail_out = size ? size : ci->buffer_size; +} + +/* This routine is for testing and debugging only. + * Data written = header(10) + <compressed-data> + trailer(8) + * So each gzip dump file is at least 18 bytes in size. + */ +void +cdc_dump_iovec_to_disk (xlator_t *this, cdc_info_t *ci, const char *file) +{ + int i = 0; + int fd = 0; + size_t writen = 0; + size_t total_writen = 0; + + fd = open (file, O_WRONLY|O_CREAT|O_TRUNC, 0777 ); + if (fd < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Cannot open file: %s", file); + return; + } + + writen = write (fd, (char *) gzip_header, 10); + total_writen += writen; + for (i = 0; i < ci->ncount; i++) { + writen = write (fd, (char *) ci->vec[i].iov_base, ci->vec[i].iov_len); + total_writen += writen; + } + + gf_log (this->name, GF_LOG_DEBUG, + "dump'd %zu bytes to %s", total_writen, GF_CDC_DEBUG_DUMP_FILE ); + + close (fd); +} + +static int32_t +cdc_flush_libz_buffer (cdc_priv_t *priv, xlator_t *this, cdc_info_t *ci, + int (*libz_func)(z_streamp, int), + int flush) +{ + int32_t ret = Z_OK; + int done = 0; + unsigned int deflate_len = 0; + + for (;;) { + deflate_len = ci->buffer_size - ci->stream.avail_out; + + if (deflate_len != 0) { + CURR_VEC(ci).iov_len = deflate_len; + + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) { + ret = Z_MEM_ERROR; + break; + } + + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + } + + if (done) { + ci->ncount--; + break; + } + + ret = libz_func (&ci->stream, flush); + + if (ret == Z_BUF_ERROR) { + ret = Z_OK; + ci->ncount--; + break; + } + + done = (ci->stream.avail_out != 0 || ret == Z_STREAM_END); + + if (ret != Z_OK && ret != Z_STREAM_END) + break; + } + + return ret; +} + +static int32_t +do_cdc_compress (struct iovec *vec, xlator_t *this, cdc_priv_t *priv, + cdc_info_t *ci) +{ + int ret = -1; + + /* Initialize defalte */ + ret = deflateInit2 (&ci->stream, priv->cdc_level, Z_DEFLATED, + priv->window_size, priv->mem_level, + Z_DEFAULT_STRATEGY); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "unable to init Zlib (retval: %d)", ret); + goto out; + } + + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) + goto out; + + /* setup output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + + /* setup input buffer */ + ci->stream.next_in = (unsigned char *) vec->iov_base; + ci->stream.avail_in = vec->iov_len; + + ci->crc = crc32 (ci->crc, (const Bytef *) vec->iov_base, vec->iov_len); + + gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%d buffer_size=%d", + ci->crc, ci->stream.avail_in, ci->buffer_size); + + /* compress !! */ + while (ci->stream.avail_in != 0) { + if (ci->stream.avail_out == 0) { + + CURR_VEC(ci).iov_len = ci->buffer_size; + + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) + break; + + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + } + + ret = deflate (&ci->stream, Z_NO_FLUSH); + if (ret != Z_OK) + break; + } + + out: + return ret; +} + +int32_t +cdc_compress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, + dict_t **xdata) +{ + int ret = -1; + int i = 0; + + ci->iobref = iobref_new (); + if (!ci->iobref) + goto out; + + if (!*xdata) { + *xdata = dict_new (); + if (!*xdata) { + gf_log (this->name, GF_LOG_ERROR, "Cannot allocate xdata" + " dict"); + goto out; + } + } + + /* data */ + for (i = 0; i < ci->count; i++) { + ret = do_cdc_compress (&ci->vector[i], this, priv, ci); + if (ret != Z_OK) + goto deflate_cleanup_out; + } + + /* flush zlib buffer */ + ret = cdc_flush_libz_buffer (priv, this, ci, deflate, Z_FINISH); + if (!(ret == Z_OK || ret == Z_STREAM_END)) { + gf_log (this->name, GF_LOG_ERROR, + "Compression Error: ret (%d)", ret); + ret = -1; + goto deflate_cleanup_out; + } + + /* trailer */ + ret = cdc_init_gzip_trailer (this, priv, ci); + if (ret) + goto deflate_cleanup_out; + + gf_log (this->name, GF_LOG_DEBUG, + "Compressed %ld to %ld bytes", + ci->stream.total_in, ci->stream.total_out); + + ci->nbytes = ci->stream.total_out + GF_CDC_VALIDATION_SIZE; + + /* set deflated canary value for identification */ + ret = dict_set_int32 (*xdata, GF_CDC_DEFLATE_CANARY_VAL, 1); + if (ret) { + /* Send uncompressed data if we can't _tell_ the client + * that deflated data is on it's way. So, we just log + * the faliure and continue as usual. + */ + gf_log (this->name, GF_LOG_ERROR, + "Data deflated, but could not set canary" + " value in dict for identification"); + } + + /* This is to be used in testing */ + if ( priv->debug ) { + cdc_dump_iovec_to_disk (this, ci, GF_CDC_DEBUG_DUMP_FILE ); + } + + deflate_cleanup_out: + (void) deflateEnd(&ci->stream); + + out: + return ret; +} + + +/* deflate content is checked by the presence of a canary + * value in the dict as the key + */ +static int32_t +cdc_check_content_for_deflate (dict_t *xdata) +{ + return dict_get (xdata, GF_CDC_DEFLATE_CANARY_VAL) ? -1 : 0; +} + +static unsigned long +cdc_extract_crc (char *trailer) +{ + return cdc_get_long ((unsigned char *) &trailer[0]); +} + +static unsigned long +cdc_extract_size (char *trailer) +{ + return cdc_get_long ((unsigned char *) &trailer[4]); +} + +static int32_t +cdc_validate_inflate (cdc_info_t *ci, unsigned long crc, + unsigned long len) +{ + return !((crc == ci->crc) + /* inflated length is hidden inside + * Zlib stream struct */ + && (len == ci->stream.total_out)); +} + +static int32_t +do_cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci) +{ + int ret = -1; + int i = 0; + int len = 0; + char *inflte = NULL; + char *trailer = NULL; + struct iovec vec = {0,}; + unsigned long computed_crc = 0; + unsigned long computed_len = 0; + + ret = inflateInit2 (&ci->stream, priv->window_size); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Zlib: Unable to initialize inflate"); + goto out; + } + + vec = THIS_VEC(ci, 0); + + trailer = (char *) (((char *) vec.iov_base) + vec.iov_len + - GF_CDC_VALIDATION_SIZE); + + /* CRC of uncompressed data */ + computed_crc = cdc_extract_crc (trailer); + + /* size of uncomrpessed data */ + computed_len = cdc_extract_size (trailer); + + gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%lu buffer_size=%d", + computed_crc, computed_len, ci->buffer_size); + + inflte = vec.iov_base ; + len = vec.iov_len - GF_CDC_VALIDATION_SIZE; + + /* allocate buffer of the original length of the data */ + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) + goto out; + + /* setup output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + + /* setup input buffer */ + ci->stream.next_in = (unsigned char *) inflte; + ci->stream.avail_in = len; + + while (ci->stream.avail_in != 0) { + if (ci->stream.avail_out == 0) { + CURR_VEC(ci).iov_len = ci->buffer_size; + + ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0); + if (ret) + break; + + /* Re-position Zlib output buffer */ + cdc_init_zlib_output_stream (priv, ci, 0); + } + + ret = inflate (&ci->stream, Z_NO_FLUSH); + if (ret == Z_STREAM_ERROR) + break; + } + + /* flush zlib buffer */ + ret = cdc_flush_libz_buffer (priv, this, ci, inflate, Z_SYNC_FLUSH); + if (!(ret == Z_OK || ret == Z_STREAM_END)) { + gf_log (this->name, GF_LOG_ERROR, + "Decompression Error: ret (%d)", ret); + ret = -1; + goto out; + } + + /* compute CRC of the uncompresses data to check for + * correctness */ + + for (i = 0; i < ci->ncount; i++) { + ci->crc = crc32 (ci->crc, + (const Bytef *) ci->vec[i].iov_base, + ci->vec[i].iov_len); + } + + /* validate inflated data */ + ret = cdc_validate_inflate (ci, computed_crc, computed_len); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Checksum or length mismatched in inflated data"); + } + + out: + return ret; +} + +int32_t +cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci, + dict_t *xdata) +{ + int32_t ret = -1; + + /* check for deflate content */ + if (!cdc_check_content_for_deflate (xdata)) { + gf_log (this->name, GF_LOG_DEBUG, + "Content not deflated, passing through ..."); + goto passthrough_out; + } + + ci->iobref = iobref_new (); + if (!ci->iobref) + goto passthrough_out; + + /* do we need to do this? can we assume that one iovec + * will hold per request data everytime? + * + * server/client protocol seems to deal with a single + * iovec even if op_ret > 1M. So, it looks ok to + * assume that a single iovec will contain all the + * data (This saves us a lot from finding the trailer + * and the data since it could have been split-up onto + * two adjacent iovec's. + * + * But, in case this translator is loaded above quick-read + * for some reason, then it's entirely possible that we get + * multiple iovec's... + * + * This case (handled below) is not tested. (by loading the + * xlator below quick-read) + */ + + /* @@ I_HOPE_THIS_IS_NEVER_HIT */ + if (ci->count > 1) { + gf_log (this->name, GF_LOG_WARNING, "unable to handle" + " multiple iovecs (%d in number)", ci->count); + goto inflate_cleanup_out; + /* TODO: coallate all iovecs in one */ + } + + ret = do_cdc_decompress (this, priv, ci); + if (ret) + goto inflate_cleanup_out; + + ci->nbytes = ci->stream.total_out; + + gf_log (this->name, GF_LOG_DEBUG, + "Inflated %ld to %ld bytes", + ci->stream.total_in, ci->stream.total_out); + + inflate_cleanup_out: + (void) inflateEnd (&ci->stream); + + passthrough_out: + return ret; +} + +#endif diff --git a/xlators/features/compress/src/cdc-mem-types.h b/xlators/features/compress/src/cdc-mem-types.h new file mode 100644 index 000000000..efa008059 --- /dev/null +++ b/xlators/features/compress/src/cdc-mem-types.h @@ -0,0 +1,22 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CDC_MEM_TYPES_H +#define __CDC_MEM_TYPES_H + +#include "mem-types.h" + +enum gf_cdc_mem_types { + gf_cdc_mt_priv_t = gf_common_mt_end + 1, + gf_cdc_mt_vec_t = gf_common_mt_end + 2, + gf_cdc_mt_gzip_trailer_t = gf_common_mt_end + 3, +}; + +#endif diff --git a/xlators/features/compress/src/cdc.c b/xlators/features/compress/src/cdc.c new file mode 100644 index 000000000..eb7d87c56 --- /dev/null +++ b/xlators/features/compress/src/cdc.c @@ -0,0 +1,342 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include <sys/uio.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "defaults.h" +#include "logging.h" + +#include "cdc.h" +#include "cdc-mem-types.h" + +static void +cdc_cleanup_iobref (cdc_info_t *ci) +{ + assert(ci->iobref != NULL); + iobref_clear (ci->iobref); +} + +int32_t +cdc_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, + struct iatt *stbuf, struct iobref *iobref, + dict_t *xdata) +{ + int ret = -1; + cdc_priv_t *priv = NULL; + cdc_info_t ci = {0,}; + + GF_VALIDATE_OR_GOTO ("cdc", this, default_out); + GF_VALIDATE_OR_GOTO (this->name, frame, default_out); + + priv = this->private; + + if (op_ret <= 0) + goto default_out; + + if ( (priv->min_file_size != 0) + && (op_ret < priv->min_file_size) ) + goto default_out; + + ci.count = count; + ci.ibytes = op_ret; + ci.vector = vector; + ci.buf = NULL; + ci.iobref = NULL; + ci.ncount = 0; + ci.crc = 0; + ci.buffer_size = GF_CDC_DEF_BUFFERSIZE; + +/* A readv compresses on the server side and decompresses on the client side + */ + if (priv->op_mode == GF_CDC_MODE_SERVER) { + ret = cdc_compress (this, priv, &ci, &xdata); + } else if (priv->op_mode == GF_CDC_MODE_CLIENT) { + ret = cdc_decompress (this, priv, &ci, xdata); + } else { + gf_log (this->name, GF_LOG_ERROR, + "Invalid operation mode (%d)", priv->op_mode); + } + + if (ret) + goto default_out; + + STACK_UNWIND_STRICT (readv, frame, ci.nbytes, op_errno, + ci.vec, ci.ncount, stbuf, iobref, + xdata); + cdc_cleanup_iobref (&ci); + return 0; + + default_out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, + vector, count, stbuf, iobref, xdata); + return 0; +} + +int32_t +cdc_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, uint32_t flags, + dict_t *xdata) +{ + fop_readv_cbk_t cbk = NULL; + +#ifdef HAVE_LIB_Z + cbk = cdc_readv_cbk; +#else + cbk = default_readv_cbk; +#endif + STACK_WIND (frame, cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; +} + +int32_t +cdc_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata); + return 0; +} + +int32_t +cdc_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset, + uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int ret = -1; + cdc_priv_t *priv = NULL; + cdc_info_t ci = {0,}; + size_t isize = 0; + + GF_VALIDATE_OR_GOTO ("cdc", this, default_out); + GF_VALIDATE_OR_GOTO (this->name, frame, default_out); + + priv = this->private; + + isize = iov_length(vector, count); + + if (isize <= 0) + goto default_out; + + if ( (priv->min_file_size != 0) + && (isize < priv->min_file_size) ) + goto default_out; + + ci.count = count; + ci.ibytes = isize; + ci.vector = vector; + ci.buf = NULL; + ci.iobref = NULL; + ci.ncount = 0; + ci.crc = 0; + ci.buffer_size = GF_CDC_DEF_BUFFERSIZE; + +/* A writev compresses on the client side and decompresses on the server side + */ + if (priv->op_mode == GF_CDC_MODE_CLIENT) { + ret = cdc_compress (this, priv, &ci, &xdata); + } else if (priv->op_mode == GF_CDC_MODE_SERVER) { + ret = cdc_decompress (this, priv, &ci, xdata); + } else { + gf_log (this->name, GF_LOG_ERROR, "Invalid operation mode (%d) ", priv->op_mode); + } + + if (ret) + goto default_out; + + STACK_WIND (frame, + cdc_writev_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->writev, + fd, ci.vec, ci.ncount, offset, flags, + iobref, xdata); + + cdc_cleanup_iobref (&ci); + return 0; + + default_out: + STACK_WIND (frame, + cdc_writev_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->writev, + fd, vector, count, offset, flags, + iobref, xdata); + return 0; +} + +int32_t +init (xlator_t *this) +{ + int ret = -1; + char *temp_str = NULL; + cdc_priv_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("cdc", this, err); + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "Need subvolume == 1"); + goto err; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Dangling volume. Check volfile"); + } + + priv = GF_CALLOC (1, sizeof (*priv), gf_cdc_mt_priv_t); + if (!priv) { + goto err; + } + + /* Check if debug mode is turned on */ + GF_OPTION_INIT ("debug", priv->debug, bool, err); + if( priv->debug ) { + gf_log (this->name, GF_LOG_DEBUG, "CDC debug option turned on"); + } + + /* Set Gzip Window Size */ + GF_OPTION_INIT ("window-size", priv->window_size, int32, err); + if ( (priv->window_size > GF_CDC_MAX_WINDOWSIZE) + || (priv->window_size < GF_CDC_DEF_WINDOWSIZE) ) { + gf_log (this->name, GF_LOG_WARNING, + "Invalid gzip window size (%d), using default", + priv->window_size); + priv->window_size = GF_CDC_DEF_WINDOWSIZE; + } + + /* Set Gzip (De)Compression Level */ + GF_OPTION_INIT ("compression-level", priv->cdc_level, int32, err); + if ( ((priv->cdc_level < 1) || (priv->cdc_level > 9)) + && (priv->cdc_level != GF_CDC_DEF_COMPRESSION) ) { + gf_log (this->name, GF_LOG_WARNING, + "Invalid gzip (de)compression level (%d)," + " using default", priv->cdc_level); + priv->cdc_level = GF_CDC_DEF_COMPRESSION; + } + + /* Set Gzip Memory Level */ + GF_OPTION_INIT ("mem-level", priv->mem_level, int32, err); + if ( (priv->mem_level < 1) || (priv->mem_level > 9) ) { + gf_log (this->name, GF_LOG_WARNING, + "Invalid gzip memory level, using the default"); + priv->mem_level = GF_CDC_DEF_MEMLEVEL; + } + + /* Set min file size to enable compression */ + GF_OPTION_INIT ("min-size", priv->min_file_size, int32, err); + + /* Mode of operation - Server/Client */ + ret = dict_get_str (this->options, "mode", &temp_str); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, + "Operation mode not specified !!"); + goto err; + } + + if (GF_CDC_MODE_IS_CLIENT (temp_str)) { + priv->op_mode = GF_CDC_MODE_CLIENT; + } else if (GF_CDC_MODE_IS_SERVER (temp_str)) { + priv->op_mode = GF_CDC_MODE_SERVER; + } else { + gf_log (this->name, GF_LOG_CRITICAL, + "Bogus operation mode (%s) specified", temp_str); + goto err; + } + + this->private = priv; + gf_log (this->name, GF_LOG_DEBUG, "CDC xlator loaded in (%s) mode",temp_str); + return 0; + + err: + if (priv) + GF_FREE (priv); + + return -1; +} + +void +fini (xlator_t *this) +{ + cdc_priv_t *priv = this->private; + + if (priv) + GF_FREE (priv); + this->private = NULL; + return; +} + +struct xlator_fops fops = { + .readv = cdc_readv, + .writev = cdc_writev, +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"window-size"}, + .default_value = "-15", + .type = GF_OPTION_TYPE_INT, + .description = "Size of the zlib history buffer." + }, + { .key = {"mem-level"}, + .default_value = "8", + .type = GF_OPTION_TYPE_INT, + .description = "Memory allocated for internal compression state.\ + 1 uses minimum memory but is slow and reduces \ + compression ratio; memLevel=9 uses maximum memory \ + for optimal speed. The default value is 8." + }, + { .key = {"compression-level"}, + .default_value = "-1", + .type = GF_OPTION_TYPE_INT, + .description = "Compression levels \ + 0 : no compression, 1 : best speed, \ + 9 : best compression, -1 : default compression " + }, + { .key = {"min-size"}, + .default_value = "0", + .type = GF_OPTION_TYPE_INT, + .description = "Data is compressed only when its size exceeds this." + }, + { .key = {"mode"}, + .value = {"server", "client"}, + .type = GF_OPTION_TYPE_STR, + .description = "Set on the basis of where the xlator is loaded." + }, + { .key = {"debug"}, + .default_value = "false", + .type = GF_OPTION_TYPE_BOOL, + .description = "This is used in testing. Will dump compressed data \ + to disk as a gzip file." + }, + { .key = {NULL} + }, +}; diff --git a/xlators/features/compress/src/cdc.h b/xlators/features/compress/src/cdc.h new file mode 100644 index 000000000..71f4d2317 --- /dev/null +++ b/xlators/features/compress/src/cdc.h @@ -0,0 +1,107 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __CDC_H +#define __CDC_H + +#ifdef HAVE_LIB_Z +#include "zlib.h" +#endif + +#include "xlator.h" + +#ifndef MAX_IOVEC +#define MAX_IOVEC 16 +#endif + +typedef struct cdc_priv { + int window_size; + int mem_level; + int cdc_level; + int min_file_size; + int op_mode; + gf_boolean_t debug; + gf_lock_t lock; +} cdc_priv_t; + +typedef struct cdc_info { + /* input bits */ + int count; + int32_t ibytes; + struct iovec *vector; + struct iatt *buf; + + /* output bits */ + int ncount; + int nbytes; + int buffer_size; + struct iovec vec[MAX_IOVEC]; + struct iobref *iobref; + + /* zlib bits */ +#ifdef HAVE_LIB_Z + z_stream stream; +#endif + unsigned long crc; +} cdc_info_t; + +#define NVEC(ci) (ci->ncount - 1) +#define CURR_VEC(ci) ci->vec[ci->ncount - 1] +#define THIS_VEC(ci, i) ci->vector[i] + +/* Gzip defaults */ +#define GF_CDC_DEF_WINDOWSIZE -15 /* default value */ +#define GF_CDC_MAX_WINDOWSIZE -8 /* max value */ + +#ifdef HAVE_LIB_Z +#define GF_CDC_DEF_COMPRESSION Z_DEFAULT_COMPRESSION +#else +#define GF_CDC_DEF_COMPRESSION -1 +#endif + +#define GF_CDC_DEF_MEMLEVEL 8 +#define GF_CDC_DEF_BUFFERSIZE 262144 // 256K - default compression buffer size + +/* Operation mode + * If xlator is loaded on client, readv decompresses and writev compresses + * If xlator is loaded on server, readv compresses and writev decompresses + */ +#define GF_CDC_MODE_CLIENT 0 +#define GF_CDC_MODE_SERVER 1 + +/* min size of data to do cmpression + * 0 == compress even 1byte + */ +#define GF_CDC_MIN_CHUNK_SIZE 0 + +#define GF_CDC_VALIDATION_SIZE 8 + +#define GF_CDC_OS_ID 0xFF +#define GF_CDC_DEFLATE_CANARY_VAL "deflate" +#define GF_CDC_DEBUG_DUMP_FILE "/tmp/cdcdump.gz" + +#define GF_CDC_MODE_IS_CLIENT(m) \ + (strcmp (m, "client") == 0) + +#define GF_CDC_MODE_IS_SERVER(m) \ + (strcmp (m, "server") == 0) + +int32_t +cdc_compress (xlator_t *this, + cdc_priv_t *priv, + cdc_info_t *ci, + dict_t **xdata); +int32_t +cdc_decompress (xlator_t *this, + cdc_priv_t *priv, + cdc_info_t *ci, + dict_t *xdata); + +#endif diff --git a/xlators/features/gfid-access/Makefile.am b/xlators/features/gfid-access/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/xlators/features/gfid-access/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/features/gfid-access/src/Makefile.am b/xlators/features/gfid-access/src/Makefile.am new file mode 100644 index 000000000..db53affaa --- /dev/null +++ b/xlators/features/gfid-access/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = gfid-access.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +gfid_access_la_LDFLAGS = -module -avoid-version + +gfid_access_la_SOURCES = gfid-access.c +gfid_access_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = gfid-access.h gfid-access-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/features/gfid-access/src/gfid-access-mem-types.h b/xlators/features/gfid-access/src/gfid-access-mem-types.h new file mode 100644 index 000000000..168d67b43 --- /dev/null +++ b/xlators/features/gfid-access/src/gfid-access-mem-types.h @@ -0,0 +1,23 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _GFID_ACCESS_MEM_TYPES_H +#define _GFID_ACCESS_MEM_TYPES_H + +#include "mem-types.h" + +enum gf_changelog_mem_types { + gf_gfid_access_mt_priv_t = gf_common_mt_end + 1, + gf_gfid_access_mt_gfid_t, + gf_gfid_access_mt_end +}; + +#endif + diff --git a/xlators/features/gfid-access/src/gfid-access.c b/xlators/features/gfid-access/src/gfid-access.c new file mode 100644 index 000000000..da0ba7e50 --- /dev/null +++ b/xlators/features/gfid-access/src/gfid-access.c @@ -0,0 +1,1172 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "gfid-access.h" +#include "inode.h" +#include "byte-order.h" + + + +void +ga_newfile_args_free (ga_newfile_args_t *args) +{ + if (!args) + goto out; + + GF_FREE (args->bname); + + if (S_ISLNK (args->st_mode) && args->args.symlink.linkpath) { + GF_FREE (args->args.symlink.linkpath); + args->args.symlink.linkpath = NULL; + } + + mem_put (args); +out: + return; +} + + +void +ga_heal_args_free (ga_heal_args_t *args) +{ + if (!args) + goto out; + + GF_FREE (args->bname); + + mem_put (args); +out: + return; +} + + +ga_newfile_args_t * +ga_newfile_parse_args (xlator_t *this, data_t *data) +{ + ga_newfile_args_t *args = NULL; + ga_private_t *priv = NULL; + int len = 0; + int blob_len = 0; + int min_len = 0; + void *blob = NULL; + + priv = this->private; + + blob = data->data; + blob_len = data->len; + + min_len = sizeof (args->uid) + sizeof (args->gid) + sizeof (args->gfid) + + sizeof (args->st_mode) + 2 + 2; + if (blob_len < min_len) { + gf_log (this->name, GF_LOG_ERROR, + "Invalid length: Total length is less " + "than minimum length."); + goto err; + } + + args = mem_get0 (priv->newfile_args_pool); + if (args == NULL) + goto err; + + args->uid = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + args->gid = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + memcpy (args->gfid, blob, sizeof (args->gfid)); + blob += sizeof (args->gfid); + blob_len -= sizeof (args->gfid); + + args->st_mode = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + len = strnlen (blob, blob_len); + if (len == blob_len) + if (len == blob_len) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. No null byte present.", + args->gfid); + goto err; + } + + args->bname = GF_CALLOC (1, (len + 1), gf_common_mt_char); + if (args->bname == NULL) + goto err; + + memcpy (args->bname, blob, (len + 1)); + blob += (len + 1); + blob_len -= (len + 1); + + if (S_ISDIR (args->st_mode)) { + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mkdir.mode = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mkdir.umask = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + if (blob_len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + } else if (S_ISLNK (args->st_mode)) { + len = strnlen (blob, blob_len); + if (len == blob_len) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.symlink.linkpath = GF_CALLOC (1, len + 1, + gf_common_mt_char); + if (args->args.symlink.linkpath == NULL) + goto err; + + memcpy (args->args.symlink.linkpath, blob, (len + 1)); + blob += (len + 1); + blob_len -= (len + 1); + } else { + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mknod.mode = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mknod.rdev = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + + if (blob_len < sizeof (uint32_t)) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + args->args.mknod.umask = ntoh32 (*(uint32_t *)blob); + blob += sizeof (uint32_t); + blob_len -= sizeof (uint32_t); + } + + if (blob_len) { + gf_log (this->name, GF_LOG_ERROR, + "gfid: %s. Invalid length", + args->gfid); + goto err; + } + + return args; + +err: + if (args) + ga_newfile_args_free (args); + + return NULL; +} + +ga_heal_args_t * +ga_heal_parse_args (xlator_t *this, data_t *data) +{ + ga_heal_args_t *args = NULL; + ga_private_t *priv = NULL; + void *blob = NULL; + int len = 0; + int blob_len = 0; + + blob = data->data; + blob_len = data->len; + + priv = this->private; + + /* bname should at least contain a character */ + if (blob_len < (sizeof (args->gfid) + 2)) + goto err; + + args = mem_get0 (priv->heal_args_pool); + if (!args) + goto err; + + memcpy (args->gfid, blob, sizeof (args->gfid)); + blob += sizeof (args->gfid); + blob_len -= sizeof (args->gfid); + + len = strnlen (blob, blob_len); + if (len == blob_len) + goto err; + + args->bname = GF_CALLOC (1, len + 1, gf_common_mt_char); + if (!args->bname) + goto err; + + memcpy (args->bname, blob, len); + blob_len -= (len + 1); + + if (blob_len) + goto err; + + return args; + +err: + if (args) + ga_heal_args_free (args); + + return NULL; +} + +static int32_t +ga_fill_tmp_loc (loc_t *loc, xlator_t *this, char *gfid, + char *bname, dict_t *xdata, loc_t *new_loc) +{ + int ret = -1; + uint64_t value = 0; + inode_t *parent = NULL; + + parent = loc->inode; + ret = inode_ctx_get (loc->inode, this, &value); + if (!ret) { + parent = (void *)value; + } + + /* parent itself should be looked up */ + uuid_copy (new_loc->pargfid, parent->gfid); + new_loc->parent = inode_ref (parent); + + new_loc->inode = inode_grep (parent->table, parent, bname); + if (!new_loc->inode) + new_loc->inode = inode_new (parent->table); + + loc_path (new_loc, bname); + new_loc->name = basename (new_loc->path); + + /* As GFID would not be set on the entry yet, lets not send entry + gfid in the request */ + /*uuid_copy (new_loc->gfid, (const unsigned char *)gfid); */ + + ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16); + if (ret < 0) + goto out; + + ret = 0; + +out: + return ret; +} + + + +static gf_boolean_t +__is_gfid_access_dir (uuid_t gfid) +{ + uuid_t aux_gfid; + + memset (aux_gfid, 0, 16); + aux_gfid[15] = GF_AUX_GFID; + + if (uuid_compare (gfid, aux_gfid) == 0) + return _gf_true; + + return _gf_false; +} + +int32_t +ga_forget (xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t value = 0; + inode_t *tmp_inode = NULL; + + ret = inode_ctx_del (inode, this, &value); + if (ret) + goto out; + + tmp_inode = (void *)value; + inode_unref (tmp_inode); + +out: + return 0; +} + + +static int +ga_heal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stat, dict_t *dict, + struct iatt *postparent) +{ + call_frame_t *orig_frame = NULL; + + orig_frame = frame->local; + frame->local = NULL; + + /* don't worry about inode linking and other stuff. They'll happen on + * the next lookup. + */ + STACK_DESTROY (frame->root); + + STACK_UNWIND_STRICT (setxattr, orig_frame, op_ret, op_errno, dict); + + return 0; +} + +static int +ga_newentry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) +{ + call_frame_t *orig_frame = NULL; + + orig_frame = frame->local; + frame->local = NULL; + + /* don't worry about inode linking and other stuff. They'll happen on + * the next lookup. + */ + STACK_DESTROY (frame->root); + + STACK_UNWIND_STRICT (setxattr, orig_frame, op_ret, op_errno, xdata); + + return 0; +} + +int32_t +ga_new_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data, + dict_t *xdata) +{ + int ret = -1; + ga_newfile_args_t *args = NULL; + loc_t tmp_loc = {0,}; + call_frame_t *new_frame = NULL; + mode_t mode = 0; + + args = ga_newfile_parse_args (this, data); + if (!args) + goto out; + + if (!xdata) + xdata = dict_new (); + + ret = ga_fill_tmp_loc (loc, this, args->gfid, + args->bname, xdata, &tmp_loc); + if (ret) + goto out; + + new_frame = copy_frame (frame); + if (!new_frame) + goto out; + new_frame->local = (void *)frame; + + new_frame->root->uid = args->uid; + new_frame->root->gid = args->gid; + + if (S_ISDIR (args->st_mode)) { + STACK_WIND (new_frame, ga_newentry_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, + &tmp_loc, args->args.mkdir.mode, + args->args.mkdir.umask, xdata); + } else if (S_ISLNK (args->st_mode)) { + STACK_WIND (new_frame, ga_newentry_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, + args->args.symlink.linkpath, + &tmp_loc, 0, xdata); + } else { + /* use 07777 (4 7s) for considering the Sticky bits etc) */ + mode = (S_IFMT & args->st_mode) | + (07777 | args->args.mknod.mode);; + + STACK_WIND (new_frame, ga_newentry_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, + &tmp_loc, mode, + args->args.mknod.rdev, args->args.mknod.umask, + xdata); + } + + ret = 0; +out: + ga_newfile_args_free (args); + + return ret; +} + +int32_t +ga_heal_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data, + dict_t *xdata) +{ + int ret = -1; + ga_heal_args_t *args = NULL; + loc_t tmp_loc = {0,}; + call_frame_t *new_frame = NULL; + + args = ga_heal_parse_args (this, data); + if (!args) + goto out; + + if (!xdata) + xdata = dict_new (); + + ret = ga_fill_tmp_loc (loc, this, args->gfid, args->bname, + xdata, &tmp_loc); + if (ret) + goto out; + + new_frame = copy_frame (frame); + if (!new_frame) + goto out; + new_frame->local = (void *)frame; + + STACK_WIND (new_frame, ga_heal_cbk, FIRST_CHILD (this), + FIRST_CHILD(this)->fops->lookup, + &tmp_loc, xdata); + + ret = 0; +out: + if (args) + ga_heal_args_free (args); + + return ret; +} + +int32_t +ga_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata); + return 0; +} + +int32_t +ga_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) +{ + + data_t *data = NULL; + int op_errno = ENOMEM; + int ret = 0; + inode_t *unref = NULL; + + if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && + ((loc->parent && + __is_root_gfid (loc->parent->gfid)) || + __is_root_gfid (loc->pargfid))) { + op_errno = EPERM; + goto err; + } + + data = dict_get (dict, GF_FUSE_AUX_GFID_NEWFILE); + if (data) { + ret = ga_new_entry (frame, this, loc, data, xdata); + if (ret) + goto err; + return 0; + } + + data = dict_get (dict, GF_FUSE_AUX_GFID_HEAL); + if (data) { + ret = ga_heal_entry (frame, this, loc, data, xdata); + if (ret) + goto err; + return 0; + } + + //If the inode is a virtual inode change the inode otherwise perform + //the operation on same inode + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, ga_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + if (unref) + inode_unref (unref); + + return 0; +err: + STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata); + return 0; +} + + +int32_t +ga_virtual_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + int j = 0; + int i = 0; + int ret = 0; + uint64_t temp_ino = 0; + inode_t *cbk_inode = NULL; + inode_t *true_inode = NULL; + uuid_t random_gfid = {0,}; + + if (frame->local) + cbk_inode = frame->local; + else + cbk_inode = inode; + + frame->local = NULL; + if (op_ret) + goto unwind; + + if (!IA_ISDIR (buf->ia_type)) + goto unwind; + + /* need to send back a different inode for linking in itable */ + if (cbk_inode == inode) { + /* check if the inode is in the 'itable' or + if its just previously discover()'d inode */ + true_inode = inode_find (inode->table, buf->ia_gfid); + if (!true_inode) { + cbk_inode = inode_new (inode->table); + + if (!cbk_inode) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + /* the inode is not present in itable, ie, the actual + path is not yet looked up. Use the current inode + itself for now */ + inode_ref (inode); + } else { + /* 'inode_ref()' has been done in inode_find() */ + inode = true_inode; + } + + ret = inode_ctx_put (cbk_inode, this, (uint64_t)inode); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the inode ctx with" + "the actual inode"); + if (inode) + inode_unref (inode); + } + inode = NULL; + } + + if (!uuid_is_null (cbk_inode->gfid)) { + /* if the previous linked inode is used, use the + same gfid */ + uuid_copy (random_gfid, cbk_inode->gfid); + } else { + /* replace the buf->ia_gfid to a random gfid + for directory, for files, what we received is fine */ + uuid_generate (random_gfid); + } + + uuid_copy (buf->ia_gfid, random_gfid); + + for (i = 15; i > (15 - 8); i--) { + temp_ino += (uint64_t)(buf->ia_gfid[i]) << j; + j += 8; + } + buf->ia_ino = temp_ino; + +unwind: + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, cbk_inode, buf, + xdata, postparent); + + return 0; +} + +int32_t +ga_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + ga_private_t *priv = NULL; + + /* if the entry in question is not 'root', + then follow the normal path */ + if (op_ret || !__is_root_gfid(buf->ia_gfid)) + goto unwind; + + priv = this->private; + + /* do we need to copy root stbuf everytime? */ + /* mostly yes, as we want to have the 'stat' info show latest + in every _cbk() */ + + /* keep the reference for root stat buf */ + priv->root_stbuf = *buf; + priv->gfiddir_stbuf = priv->root_stbuf; + priv->gfiddir_stbuf.ia_gfid[15] = GF_AUX_GFID; + priv->gfiddir_stbuf.ia_ino = GF_AUX_GFID; + +unwind: + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, + xdata, postparent); + return 0; +} + +int32_t +ga_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + ga_private_t *priv = NULL; + int ret = -1; + uuid_t tmp_gfid = {0,}; + loc_t tmp_loc = {0,}; + uint64_t value = 0; + inode_t *inode = NULL; + inode_t *true_inode = NULL; + int32_t op_errno = ENOENT; + + /* if its discover(), no need for any action here */ + if (!loc->name) + goto wind; + + /* if its revalidate, and inode is not of type directory, + proceed with 'wind' */ + if (loc->inode && loc->inode->ia_type && + !IA_ISDIR (loc->inode->ia_type)) + goto wind; + + priv = this->private; + + /* need to check if the lookup is on virtual dir */ + if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && + ((loc->parent && __is_root_gfid (loc->parent->gfid)) || + __is_root_gfid (loc->pargfid))) { + /* this means, the query is on '/.gfid', return the fake stat, + and say success */ + + STACK_UNWIND_STRICT (lookup, frame, 0, 0, loc->inode, + &priv->gfiddir_stbuf, xdata, + &priv->root_stbuf); + return 0; + } + + /* now, check if the lookup() is on an existing entry, + but on gfid-path */ + if (!((loc->parent && __is_gfid_access_dir (loc->parent->gfid)) || + __is_gfid_access_dir (loc->pargfid))) + goto wind; + + /* make sure the 'basename' is actually a 'canonical-gfid', + otherwise, return error */ + ret = uuid_parse (loc->name, tmp_gfid); + if (ret) + goto err; + + /* if its fresh lookup, go ahead and send it down, if not, + for directory, we need indirection to actual dir inode */ + if (!(loc->inode && loc->inode->ia_type)) + goto discover; + + /* revalidate on directory */ + ret = inode_ctx_get (loc->inode, this, &value); + if (ret) + goto err; + + inode = (void *)value; + + /* valid inode, already looked up, work on that */ + if (inode->ia_type) + goto discover; + + /* check if the inode is in the 'itable' or + if its just previously discover()'d inode */ + true_inode = inode_find (loc->inode->table, tmp_gfid); + if (true_inode) { + /* time do another lookup and update the context + with proper inode */ + op_errno = ESTALE; + goto err; + } + +discover: + /* for the virtual entries, we don't need to send 'gfid-req' key, as + for these entries, we don't want to 'set' a new gfid */ + if (xdata) + dict_del (xdata, "gfid-req"); + + uuid_copy (tmp_loc.gfid, tmp_gfid); + + /* if revalidate, then we need to have the proper reference */ + if (inode) { + tmp_loc.inode = inode_ref (inode); + frame->local = loc->inode; + } else { + tmp_loc.inode = inode_ref (loc->inode); + } + + STACK_WIND (frame, ga_virtual_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata); + + inode_unref (tmp_loc.inode); + + return 0; + +wind: + /* used for all the normal lookup path */ + STACK_WIND (frame, ga_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + + return 0; + +err: + STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, loc->inode, + &priv->gfiddir_stbuf, xdata, + &priv->root_stbuf); + return 0; +} + +int +ga_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, + xdata); + + return 0; + +err: + STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno, loc->inode, + NULL, NULL, NULL, xdata); + return 0; +} + + +int +ga_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + STACK_WIND (frame, default_create_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, + loc, flags, mode, umask, fd, xdata); + return 0; +err: + STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL, + NULL, NULL, NULL, NULL, xdata); + + return 0; + +} + +int +ga_symlink (call_frame_t *frame, xlator_t *this, const char *linkname, + loc_t *loc, mode_t umask, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + STACK_WIND (frame, default_symlink_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink, + linkname, loc, umask, xdata); + return 0; +err: + STACK_UNWIND_STRICT (symlink, frame, -1, op_errno, NULL, + NULL, NULL, NULL, xdata); + + return 0; +} + +int +ga_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, + umask, xdata); + + return 0; +err: + STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL, + NULL, NULL, NULL, xdata); + + return 0; +} + +int +ga_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flag, + dict_t *xdata) +{ + int op_errno = 0; + inode_t *unref = NULL; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_rmdir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir, + loc, flag, xdata); + if (unref) + inode_unref (unref); + + return 0; +err: + STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno, NULL, + NULL, xdata); + + return 0; +} + +int +ga_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag, + dict_t *xdata) +{ + int op_errno = 0; + inode_t *unref = NULL; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_unlink_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + + if (unref) + inode_unref (unref); + + return 0; +err: + STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, + NULL, xdata); + + return 0; +} + +int +ga_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + int op_errno = 0; + inode_t *oldloc_unref = NULL; + inode_t *newloc_unref = NULL; + + GFID_ACCESS_ENTRY_OP_CHECK (oldloc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK (newloc, op_errno, err); + + GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, oldloc_unref, + handle_newloc); + +handle_newloc: + GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, newloc_unref, wind); + +wind: + STACK_WIND (frame, default_rename_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, + oldloc, newloc, xdata); + + if (oldloc_unref) + inode_unref (oldloc_unref); + + if (newloc_unref) + inode_unref (newloc_unref); + + return 0; +err: + STACK_UNWIND_STRICT (rename, frame, -1, op_errno, NULL, + NULL, NULL, NULL, NULL, xdata); + + return 0; +} + + +int +ga_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + int op_errno = 0; + inode_t *oldloc_unref = NULL; + inode_t *newloc_unref = NULL; + + GFID_ACCESS_ENTRY_OP_CHECK (oldloc, op_errno, err); + GFID_ACCESS_ENTRY_OP_CHECK (newloc, op_errno, err); + + GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, oldloc_unref, + handle_newloc); + +handle_newloc: + GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, newloc_unref, wind); + +wind: + STACK_WIND (frame, default_link_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, + oldloc, newloc, xdata); + + if (oldloc_unref) + inode_unref (oldloc_unref); + + if (newloc_unref) + inode_unref (newloc_unref); + + return 0; +err: + STACK_UNWIND_STRICT (link, frame, -1, op_errno, NULL, + NULL, NULL, NULL, xdata); + + return 0; +} + +int32_t +ga_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, dict_t *xdata) +{ + int op_errno = 0; + + GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err); + + /* also check if the loc->inode itself is virtual + inode, if yes, return with failure, mainly because we + can't handle all the readdirp and other things on it. */ + if (inode_ctx_get (loc->inode, this, NULL) == 0) { + op_errno = ENOTSUP; + goto err; + } + + STACK_WIND (frame, default_opendir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, + loc, fd, xdata); + return 0; +err: + STACK_UNWIND_STRICT (opendir, frame, -1, op_errno, NULL, xdata); + + return 0; +} + +int32_t +ga_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + inode_t *unref = NULL; + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + + if (unref) + inode_unref (unref); + + return 0; +} + +int32_t +ga_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + inode_t *unref = NULL; + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + if (unref) + inode_unref (unref); + + return 0; +} + +int32_t +ga_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + struct iatt *stbuf, int32_t valid, + dict_t *xdata) +{ + inode_t *unref = NULL; + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid, + xdata); + if (unref) + inode_unref (unref); + + return 0; +} + +int32_t +ga_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + inode_t *unref = NULL; + + GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind); + +wind: + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, + xdata); + if (unref) + inode_unref (unref); + + return 0; +} + + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_gfid_access_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, "Memory accounting" + " init failed"); + return ret; + } + + return ret; +} + +int32_t +init (xlator_t *this) +{ + ga_private_t *priv = NULL; + int ret = -1; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "not configured with exactly one child. exiting"); + goto out; + } + + /* This can be the top of graph in certain cases */ + if (!this->parents) { + gf_log (this->name, GF_LOG_DEBUG, + "dangling volume. check volfile "); + } + + /* TODO: define a mem-type structure */ + priv = GF_CALLOC (1, sizeof (*priv), gf_gfid_access_mt_priv_t); + if (!priv) + goto out; + + priv->newfile_args_pool = mem_pool_new (ga_newfile_args_t, 512); + if (!priv->newfile_args_pool) + goto out; + + priv->heal_args_pool = mem_pool_new (ga_heal_args_t, 512); + if (!priv->heal_args_pool) + goto out; + + this->private = priv; + + ret = 0; +out: + if (ret && priv) { + if (priv->newfile_args_pool) + mem_pool_destroy (priv->newfile_args_pool); + GF_FREE (priv); + } + + return ret; +} + +void +fini (xlator_t *this) +{ + ga_private_t *priv = NULL; + priv = this->private; + this->private = NULL; + + if (priv) { + if (priv->newfile_args_pool) + mem_pool_destroy (priv->newfile_args_pool); + if (priv->heal_args_pool) + mem_pool_destroy (priv->heal_args_pool); + GF_FREE (priv); + } + + return; +} + + +struct xlator_fops fops = { + .lookup = ga_lookup, + + /* entry fops */ + .mkdir = ga_mkdir, + .mknod = ga_mknod, + .create = ga_create, + .symlink = ga_symlink, + .link = ga_link, + .unlink = ga_unlink, + .rmdir = ga_rmdir, + .rename = ga_rename, + + /* handle any other directory operations here */ + .opendir = ga_opendir, + .stat = ga_stat, + .setattr = ga_setattr, + .getxattr = ga_getxattr, + .removexattr = ga_removexattr, + + /* special fop to handle more entry creations */ + .setxattr = ga_setxattr, +}; + +struct xlator_cbks cbks = { + .forget = ga_forget, +}; + +struct volume_options options[] = { + /* This translator doesn't take any options, or provide any options */ + { .key = {NULL} }, +}; diff --git a/xlators/features/gfid-access/src/gfid-access.h b/xlators/features/gfid-access/src/gfid-access.h new file mode 100644 index 000000000..e13c9b724 --- /dev/null +++ b/xlators/features/gfid-access/src/gfid-access.h @@ -0,0 +1,128 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __GFID_ACCESS_H__ +#define __GFID_ACCESS_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" +#include "gfid-access-mem-types.h" + +#define UUID_CANONICAL_FORM_LEN 36 + +#define GF_FUSE_AUX_GFID_NEWFILE "glusterfs.gfid.newfile" +#define GF_FUSE_AUX_GFID_HEAL "glusterfs.gfid.heal" + +#define GF_GFID_KEY "GLUSTERFS_GFID" +#define GF_GFID_DIR ".gfid" +#define GF_AUX_GFID 0xd + +#define GFID_ACCESS_GET_VALID_DIR_INODE(x,l,unref,lbl) do { \ + int ret = 0; \ + uint64_t value = 0; \ + inode_t *tmp_inode = NULL; \ + \ + /* if its an entry operation, on the virtual */ \ + /* directory inode as parent, we need to handle */ \ + /* it properly */ \ + if (l->parent) { \ + ret = inode_ctx_get (l->parent, x, &value); \ + if (ret) \ + goto lbl; \ + tmp_inode = (inode_t *)value; \ + unref = inode_ref (tmp_inode); \ + l->parent = tmp_inode; \ + /* if parent is virtual, no need to handle */ \ + /* loc->inode */ \ + break; \ + } \ + \ + /* if its an inode operation, on the virtual */ \ + /* directory inode itself, we need to handle */ \ + /* it properly */ \ + if (l->inode) { \ + ret = inode_ctx_get (l->inode, x, &value); \ + if (ret) \ + goto lbl; \ + tmp_inode = (inode_t *)value; \ + unref = inode_ref (tmp_inode); \ + l->inode = tmp_inode; \ + } \ + \ + } while (0) + +#define GFID_ACCESS_ENTRY_OP_CHECK(loc,err,lbl) do { \ + /* need to check if the lookup is on virtual dir */ \ + if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && \ + ((loc->parent && \ + __is_root_gfid (loc->parent->gfid)) || \ + __is_root_gfid (loc->pargfid))) { \ + err = EEXIST; \ + goto lbl; \ + } \ + \ + /* now, check if the lookup() is on an existing */ \ + /* entry, but on gfid-path */ \ + if ((loc->parent && \ + __is_gfid_access_dir (loc->parent->gfid)) || \ + __is_gfid_access_dir (loc->pargfid)) { \ + err = EPERM; \ + goto lbl; \ + } \ + } while (0) + + +typedef struct { + unsigned int uid; + unsigned int gid; + char gfid[UUID_CANONICAL_FORM_LEN + 1]; + unsigned int st_mode; + char *bname; + + union { + struct _symlink_in { + char *linkpath; + } __attribute__ ((__packed__)) symlink; + + struct _mknod_in { + unsigned int mode; + unsigned int rdev; + unsigned int umask; + } __attribute__ ((__packed__)) mknod; + + struct _mkdir_in { + unsigned int mode; + unsigned int umask; + } __attribute__ ((__packed__)) mkdir; + } __attribute__ ((__packed__)) args; +} __attribute__((__packed__)) ga_newfile_args_t; + +typedef struct { + char gfid[UUID_CANONICAL_FORM_LEN + 1]; + char *bname; /* a null terminated basename */ +} __attribute__((__packed__)) ga_heal_args_t; + +struct ga_private { + /* root inode's stbuf */ + struct iatt root_stbuf; + struct iatt gfiddir_stbuf; + struct mem_pool *newfile_args_pool; + struct mem_pool *heal_args_pool; +}; +typedef struct ga_private ga_private_t; + +#endif /* __GFID_ACCESS_H__ */ diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c index 3245076ec..9253120f3 100644 --- a/xlators/features/index/src/index.c +++ b/xlators/features/index/src/index.c @@ -15,8 +15,10 @@ #include "index.h" #include "options.h" #include "glusterfs3-xdr.h" +#include "syncop.h" #define XATTROP_SUBDIR "xattrop" +#define BASE_INDICES_HOLDER_SUBDIR "base_indices_holder" call_stub_t * __index_dequeue (struct list_head *callstubs) @@ -242,20 +244,40 @@ check_delete_stale_index_file (xlator_t *this, char *filename) { int ret = 0; struct stat st = {0}; + struct stat base_index_st = {0}; char filepath[PATH_MAX] = {0}; + char filepath_under_base_indices_holder[PATH_MAX] = {0}; index_priv_t *priv = NULL; priv = this->private; + if (priv->to_be_healed_states != synced_state) + return; + make_file_path (priv->index_basepath, XATTROP_SUBDIR, filename, filepath, sizeof (filepath)); + + make_file_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, + filename, filepath_under_base_indices_holder, + sizeof (filepath_under_base_indices_holder)); + + + ret = stat (filepath_under_base_indices_holder, &base_index_st); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Base index is not created" + "under index/base_indices_holder"); + return; + } + ret = stat (filepath, &st); - if (!ret && st.st_nlink == 1) + if (!ret && st.st_nlink == 2) { unlink (filepath); + unlink (filepath_under_base_indices_holder); + } } static int index_fill_readdir (fd_t *fd, DIR *dir, off_t off, - size_t size, gf_dirent_t *entries) + size_t size, gf_dirent_t *entries, readdir_directory type) { off_t in_case = -1; size_t filled = 0; @@ -298,7 +320,8 @@ index_fill_readdir (fd_t *fd, DIR *dir, off_t off, } if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-"))) { + strlen (XATTROP_SUBDIR"-")) && + (type == INDEX_XATTROP)) { check_delete_stale_index_file (this, entry->d_name); continue; } @@ -337,16 +360,175 @@ out: } int +sync_base_indices (void *index_priv) +{ + index_priv_t *priv = NULL; + DIR *dir_base_holder = NULL; + DIR *xattrop_dir = NULL; + struct dirent *entry = NULL; + char base_indices_holder[PATH_MAX] = {0}; + char xattrop_directory[PATH_MAX] = {0}; + char base_index_path[PATH_MAX] = {0}; + char xattrop_index_path[PATH_MAX] = {0}; + int ret = 0; + + priv = index_priv; + + snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, + BASE_INDICES_HOLDER_SUBDIR); + snprintf (xattrop_directory, PATH_MAX, "%s/%s", priv->index_basepath, + XATTROP_SUBDIR); + + if ((dir_base_holder = opendir(base_indices_holder)) == NULL) { + ret = -1; + goto out; + } + if ((xattrop_dir = opendir (xattrop_directory)) == NULL) { + ret = -1; + goto out; + } + + priv->to_be_healed_states = sync_started; + while ((entry = readdir(xattrop_dir)) != NULL) { + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) { + continue; + } + if (strncmp (entry->d_name, XATTROP_SUBDIR"-", + strlen (XATTROP_SUBDIR"-"))) { + continue; + } + if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", + strlen (XATTROP_SUBDIR"-"))) { + + snprintf (xattrop_index_path, PATH_MAX, "%s/%s", + xattrop_directory, entry->d_name); + + snprintf (base_index_path, PATH_MAX, "%s/%s", + base_indices_holder, entry->d_name); + + ret = link (xattrop_index_path, base_index_path); + if (ret && errno != EEXIST) + goto out; + + } + } + ret = closedir (xattrop_dir); + if (ret) + goto out; + ret = closedir (dir_base_holder); + if (ret) + goto out; + + ret = 0; +out: + return ret; + +} + +int +base_indices_syncing_done (int ret, call_frame_t *frame, void *data) +{ + index_priv_t *priv = NULL; + priv = data; + + if (!priv) + goto out; + + if (ret) { + priv->to_be_healed_states = sync_not_started; + } else { + priv->to_be_healed_states = synced_state; + } + + STACK_DESTROY (frame->root); + +out: + return 0; +} + +int +sync_base_indices_from_xattrop (xlator_t *this) +{ + + index_priv_t *priv = NULL; + char base_indices_holder[PATH_MAX] = {0}; + int ret = 0; + struct stat st = {0}; + DIR *dir = NULL; + struct dirent *entry = NULL; + call_frame_t *frame = NULL; + + priv = this->private; + + if (priv->to_be_healed_states != sync_not_started) { + ret = -1; + goto out; + } + + snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, + BASE_INDICES_HOLDER_SUBDIR); + + ret = stat (base_indices_holder, &st); + + if (ret && (errno != ENOENT)) { + goto out; + } else if (errno == ENOENT) { + ret = index_dir_create (this, BASE_INDICES_HOLDER_SUBDIR); + if (ret) + goto out; + } else { + if ((dir = opendir (base_indices_holder)) == NULL) { + ret = -1; + goto out; + } + while ((entry = readdir (dir)) != NULL) { + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name,"..")) { + continue; + } + ret = unlink (entry->d_name); + if (ret) + goto out; + } + closedir (dir); + } + + /*At this point of time we have index/base_indicies_holder directory + *is with no entries*/ + + frame = create_frame (this, this->ctx->pool); + if (!frame) { + ret = -1; + goto out; + } + set_lk_owner_from_ptr (&frame->root->lk_owner, frame->root); + + frame->root->pid = LOW_PRIO_PROC_PID; + + ret = synctask_new (this->ctx->env, sync_base_indices, + base_indices_syncing_done,frame, priv); + + + +out: + return ret; + +} + +int index_add (xlator_t *this, uuid_t gfid, const char *subdir) { int32_t op_errno = 0; char gfid_path[PATH_MAX] = {0}; char index_path[PATH_MAX] = {0}; + char base_path[PATH_MAX] = {0}; int ret = 0; uuid_t index = {0}; index_priv_t *priv = NULL; struct stat st = {0}; int fd = 0; + int index_created = 0; priv = this->private; GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, !uuid_is_null (gfid), @@ -364,9 +546,11 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir) ret = link (index_path, gfid_path); if (!ret || (errno == EEXIST)) { ret = 0; + index_created = 1; goto out; } + op_errno = errno; if (op_errno == ENOENT) { ret = index_dir_create (this, subdir); @@ -398,10 +582,36 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir) "add to index (%s)", uuid_utoa (gfid), strerror (errno)); goto out; + } else { + index_created = 1; + } + + if (priv->to_be_healed_states != sync_not_started) { + make_index_path (priv->index_basepath, + GF_BASE_INDICES_HOLDER_GFID, + index, base_path, sizeof (base_path)); + ret = link (index_path, base_path); + if (ret) + goto out; } ret = 0; out: + /*If base_indices_holder is not created: create and sync + *If directory is present: delete contents and start syncing + *If syncing is in progress :No need to do any thing + *If syncing is done: No need to do anything*/ + if (!ret) { + switch (priv->to_be_healed_states) { + case sync_not_started: + ret = sync_base_indices_from_xattrop (this); + break; + case sync_started: + case synced_state: + /*No need to do anything*/ + break; + } + } return ret; } @@ -737,8 +947,18 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, goto done; } - ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid, - sizeof (priv->xattrop_vgfid)); + if (!strcmp (name, GF_XATTROP_INDEX_GFID)) { + + ret = dict_set_static_bin (xattr, (char*)name, + priv->xattrop_vgfid, + sizeof (priv->xattrop_vgfid)); + + } else if (!strcmp (name, GF_BASE_INDICES_HOLDER_GFID)) { + + ret = dict_set_static_bin (xattr, (char*)name, + priv->base_indices_holder_vgfid, + sizeof (priv->base_indices_holder_vgfid)); + } if (ret) { ret = -ENOMEM; gf_log (THIS->name, GF_LOG_ERROR, "xattrop index " @@ -782,6 +1002,15 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this, } else if (!uuid_compare (loc->pargfid, priv->xattrop_vgfid)) { make_file_path (priv->index_basepath, XATTROP_SUBDIR, loc->name, path, sizeof (path)); + } else if (!uuid_compare (loc->gfid,priv->base_indices_holder_vgfid)){ + make_index_dir_path (priv->index_basepath, + BASE_INDICES_HOLDER_SUBDIR, path, + sizeof (path)); + is_dir = _gf_true; + } else if (!uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) { + make_file_path (priv->index_basepath, + BASE_INDICES_HOLDER_SUBDIR,loc->name, path, + sizeof (path)); } ret = lstat (path, &lstatbuf); @@ -803,10 +1032,14 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this, } iatt_from_stat (&stbuf, &lstatbuf); - if (is_dir) + if (is_dir && !uuid_compare (loc->gfid, priv->xattrop_vgfid)) { uuid_copy (stbuf.ia_gfid, priv->xattrop_vgfid); - else + } else if (is_dir && + !uuid_compare (loc->gfid, priv->base_indices_holder_vgfid)) { + uuid_copy (stbuf.ia_gfid, priv->base_indices_holder_vgfid); + } else { uuid_generate (stbuf.ia_gfid); + } stbuf.ia_ino = -1; op_ret = 0; done: @@ -818,6 +1051,44 @@ done: } int32_t +base_indices_readdir_wrapper (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, dict_t *xdata) +{ + index_priv_t *priv = NULL; + char base_indices_holder[PATH_MAX] = {0}; + DIR *dir = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + int count = 0; + gf_dirent_t entries; + + priv = this->private; + + make_index_dir_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, + base_indices_holder, sizeof (base_indices_holder)); + + dir = opendir (base_indices_holder); + if (!dir) { + op_errno = EINVAL; + goto done; + } + + + INIT_LIST_HEAD (&entries.list); + + count = index_fill_readdir (fd, dir, off, size, &entries, + BASE_INDICES_HOLDER); + /* pick ENOENT to indicate EOF */ + op_errno = errno; + op_ret = count; + closedir (dir); +done: + STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, xdata); + gf_dirent_free (&entries); + return 0; +} + +int32_t index_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, dict_t *xdata) { @@ -848,7 +1119,8 @@ index_readdir_wrapper (call_frame_t *frame, xlator_t *this, goto done; } - count = index_fill_readdir (fd, dir, off, size, &entries); + count = index_fill_readdir (fd, dir, off, size, &entries, + INDEX_XATTROP); /* pick ENOENT to indicate EOF */ op_errno = errno; @@ -915,7 +1187,10 @@ index_getxattr (call_frame_t *frame, xlator_t *this, { call_stub_t *stub = NULL; - if (!name || strcmp (GF_XATTROP_INDEX_GFID, name)) + if (!name) + goto out; + if (strcmp (GF_XATTROP_INDEX_GFID, name) && + strcmp (GF_BASE_INDICES_HOLDER_GFID, name)) goto out; stub = fop_getxattr_stub (frame, index_getxattr_wrapper, loc, name, @@ -942,7 +1217,9 @@ index_lookup (call_frame_t *frame, xlator_t *this, priv = this->private; if (uuid_compare (loc->gfid, priv->xattrop_vgfid) && - uuid_compare (loc->pargfid, priv->xattrop_vgfid)) + uuid_compare (loc->pargfid, priv->xattrop_vgfid) && + uuid_compare (loc->gfid, priv->base_indices_holder_vgfid) && + uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) goto normal; stub = fop_lookup_stub (frame, index_lookup_wrapper, loc, xattr_req); @@ -968,10 +1245,19 @@ index_readdir (call_frame_t *frame, xlator_t *this, index_priv_t *priv = NULL; priv = this->private; - if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) + if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid) && + uuid_compare (fd->inode->gfid, priv->base_indices_holder_vgfid)) goto out; - stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, off, - xdata); + + if (!uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) { + stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, + off, xdata); + } else if (!uuid_compare (fd->inode->gfid, + priv->base_indices_holder_vgfid)) { + stub = fop_readdir_stub (frame, base_indices_readdir_wrapper, + fd, size, off, xdata); + } + if (!stub) { STACK_UNWIND_STRICT (readdir, frame, -1, ENOMEM, NULL, NULL); return 0; @@ -1075,10 +1361,13 @@ init (xlator_t *this) GF_OPTION_INIT ("index-base", priv->index_basepath, path, out); uuid_generate (priv->index); uuid_generate (priv->xattrop_vgfid); + /*base_indices_holder is a directory which contains hard links to + * all base indices inside indices/xattrop directory*/ + uuid_generate (priv->base_indices_holder_vgfid); INIT_LIST_HEAD (&priv->callstubs); this->private = priv; - ret = pthread_create (&thread, &w_attr, index_worker, this); + ret = gf_thread_create (&thread, &w_attr, index_worker, this); if (ret) { gf_log (this->name, GF_LOG_WARNING, "Failed to create " "worker thread, aborting"); diff --git a/xlators/features/index/src/index.h b/xlators/features/index/src/index.h index 661dcdbc4..d6dcb1c23 100644 --- a/xlators/features/index/src/index.h +++ b/xlators/features/index/src/index.h @@ -36,14 +36,28 @@ typedef struct index_fd_ctx { DIR *dir; } index_fd_ctx_t; +typedef enum { + sync_not_started, + sync_started, + synced_state, +} to_be_healed_states_t; + +typedef enum { + INDEX_XATTROP, + BASE_INDICES_HOLDER, +} readdir_directory; + typedef struct index_priv { char *index_basepath; uuid_t index; gf_lock_t lock; uuid_t xattrop_vgfid;//virtual gfid of the xattrop index dir + uuid_t base_indices_holder_vgfid; //virtual gfid of the + //to_be_healed_xattrop directory struct list_head callstubs; pthread_mutex_t mutex; pthread_cond_t cond; + to_be_healed_states_t to_be_healed_states; } index_priv_t; #define INDEX_STACK_UNWIND(fop, frame, params ...) \ diff --git a/xlators/features/locks/src/Makefile.am b/xlators/features/locks/src/Makefile.am index 8908c1f52..0f79731b4 100644 --- a/xlators/features/locks/src/Makefile.am +++ b/xlators/features/locks/src/Makefile.am @@ -11,6 +11,7 @@ noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS) CLEANFILES = diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c index 5790a99ce..124b9ad0f 100644 --- a/xlators/features/locks/src/clear.c +++ b/xlators/features/locks/src/clear.c @@ -339,6 +339,7 @@ blkd: -1, EAGAIN); STACK_UNWIND_STRICT (entrylk, elock->frame, -1, EAGAIN, NULL); GF_FREE ((char *) elock->basename); + GF_FREE (elock->connection_id); GF_FREE (elock); } @@ -379,8 +380,8 @@ out: int clrlk_clear_lks_in_all_domains (xlator_t *this, pl_inode_t *pl_inode, - clrlk_args *args, int *blkd, int *granted, - int *op_errno) + clrlk_args *args, int *blkd, int *granted, + int *op_errno) { pl_dom_list_t *dom = NULL; int ret = -1; diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c index 9c21bddb9..b3309580d 100644 --- a/xlators/features/locks/src/common.c +++ b/xlators/features/locks/src/common.c @@ -35,6 +35,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock); static int pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *old_lock); + static pl_dom_list_t * __allocate_domain (const char *volume) { @@ -75,8 +76,8 @@ get_domain (pl_inode_t *pl_inode, const char *volume) { pl_dom_list_t *dom = NULL; - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, pl_inode, out); - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, volume, out); + GF_VALIDATE_OR_GOTO ("posix-locks", pl_inode, out); + GF_VALIDATE_OR_GOTO ("posix-locks", volume, out); pthread_mutex_lock (&pl_inode->mutex); { @@ -92,9 +93,9 @@ get_domain (pl_inode_t *pl_inode, const char *volume) unlock: pthread_mutex_unlock (&pl_inode->mutex); if (dom) { - gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Domain %s found", volume); + gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s found", volume); } else { - gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Domain %s not found", volume); + gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume); } out: return dom; @@ -135,10 +136,10 @@ __pl_inode_is_empty (pl_inode_t *pl_inode) void pl_print_locker (char *str, int size, xlator_t *this, call_frame_t *frame) { - snprintf (str, size, "Pid=%llu, lk-owner=%s, Transport=%p, Frame=%llu", + snprintf (str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu", (unsigned long long) frame->root->pid, lkowner_utoa (&frame->root->lk_owner), - (void *)frame->root->trans, + frame->root->client, (unsigned long long) frame->root->unique); } @@ -462,14 +463,14 @@ unlock: /* Create a new posix_lock_t */ posix_lock_t * -new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid, +new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, gf_lkowner_t *owner, fd_t *fd) { posix_lock_t *lock = NULL; - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, flock, out); - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, transport, out); - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, fd, out); + GF_VALIDATE_OR_GOTO ("posix-locks", flock, out); + GF_VALIDATE_OR_GOTO ("posix-locks", client, out); + GF_VALIDATE_OR_GOTO ("posix-locks", fd, out); lock = GF_CALLOC (1, sizeof (posix_lock_t), gf_locks_mt_posix_lock_t); @@ -485,7 +486,7 @@ new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid, else lock->fl_end = flock->l_start + flock->l_len - 1; - lock->transport = transport; + lock->client = client; lock->fd_num = fd_to_fdnum (fd); lock->fd = fd; lock->client_pid = client_pid; @@ -565,7 +566,7 @@ same_owner (posix_lock_t *l1, posix_lock_t *l2) { return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->transport == l2->transport)); + (l1->client == l2->client)); } @@ -694,7 +695,7 @@ subtract_locks (posix_lock_t *big, posix_lock_t *small) } GF_ASSERT (0); - gf_log (POSIX_LOCKS, GF_LOG_ERROR, "Unexpected case in subtract_locks"); + gf_log ("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks"); out: if (v.locks[0]) { @@ -812,7 +813,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock) sum = add_locks (lock, conf); sum->fl_type = lock->fl_type; - sum->transport = lock->transport; + sum->client = lock->client; sum->fd_num = lock->fd_num; sum->client_pid = lock->client_pid; sum->owner = lock->owner; @@ -830,7 +831,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock) sum = add_locks (lock, conf); sum->fl_type = conf->fl_type; - sum->transport = conf->transport; + sum->client = conf->client; sum->fd_num = conf->fd_num; sum->client_pid = conf->client_pid; sum->owner = conf->owner; @@ -988,7 +989,7 @@ pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode, flock.l_len = old_lock->user_flock.l_len; - unlock_lock = new_posix_lock (&flock, old_lock->transport, + unlock_lock = new_posix_lock (&flock, old_lock->client, old_lock->client_pid, &old_lock->owner, old_lock->fd); GF_VALIDATE_OR_GOTO (this->name, unlock_lock, out); @@ -1097,3 +1098,124 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock) return conf; } + + +struct _lock_table * +pl_lock_table_new (void) +{ + struct _lock_table *new = NULL; + + new = GF_CALLOC (1, sizeof (struct _lock_table), gf_common_mt_lock_table); + if (new == NULL) { + goto out; + } + INIT_LIST_HEAD (&new->entrylk_lockers); + INIT_LIST_HEAD (&new->inodelk_lockers); + LOCK_INIT (&new->lock); +out: + return new; +} + + +int +pl_add_locker (struct _lock_table *table, const char *volume, + loc_t *loc, fd_t *fd, pid_t pid, gf_lkowner_t *owner, + glusterfs_fop_t type) +{ + int32_t ret = -1; + struct _locker *new = NULL; + + GF_VALIDATE_OR_GOTO ("lock-table", table, out); + GF_VALIDATE_OR_GOTO ("lock-table", volume, out); + + new = GF_CALLOC (1, sizeof (struct _locker), gf_common_mt_locker); + if (new == NULL) { + goto out; + } + INIT_LIST_HEAD (&new->lockers); + + new->volume = gf_strdup (volume); + + if (fd == NULL) { + loc_copy (&new->loc, loc); + } else { + new->fd = fd_ref (fd); + } + + new->pid = pid; + new->owner = *owner; + + LOCK (&table->lock); + { + if (type == GF_FOP_ENTRYLK) + list_add_tail (&new->lockers, &table->entrylk_lockers); + else + list_add_tail (&new->lockers, &table->inodelk_lockers); + } + UNLOCK (&table->lock); +out: + return ret; +} + +int +pl_del_locker (struct _lock_table *table, const char *volume, + loc_t *loc, fd_t *fd, gf_lkowner_t *owner, glusterfs_fop_t type) +{ + struct _locker *locker = NULL; + struct _locker *tmp = NULL; + int32_t ret = -1; + struct list_head *head = NULL; + struct list_head del; + + GF_VALIDATE_OR_GOTO ("lock-table", table, out); + GF_VALIDATE_OR_GOTO ("lock-table", volume, out); + + INIT_LIST_HEAD (&del); + + LOCK (&table->lock); + { + if (type == GF_FOP_ENTRYLK) { + head = &table->entrylk_lockers; + } else { + head = &table->inodelk_lockers; + } + + list_for_each_entry_safe (locker, tmp, head, lockers) { + if (!is_same_lkowner (&locker->owner, owner) || + strcmp (locker->volume, volume)) + continue; + + /* + * It is possible for inodelk lock to come on anon-fd + * and inodelk unlock to come on normal fd in case of + * client re-opens. So don't check for fds to be equal. + */ + if (locker->fd && fd) + list_move_tail (&locker->lockers, &del); + else if (locker->loc.inode && loc && + (locker->loc.inode == loc->inode)) + list_move_tail (&locker->lockers, &del); + } + } + UNLOCK (&table->lock); + + tmp = NULL; + locker = NULL; + + list_for_each_entry_safe (locker, tmp, &del, lockers) { + list_del_init (&locker->lockers); + if (locker->fd) + fd_unref (locker->fd); + else + loc_wipe (&locker->loc); + + GF_FREE (locker->volume); + GF_FREE (locker); + } + + ret = 0; +out: + return ret; + +} + diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h index 94ef4b2b4..db19ec978 100644 --- a/xlators/features/locks/src/common.h +++ b/xlators/features/locks/src/common.h @@ -14,12 +14,14 @@ /*dump locks format strings */ #define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu" #define ENTRY_FMT "type=%s on basename=%s" -#define DUMP_GEN_FMT "pid = %llu, owner=%s, transport=%p, " +#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p" #define GRNTD_AT "granted at %s" #define BLKD_AT "blocked at %s" -#define DUMP_BLKD_FMT DUMP_GEN_FMT", "BLKD_AT -#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "GRNTD_AT -#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "BLKD_AT", "GRNTD_AT +#define CONN_ID "connection-id=%s" +#define DUMP_BLKD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT +#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "GRNTD_AT +#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT", "GRNTD_AT + #define ENTRY_BLKD_FMT ENTRY_FMT", "DUMP_BLKD_FMT #define ENTRY_GRNTD_FMT ENTRY_FMT", "DUMP_GRNTD_FMT #define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT", "DUMP_BLKD_GRNTD_FMT @@ -29,8 +31,24 @@ #define RANGE_BLKD_GRNTD_FMT RANGE_FMT", "DUMP_BLKD_GRNTD_FMT #define SET_FLOCK_PID(flock, lock) ((flock)->l_pid = lock->client_pid) + +struct _locker { + struct list_head lockers; + char *volume; + loc_t loc; + fd_t *fd; + gf_lkowner_t owner; + pid_t pid; +}; + +struct _lock_table { + struct list_head inodelk_lockers; + struct list_head entrylk_lockers; + gf_lock_t lock; +}; + posix_lock_t * -new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid, +new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, gf_lkowner_t *owner, fd_t *fd); pl_inode_t * @@ -63,7 +81,8 @@ pl_dom_list_t * get_domain (pl_inode_t *pl_inode, const char *volume); void -grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom); +grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom); void __delete_inode_lock (pl_inode_lock_t *lock); @@ -78,9 +97,9 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, void pl_update_refkeeper (xlator_t *this, inode_t *inode); int32_t -__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode); +__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname); int32_t -get_inodelk_count (xlator_t *this, inode_t *inode); +get_inodelk_count (xlator_t *this, inode_t *inode, char *domname); int32_t __get_entrylk_count (xlator_t *this, pl_inode_t *pl_inode); @@ -143,6 +162,26 @@ pl_verify_reservelk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, int can_block); int pl_reserve_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock); + uint32_t check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename); + +int32_t +pl_add_locker (struct _lock_table *table, const char *volume, + loc_t *loc, + fd_t *fd, + pid_t pid, + gf_lkowner_t *owner, + glusterfs_fop_t type); + +int32_t +pl_del_locker (struct _lock_table *table, const char *volume, + loc_t *loc, + fd_t *fd, + gf_lkowner_t *owner, + glusterfs_fop_t type); + +struct _lock_table * +pl_lock_table_new (void); + #endif /* __COMMON_H__ */ diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index d934a8b94..0785dc547 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -25,7 +25,7 @@ static pl_entry_lock_t * new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, - void *trans, pid_t client_pid, gf_lkowner_t *owner, + client_t *client, pid_t client_pid, gf_lkowner_t *owner, const char *volume) { @@ -39,7 +39,7 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, newlock->basename = basename ? gf_strdup (basename) : NULL; newlock->type = type; - newlock->trans = trans; + newlock->trans = client; newlock->volume = volume; newlock->client_pid = client_pid; newlock->owner = *owner; @@ -305,18 +305,15 @@ __find_most_matching_lock (pl_dom_list_t *dom, const char *basename) int __lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type, - call_frame_t *frame, pl_dom_list_t *dom, xlator_t *this, int nonblock) + call_frame_t *frame, pl_dom_list_t *dom, xlator_t *this, + int nonblock, char *conn_id) { pl_entry_lock_t *lock = NULL; pl_entry_lock_t *conf = NULL; - void *trans = NULL; - pid_t client_pid = 0; int ret = -EINVAL; - trans = frame->root->trans; - client_pid = frame->root->pid; - - lock = new_entrylk_lock (pinode, basename, type, trans, client_pid, + lock = new_entrylk_lock (pinode, basename, type, + frame->root->client, frame->root->pid, &frame->root->lk_owner, dom->domain); if (!lock) { ret = -ENOMEM; @@ -325,12 +322,17 @@ __lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type, lock->frame = frame; lock->this = this; - lock->trans = trans; + lock->trans = frame->root->client; + + if (conn_id) { + lock->connection_id = gf_strdup (conn_id); + } conf = __lock_grantable (dom, basename, type); if (conf) { ret = -EAGAIN; if (nonblock){ + GF_FREE (lock->connection_id); GF_FREE ((char *)lock->basename); GF_FREE (lock); goto out; @@ -350,6 +352,7 @@ __lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type, if ( __blocked_lock_conflict (dom, basename, type) && !(__owner_has_lock (dom, lock))) { ret = -EAGAIN; if (nonblock) { + GF_FREE (lock->connection_id); GF_FREE ((char *) lock->basename); GF_FREE (lock); goto out; @@ -480,13 +483,15 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, pl_inode, bl->basename); bl_ret = __lock_name (pl_inode, bl->basename, bl->type, - bl->frame, dom, bl->this, 0); + bl->frame, dom, bl->this, 0, + bl->connection_id); if (bl_ret == 0) { list_add (&bl->blocked_locks, granted); } else { gf_log (this->name, GF_LOG_DEBUG, "should never happen"); + GF_FREE (bl->connection_id); GF_FREE ((char *)bl->basename); GF_FREE (bl); } @@ -507,7 +512,8 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, pthread_mutex_lock (&pl_inode->mutex); { - __grant_blocked_entry_locks (this, pl_inode, dom, &granted_list); + __grant_blocked_entry_locks (this, pl_inode, dom, + &granted_list); } pthread_mutex_unlock (&pl_inode->mutex); @@ -520,24 +526,26 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL); - GF_FREE ((char *)lock->basename); - GF_FREE (lock); + GF_FREE (lock->connection_id); + GF_FREE ((char *)lock->basename); + GF_FREE (lock); } GF_FREE ((char *)unlocked->basename); + GF_FREE (unlocked->connection_id); GF_FREE (unlocked); return; } /** - * release_entry_locks_for_transport: release all entry locks from this - * transport for this loc_t + * release_entry_locks_for_client: release all entry locks from this + * client for this loc_t */ static int -release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, - pl_dom_list_t *dom, void *trans) +release_entry_locks_for_client (xlator_t *this, pl_inode_t *pinode, + pl_dom_list_t *dom, client_t *client) { pl_entry_lock_t *lock = NULL; pl_entry_lock_t *tmp = NULL; @@ -551,14 +559,14 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, { list_for_each_entry_safe (lock, tmp, &dom->blocked_entrylks, blocked_locks) { - if (lock->trans != trans) + if (lock->trans != client) continue; list_del_init (&lock->blocked_locks); gf_log (this->name, GF_LOG_TRACE, "releasing lock on held by " - "{transport=%p}",trans); + "{client=%p}", client); list_add (&lock->blocked_locks, &released); @@ -566,16 +574,17 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, list_for_each_entry_safe (lock, tmp, &dom->entrylk_list, domain_list) { - if (lock->trans != trans) + if (lock->trans != client) continue; list_del_init (&lock->domain_list); gf_log (this->name, GF_LOG_TRACE, "releasing lock on held by " - "{transport=%p}",trans); + "{client=%p}", client); GF_FREE ((char *)lock->basename); + GF_FREE (lock->connection_id); GF_FREE (lock); } @@ -591,6 +600,7 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, STACK_UNWIND_STRICT (entrylk, lock->frame, -1, EAGAIN, NULL); GF_FREE ((char *)lock->basename); + GF_FREE (lock->connection_id); GF_FREE (lock); } @@ -601,6 +611,7 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL); GF_FREE ((char *)lock->basename); + GF_FREE (lock->connection_id); GF_FREE (lock); } @@ -611,19 +622,23 @@ release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode, int pl_common_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, inode_t *inode, const char *basename, - entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; + entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd, + dict_t *xdata) - void * transport = NULL; - - pl_inode_t * pinode = NULL; +{ + int32_t op_ret = -1; + int32_t op_errno = 0; int ret = -1; - pl_entry_lock_t *unlocked = NULL; char unwind = 1; + GF_UNUSED int dict_ret = -1; + pl_inode_t *pinode = NULL; + pl_entry_lock_t *unlocked = NULL; + pl_dom_list_t *dom = NULL; + char *conn_id = NULL; + pl_ctx_t *ctx = NULL; - pl_dom_list_t *dom = NULL; + if (xdata) + dict_ret = dict_get_str (xdata, "connection-id", &conn_id); pinode = pl_inode_get (this, inode); if (!pinode) { @@ -639,18 +654,17 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, entrylk_trace_in (this, frame, volume, fd, loc, basename, cmd, type); - transport = frame->root->trans; - if (frame->root->lk_owner.len == 0) { /* this is a special case that means release - all locks from this transport + all locks from this client */ gf_log (this->name, GF_LOG_TRACE, - "Releasing locks for transport %p", transport); + "Releasing locks for client %p", frame->root->client); - release_entry_locks_for_transport (this, pinode, dom, transport); + release_entry_locks_for_client (this, pinode, dom, + frame->root->client); op_ret = 0; goto out; @@ -661,7 +675,7 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, pthread_mutex_lock (&pinode->mutex); { ret = __lock_name (pinode, basename, type, - frame, dom, this, 0); + frame, dom, this, 0, conn_id); } pthread_mutex_unlock (&pinode->mutex); @@ -686,7 +700,7 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, pthread_mutex_lock (&pinode->mutex); { ret = __lock_name (pinode, basename, type, - frame, dom, this, 1); + frame, dom, this, 1, conn_id); } pthread_mutex_unlock (&pinode->mutex); @@ -723,6 +737,24 @@ out: entrylk_trace_out (this, frame, volume, fd, loc, basename, cmd, type, op_ret, op_errno); + ctx = pl_ctx_get (frame->root->client, this); + + if (ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto unwind; + } + + if (cmd == ENTRYLK_UNLOCK) + pl_del_locker (ctx->ltable, volume, loc, fd, + &frame->root->lk_owner, + GF_FOP_ENTRYLK); + else + pl_add_locker (ctx->ltable, volume, loc, fd, + frame->root->pid, + &frame->root->lk_owner, + GF_FOP_ENTRYLK); + +unwind: STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, NULL); } else { entrylk_trace_block (this, frame, volume, fd, loc, basename, @@ -742,10 +774,10 @@ out: int pl_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - - pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd, type, loc, NULL); + pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd, + type, loc, NULL, xdata); return 0; } @@ -760,10 +792,10 @@ pl_entrylk (call_frame_t *frame, xlator_t *this, int pl_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type) + entrylk_cmd cmd, entrylk_type type, dict_t *xdata) { - - pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd, type, NULL, fd); + pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd, + type, NULL, fd, xdata); return 0; } diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c index 42350e59a..508523e11 100644 --- a/xlators/features/locks/src/inodelk.c +++ b/xlators/features/locks/src/inodelk.c @@ -39,8 +39,10 @@ inline void __pl_inodelk_unref (pl_inode_lock_t *lock) { lock->ref--; - if (!lock->ref) + if (!lock->ref) { + GF_FREE (lock->connection_id); GF_FREE (lock); + } } /* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't conflict */ @@ -122,7 +124,7 @@ static inline int same_inodelk_owner (pl_inode_lock_t *l1, pl_inode_lock_t *l2) { return (is_same_lkowner (&l1->owner, &l2->owner) && - (l1->transport == l2->transport)); + (l1->client == l2->client)); } /* Returns true if the 2 inodelks conflict with each other */ @@ -292,7 +294,7 @@ __inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) " Matching lock not found for unlock %llu-%llu, by %s " "on %p", (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end, - lkowner_utoa (&lock->owner), lock->transport); + lkowner_utoa (&lock->owner), lock->client); goto out; } __delete_inode_lock (conf); @@ -300,7 +302,7 @@ __inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom) " Matching lock found for unlock %llu-%llu, by %s on %p", (unsigned long long)lock->fl_start, (unsigned long long)lock->fl_end, lkowner_utoa (&lock->owner), - lock->transport); + lock->client); out: return conf; @@ -333,7 +335,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, /* Grant all inodelks blocked on a lock */ void -grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom) +grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom) { struct list_head granted; pl_inode_lock_t *lock; @@ -372,10 +375,10 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t * pthread_mutex_unlock (&pl_inode->mutex); } -/* Release all inodelks from this transport */ +/* Release all inodelks from this client */ static int -release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, - inode_t *inode, void *trans) +release_inode_locks_of_client (xlator_t *this, pl_dom_list_t *dom, + inode_t *inode, client_t *client) { pl_inode_lock_t *tmp = NULL; pl_inode_lock_t *l = NULL; @@ -395,7 +398,7 @@ release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, { list_for_each_entry_safe (l, tmp, &dom->blocked_inodelks, blocked_locks) { - if (l->transport != trans) + if (l->client != client) continue; list_del_init (&l->blocked_locks); @@ -408,8 +411,8 @@ release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, gf_log (this->name, GF_LOG_DEBUG, "releasing blocking lock on %s held by " - "{transport=%p, pid=%"PRId64" lk-owner=%s}", - file, trans, (uint64_t) l->client_pid, + "{client=%p, pid=%"PRId64" lk-owner=%s}", + file, client, (uint64_t) l->client_pid, lkowner_utoa (&l->owner)); list_add (&l->blocked_locks, &released); @@ -420,7 +423,7 @@ release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, } list_for_each_entry_safe (l, tmp, &dom->inodelk_list, list) { - if (l->transport != trans) + if (l->client != client) continue; inode_path (inode, NULL, &path); @@ -431,8 +434,8 @@ release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom, gf_log (this->name, GF_LOG_DEBUG, "releasing granted lock on %s held by " - "{transport=%p, pid=%"PRId64" lk-owner=%s}", - file, trans, (uint64_t) l->client_pid, + "{client=%p, pid=%"PRId64" lk-owner=%s}", + file, client, (uint64_t) l->client_pid, lkowner_utoa (&l->owner)); if (path) { @@ -515,8 +518,9 @@ out: /* Create a new inode_lock_t */ pl_inode_lock_t * -new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid, - gf_lkowner_t *owner, const char *volume) +new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid, + call_frame_t *frame, xlator_t *this, const char *volume, + char *conn_id) { pl_inode_lock_t *lock = NULL; @@ -535,10 +539,16 @@ new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid, else lock->fl_end = flock->l_start + flock->l_len - 1; - lock->transport = transport; + lock->client = client; lock->client_pid = client_pid; lock->volume = volume; - lock->owner = *owner; + lock->owner = frame->root->lk_owner; + lock->frame = frame; + lock->this = this; + + if (conn_id) { + lock->connection_id = gf_strdup (conn_id); + } INIT_LIST_HEAD (&lock->list); INIT_LIST_HEAD (&lock->blocked_locks); @@ -547,21 +557,58 @@ new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid, return lock; } +int32_t +_pl_convert_volume (const char *volume, char **res) +{ + char *mdata_vol = NULL; + int ret = 0; + + mdata_vol = strrchr (volume, ':'); + //if the volume already ends with :metadata don't bother + if (mdata_vol && (strcmp (mdata_vol, ":metadata") == 0)) + return 0; + + ret = gf_asprintf (res, "%s:metadata", volume); + if (ret <= 0) + return ENOMEM; + return 0; +} + +int32_t +_pl_convert_volume_for_special_range (struct gf_flock *flock, + const char *volume, char **res) +{ + int32_t ret = 0; + + if ((flock->l_start == LLONG_MAX -1) && + (flock->l_len == 0)) { + ret = _pl_convert_volume (volume, res); + } + + return ret; +} + /* Common inodelk code called from pl_inodelk and pl_finodelk */ int pl_common_inodelk (call_frame_t *frame, xlator_t *this, const char *volume, inode_t *inode, int32_t cmd, - struct gf_flock *flock, loc_t *loc, fd_t *fd) + struct gf_flock *flock, loc_t *loc, fd_t *fd, dict_t *xdata) { int32_t op_ret = -1; int32_t op_errno = 0; int ret = -1; + GF_UNUSED int dict_ret = -1; int can_block = 0; - pid_t client_pid = -1; - void * transport = NULL; pl_inode_t * pinode = NULL; pl_inode_lock_t * reqlock = NULL; pl_dom_list_t * dom = NULL; + char *res = NULL; + char *res1 = NULL; + char *conn_id = NULL; + pl_ctx_t *ctx = NULL; + + if (xdata) + dict_ret = dict_get_str (xdata, "connection-id", &conn_id); VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (inode, unwind); @@ -572,10 +619,13 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this, goto unwind; } - pl_trace_in (this, frame, fd, loc, cmd, flock, volume); + op_errno = _pl_convert_volume_for_special_range (flock, volume, &res); + if (op_errno) + goto unwind; + if (res) + volume = res; - transport = frame->root->trans; - client_pid = frame->root->pid; + pl_trace_in (this, frame, fd, loc, cmd, flock, volume); pinode = pl_inode_get (this, inode); if (!pinode) { @@ -592,19 +642,26 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this, if (frame->root->lk_owner.len == 0) { /* special case: this means release all locks - from this transport + from this client */ gf_log (this->name, GF_LOG_TRACE, - "Releasing all locks from transport %p", transport); - - release_inode_locks_of_transport (this, dom, inode, transport); + "Releasing all locks from client %p", frame->root->client); + + release_inode_locks_of_client (this, dom, inode, frame->root->client); + _pl_convert_volume (volume, &res1); + if (res1) { + dom = get_domain (pinode, res1); + if (dom) + release_inode_locks_of_client (this, dom, + inode, frame->root->client); + } op_ret = 0; goto unwind; } - reqlock = new_inode_lock (flock, transport, client_pid, - &frame->root->lk_owner, volume); + reqlock = new_inode_lock (flock, frame->root->client, frame->root->pid, + frame, this, volume, conn_id); if (!reqlock) { op_ret = -1; @@ -612,14 +669,10 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this, goto unwind; } - reqlock->frame = frame; - reqlock->this = this; switch (cmd) { case F_SETLKW: can_block = 1; - reqlock->frame = frame; - reqlock->this = this; /* fall through */ @@ -651,6 +704,23 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this, op_ret = 0; + ctx = pl_ctx_get (frame->root->client, this); + + if (ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto unwind; + } + + if (flock->l_type == F_UNLCK) + pl_del_locker (ctx->ltable, volume, loc, fd, + &frame->root->lk_owner, + GF_FOP_INODELK); + else + pl_add_locker (ctx->ltable, volume, loc, fd, + frame->root->pid, + &frame->root->lk_owner, + GF_FOP_INODELK); + unwind: if ((inode != NULL) && (flock !=NULL)) { pl_update_refkeeper (this, inode); @@ -659,53 +729,78 @@ unwind: STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, NULL); out: + GF_FREE (res); + GF_FREE (res1); return 0; } int pl_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock) + const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock, + dict_t *xdata) { - - pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock, loc, NULL); + pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock, + loc, NULL, xdata); return 0; } int pl_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock) + const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, + dict_t *xdata) { - - pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock, NULL, fd); + pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock, + NULL, fd, xdata); return 0; } +static inline int32_t +__get_inodelk_dom_count (pl_dom_list_t *dom) +{ + pl_inode_lock_t *lock = NULL; + int32_t count = 0; + + list_for_each_entry (lock, &dom->inodelk_list, list) { + count++; + } + list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { + count++; + } + return count; +} +/* Returns the no. of locks (blocked/granted) held on a given domain name + * If @domname is NULL, returns the no. of locks in all the domains present. + * If @domname is non-NULL and non-existent, returns 0 */ int32_t -__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode) +__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname) { int32_t count = 0; - pl_inode_lock_t *lock = NULL; pl_dom_list_t *dom = NULL; list_for_each_entry (dom, &pl_inode->dom_list, inode_list) { - list_for_each_entry (lock, &dom->inodelk_list, list) { - count++; - } - list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) { - count++; - } + if (domname) { + if (strcmp (domname, dom->domain) == 0) { + count = __get_inodelk_dom_count (dom); + goto out; + } + + } else { + /* Counting locks from all domains */ + count += __get_inodelk_dom_count (dom); + } } +out: return count; } int32_t -get_inodelk_count (xlator_t *this, inode_t *inode) +get_inodelk_count (xlator_t *this, inode_t *inode, char *domname) { pl_inode_t *pl_inode = NULL; uint64_t tmp_pl_inode = 0; @@ -721,7 +816,7 @@ get_inodelk_count (xlator_t *this, inode_t *inode) pthread_mutex_lock (&pl_inode->mutex); { - count = __get_inodelk_count (this, pl_inode); + count = __get_inodelk_count (this, pl_inode, domname); } pthread_mutex_unlock (&pl_inode->mutex); diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h index 7ffc67e1b..76fc941d7 100644 --- a/xlators/features/locks/src/locks.h +++ b/xlators/features/locks/src/locks.h @@ -19,10 +19,10 @@ #include "stack.h" #include "call-stub.h" #include "locks-mem-types.h" +#include "client_t.h" #include "lkowner.h" -#define POSIX_LOCKS "posix-locks" struct __pl_fd; struct __posix_lock { @@ -33,7 +33,7 @@ struct __posix_lock { off_t fl_end; short blocked; /* waiting to acquire */ - struct gf_flock user_flock; /* the flock supplied by the user */ + struct gf_flock user_flock; /* the flock supplied by the user */ xlator_t *this; /* required for blocked locks */ unsigned long fd_num; @@ -46,7 +46,7 @@ struct __posix_lock { /* These two together serve to uniquely identify each process across nodes */ - void *transport; /* to identify client node */ + void *client; /* to identify client node */ gf_lkowner_t owner; pid_t client_pid; /* pid of client process */ }; @@ -63,7 +63,7 @@ struct __pl_inode_lock { const char *volume; - struct gf_flock user_flock; /* the flock supplied by the user */ + struct gf_flock user_flock; /* the flock supplied by the user */ xlator_t *this; /* required for blocked locks */ fd_t *fd; @@ -75,9 +75,11 @@ struct __pl_inode_lock { /* These two together serve to uniquely identify each process across nodes */ - void *transport; /* to identify client node */ + void *client; /* to identify client node */ gf_lkowner_t owner; pid_t client_pid; /* pid of client process */ + + char *connection_id; /* stores the client connection id */ }; typedef struct __pl_inode_lock pl_inode_lock_t; @@ -115,7 +117,9 @@ struct __entry_lock { void *trans; gf_lkowner_t owner; - pid_t client_pid; /* pid of client process */ + pid_t client_pid; /* pid of client process */ + + char *connection_id; /* stores the client connection id */ }; typedef struct __entry_lock pl_entry_lock_t; @@ -152,9 +156,11 @@ typedef struct { char *brickname; } posix_locks_private_t; + typedef struct { gf_boolean_t entrylk_count_req; gf_boolean_t inodelk_count_req; + gf_boolean_t inodelk_dom_count_req; gf_boolean_t posixlk_count_req; gf_boolean_t parent_entrylk_req; @@ -166,8 +172,21 @@ typedef struct { enum {TRUNCATE, FTRUNCATE} op; } pl_local_t; + typedef struct { struct list_head locks_list; } pl_fdctx_t; + +typedef struct _locks_ctx { + gf_lock_t ltable_lock; /* only for replace, + ltable has its own internal + lock for operations */ + struct _lock_table *ltable; +} pl_ctx_t; + + +pl_ctx_t * +pl_ctx_get (client_t *client, xlator_t *xlator); + #endif /* __POSIX_LOCKS_H__ */ diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c index 2bc5f8581..7bfb38a51 100644 --- a/xlators/features/locks/src/posix.c +++ b/xlators/features/locks/src/posix.c @@ -51,7 +51,7 @@ pl_new_fdctx () fdctx = GF_CALLOC (1, sizeof (*fdctx), gf_locks_mt_pl_fdctx_t); - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, fdctx, out); + GF_VALIDATE_OR_GOTO ("posix-locks", fdctx, out); INIT_LIST_HEAD (&fdctx->locks_list); @@ -66,7 +66,7 @@ pl_check_n_create_fdctx (xlator_t *this, fd_t *fd) uint64_t tmp = 0; pl_fdctx_t *fdctx = NULL; - GF_VALIDATE_OR_GOTO (POSIX_LOCKS, this, out); + GF_VALIDATE_OR_GOTO ("posix-locks", this, out); GF_VALIDATE_OR_GOTO (this->name, fd, out); LOCK (&fd->lock); @@ -119,7 +119,7 @@ pl_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int truncate_allowed (pl_inode_t *pl_inode, - void *transport, pid_t client_pid, + client_t *client, pid_t client_pid, gf_lkowner_t *owner, off_t offset) { posix_lock_t *l = NULL; @@ -128,7 +128,7 @@ truncate_allowed (pl_inode_t *pl_inode, region.fl_start = offset; region.fl_end = LLONG_MAX; - region.transport = transport; + region.client = client; region.client_pid = client_pid; region.owner = *owner; @@ -139,7 +139,7 @@ truncate_allowed (pl_inode_t *pl_inode, && locks_overlap (®ion, l) && !same_owner (®ion, l)) { ret = 0; - gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Truncate " + gf_log ("posix-locks", GF_LOG_TRACE, "Truncate " "allowed"); break; } @@ -186,7 +186,7 @@ truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (priv->mandatory && pl_inode->mandatory - && !truncate_allowed (pl_inode, frame->root->trans, + && !truncate_allowed (pl_inode, frame->root->client, frame->root->pid, &frame->root->lk_owner, local->offset)) { op_ret = -1; @@ -347,7 +347,7 @@ delete_locks_of_fd (xlator_t *this, pl_inode_t *pl_inode, fd_t *fd) static void __delete_locks_of_owner (pl_inode_t *pl_inode, - void *transport, gf_lkowner_t *owner) + client_t *client, gf_lkowner_t *owner) { posix_lock_t *tmp = NULL; posix_lock_t *l = NULL; @@ -357,7 +357,7 @@ __delete_locks_of_owner (pl_inode_t *pl_inode, list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) { if (l->blocked) continue; - if ((l->transport == transport) && + if ((l->client == client) && is_same_lkowner (&l->owner, owner)) { gf_log ("posix-locks", GF_LOG_TRACE, " Flushing lock" @@ -810,7 +810,7 @@ pl_migrate_locks (call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num, list_for_each_entry (l, &pl_inode->ext_list, list) { if (l->fd_num == oldfd_num) { l->fd_num = newfd_num; - l->transport = frame->root->trans; + l->client = frame->root->client; } } } @@ -983,7 +983,7 @@ pl_flush (call_frame_t *frame, xlator_t *this, } pthread_mutex_lock (&pl_inode->mutex); { - __delete_locks_of_owner (pl_inode, frame->root->trans, + __delete_locks_of_owner (pl_inode, frame->root->client, &frame->root->lk_owner); } pthread_mutex_unlock (&pl_inode->mutex); @@ -1178,7 +1178,7 @@ pl_readv (call_frame_t *frame, xlator_t *this, if (priv->mandatory && pl_inode->mandatory) { region.fl_start = offset; region.fl_end = offset + size - 1; - region.transport = frame->root->trans; + region.client = frame->root->client; region.fd_num = fd_to_fdnum(fd); region.client_pid = frame->root->pid; region.owner = frame->root->lk_owner; @@ -1272,7 +1272,7 @@ pl_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, if (priv->mandatory && pl_inode->mandatory) { region.fl_start = offset; region.fl_end = offset + iov_length (vector, count) - 1; - region.transport = frame->root->trans; + region.client = frame->root->client; region.fd_num = fd_to_fdnum(fd); region.client_pid = frame->root->pid; region.owner = frame->root->lk_owner; @@ -1353,7 +1353,7 @@ lock_dup (posix_lock_t *lock) { posix_lock_t *new_lock = NULL; - new_lock = new_posix_lock (&lock->user_flock, lock->transport, + new_lock = new_posix_lock (&lock->user_flock, lock->client, lock->client_pid, &lock->owner, (fd_t *)lock->fd_num); return new_lock; @@ -1513,8 +1513,6 @@ int pl_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - void *transport = NULL; - pid_t client_pid = 0; pl_inode_t *pl_inode = NULL; int op_ret = 0; int op_errno = 0; @@ -1523,9 +1521,6 @@ pl_lk (call_frame_t *frame, xlator_t *this, posix_lock_t *conf = NULL; int ret = 0; - transport = frame->root->trans; - client_pid = frame->root->pid; - if ((flock->l_start < 0) || (flock->l_len < 0)) { op_ret = -1; op_errno = EINVAL; @@ -1539,7 +1534,7 @@ pl_lk (call_frame_t *frame, xlator_t *this, goto unwind; } - reqlock = new_posix_lock (flock, transport, client_pid, + reqlock = new_posix_lock (flock, frame->root->client, frame->root->pid, &frame->root->lk_owner, fd); if (!reqlock) { @@ -1764,6 +1759,7 @@ pl_forget (xlator_t *this, list_del_init (&entry_l->domain_list); GF_FREE ((char *)entry_l->basename); + GF_FREE (entry_l->connection_id); GF_FREE (entry_l); } @@ -1797,6 +1793,7 @@ pl_forget (xlator_t *this, STACK_UNWIND_STRICT (entrylk, entry_l->frame, -1, 0, NULL); GF_FREE ((char *)entry_l->basename); + GF_FREE (entry_l->connection_id); GF_FREE (entry_l); } @@ -1946,19 +1943,34 @@ pl_entrylk_xattr_fill (xlator_t *this, inode_t *inode, } void -pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode, - dict_t *dict) +pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode, dict_t *dict, + gf_boolean_t per_dom) { int32_t count = 0; int ret = -1; + char *domname = NULL; + + + if (per_dom){ + ret = dict_get_str (dict, GLUSTERFS_INODELK_DOM_COUNT, + &domname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "value for key %s",GLUSTERFS_INODELK_DOM_COUNT); + goto out; + } + } + + count = get_inodelk_count (this, inode, domname); - count = get_inodelk_count (this, inode); ret = dict_set_int32 (dict, GLUSTERFS_INODELK_COUNT, count); if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - " dict_set failed on key %s", GLUSTERFS_INODELK_COUNT); + gf_log (this->name, GF_LOG_DEBUG, "Failed to set count for " + "key %s", GLUSTERFS_INODELK_COUNT); } +out: + return; } void @@ -2003,7 +2015,9 @@ pl_lookup_cbk (call_frame_t *frame, if (local->entrylk_count_req) pl_entrylk_xattr_fill (this, inode, xdata); if (local->inodelk_count_req) - pl_inodelk_xattr_fill (this, inode, xdata); + pl_inodelk_xattr_fill (this, inode, xdata, _gf_false); + if (local->inodelk_dom_count_req) + pl_inodelk_xattr_fill (this, inode, xdata, _gf_true); if (local->posixlk_count_req) pl_posixlk_xattr_fill (this, inode, xdata); @@ -2050,6 +2064,8 @@ pl_lookup (call_frame_t *frame, local->entrylk_count_req = 1; if (dict_get (xdata, GLUSTERFS_INODELK_COUNT)) local->inodelk_count_req = 1; + if (dict_get (xdata, GLUSTERFS_INODELK_DOM_COUNT)) + local->inodelk_dom_count_req = 1; if (dict_get (xdata, GLUSTERFS_POSIXLK_COUNT)) local->posixlk_count_req = 1; if (dict_get (xdata, GLUSTERFS_PARENT_ENTRYLK)) @@ -2088,7 +2104,11 @@ pl_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->entrylk_count_req) pl_entrylk_xattr_fill (this, entry->inode, entry->dict); if (local->inodelk_count_req) - pl_inodelk_xattr_fill (this, entry->inode, entry->dict); + pl_inodelk_xattr_fill (this, entry->inode, entry->dict, + _gf_false); + if (local->inodelk_dom_count_req) + pl_inodelk_xattr_fill (this, entry->inode, entry->dict, + _gf_true); if (local->posixlk_count_req) pl_posixlk_xattr_fill (this, entry->inode, entry->dict); } @@ -2117,6 +2137,8 @@ pl_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, local->entrylk_count_req = 1; if (dict_get (dict, GLUSTERFS_INODELK_COUNT)) local->inodelk_count_req = 1; + if (dict_get (dict, GLUSTERFS_INODELK_DOM_COUNT)) + local->inodelk_dom_count_req = 1; if (dict_get (dict, GLUSTERFS_POSIXLK_COUNT)) local->posixlk_count_req = 1; } @@ -2136,8 +2158,8 @@ out: void pl_dump_lock (char *str, int size, struct gf_flock *flock, - gf_lkowner_t *owner, void *trans, time_t *granted_time, - time_t *blkd_time, gf_boolean_t active) + gf_lkowner_t *owner, void *trans, char *conn_id, + time_t *granted_time, time_t *blkd_time, gf_boolean_t active) { char *type_str = NULL; char granted[32] = {0,}; @@ -2165,7 +2187,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, + lkowner_utoa (owner), trans, conn_id, ctime_r (granted_time, granted)); } else { snprintf (str, size, RANGE_BLKD_GRNTD_FMT, @@ -2173,7 +2195,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, + lkowner_utoa (owner), trans, conn_id, ctime_r (blkd_time, blocked), ctime_r (granted_time, granted)); } @@ -2184,7 +2206,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock, (unsigned long long) flock->l_start, (unsigned long long) flock->l_len, (unsigned long long) flock->l_pid, - lkowner_utoa (owner), trans, + lkowner_utoa (owner), trans, conn_id, ctime_r (blkd_time, blocked)); } @@ -2222,6 +2244,7 @@ __dump_entrylks (pl_inode_t *pl_inode) "ENTRYLK_WRLCK", lock->basename, (unsigned long long) lock->client_pid, lkowner_utoa (&lock->owner), lock->trans, + lock->connection_id, ctime_r (&lock->granted_time.tv_sec, granted)); } else { snprintf (tmp, 256, ENTRY_BLKD_GRNTD_FMT, @@ -2229,6 +2252,7 @@ __dump_entrylks (pl_inode_t *pl_inode) "ENTRYLK_WRLCK", lock->basename, (unsigned long long) lock->client_pid, lkowner_utoa (&lock->owner), lock->trans, + lock->connection_id, ctime_r (&lock->blkd_time.tv_sec, blocked), ctime_r (&lock->granted_time.tv_sec, granted)); } @@ -2248,6 +2272,7 @@ __dump_entrylks (pl_inode_t *pl_inode) "ENTRYLK_WRLCK", lock->basename, (unsigned long long) lock->client_pid, lkowner_utoa (&lock->owner), lock->trans, + lock->connection_id, ctime_r (&lock->blkd_time.tv_sec, blocked)); gf_proc_dump_write(key, tmp); @@ -2298,7 +2323,7 @@ __dump_inodelks (pl_inode_t *pl_inode) SET_FLOCK_PID (&lock->user_flock, lock); pl_dump_lock (tmp, 256, &lock->user_flock, &lock->owner, - lock->transport, + lock->client, lock->connection_id, &lock->granted_time.tv_sec, &lock->blkd_time.tv_sec, _gf_true); @@ -2315,7 +2340,7 @@ __dump_inodelks (pl_inode_t *pl_inode) SET_FLOCK_PID (&lock->user_flock, lock); pl_dump_lock (tmp, 256, &lock->user_flock, &lock->owner, - lock->transport, + lock->client, lock->connection_id, 0, &lock->blkd_time.tv_sec, _gf_false); gf_proc_dump_write(key, tmp); @@ -2356,7 +2381,7 @@ __dump_posixlks (pl_inode_t *pl_inode) count, lock->blocked ? "BLOCKED" : "ACTIVE"); pl_dump_lock (tmp, 256, &lock->user_flock, - &lock->owner, lock->transport, + &lock->owner, lock->client, NULL, &lock->granted_time.tv_sec, &lock->blkd_time.tv_sec, (lock->blocked)? _gf_false: _gf_true); gf_proc_dump_write(key, tmp); @@ -2433,7 +2458,7 @@ unlock: __dump_entrylks (pl_inode); } - count = __get_inodelk_count (this, pl_inode); + count = __get_inodelk_count (this, pl_inode, NULL); if (count) { gf_proc_dump_write("inodelk-count", "%d", count); __dump_inodelks (pl_inode); @@ -2480,6 +2505,124 @@ mem_acct_init (xlator_t *this) return ret; } + +pl_ctx_t* +pl_ctx_get (client_t *client, xlator_t *xlator) +{ + void *tmp = NULL; + pl_ctx_t *ctx = NULL; + + client_ctx_get (client, xlator, &tmp); + + ctx = tmp; + + if (ctx != NULL) + goto out; + + ctx = GF_CALLOC (1, sizeof (pl_ctx_t), gf_locks_mt_posix_lock_t); + + if (ctx == NULL) + goto out; + + ctx->ltable = pl_lock_table_new(); + + if (ctx->ltable == NULL) { + GF_FREE (ctx); + ctx = NULL; + goto out; + } + + LOCK_INIT (&ctx->ltable_lock); + + if (client_ctx_set (client, xlator, ctx) != 0) { + LOCK_DESTROY (&ctx->ltable_lock); + GF_FREE (ctx->ltable); + GF_FREE (ctx); + ctx = NULL; + } +out: + return ctx; +} + +static void +ltable_delete_locks (struct _lock_table *ltable) +{ + struct _locker *locker = NULL; + struct _locker *tmp = NULL; + + list_for_each_entry_safe (locker, tmp, <able->inodelk_lockers, lockers) { + if (locker->fd) + pl_del_locker (ltable, locker->volume, &locker->loc, + locker->fd, &locker->owner, + GF_FOP_INODELK); + GF_FREE (locker->volume); + GF_FREE (locker); + } + + list_for_each_entry_safe (locker, tmp, <able->entrylk_lockers, lockers) { + if (locker->fd) + pl_del_locker (ltable, locker->volume, &locker->loc, + locker->fd, &locker->owner, + GF_FOP_ENTRYLK); + GF_FREE (locker->volume); + GF_FREE (locker); + } + GF_FREE (ltable); +} + + +static int32_t +destroy_cbk (xlator_t *this, client_t *client) +{ + void *tmp = NULL; + pl_ctx_t *locks_ctx = NULL; + + client_ctx_del (client, this, &tmp); + + if (tmp == NULL) + return 0 +; + locks_ctx = tmp; + if (locks_ctx->ltable) + ltable_delete_locks (locks_ctx->ltable); + + LOCK_DESTROY (&locks_ctx->ltable_lock); + GF_FREE (locks_ctx); + + return 0; +} + + +static int32_t +disconnect_cbk (xlator_t *this, client_t *client) +{ + int32_t ret = 0; + pl_ctx_t *locks_ctx = NULL; + struct _lock_table *ltable = NULL; + + locks_ctx = pl_ctx_get (client, this); + if (locks_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed"); + goto out; + } + + LOCK (&locks_ctx->ltable_lock); + { + if (locks_ctx->ltable) { + ltable = locks_ctx->ltable; + locks_ctx->ltable = pl_lock_table_new (); + } + } + UNLOCK (&locks_ctx->ltable_lock); + + if (ltable) + ltable_delete_locks (ltable); + +out: + return ret; +} + + int init (xlator_t *this) { @@ -2508,7 +2651,7 @@ init (xlator_t *this) gf_log (this->name, GF_LOG_CRITICAL, "'locks' translator is not loaded over a storage " "translator"); - goto out;; + goto out; } priv = GF_CALLOC (1, sizeof (*priv), @@ -2610,9 +2753,11 @@ struct xlator_dumpops dumpops = { }; struct xlator_cbks cbks = { - .forget = pl_forget, - .release = pl_release, - .releasedir = pl_releasedir, + .forget = pl_forget, + .release = pl_release, + .releasedir = pl_releasedir, + .client_destroy = destroy_cbk, + .client_disconnect = disconnect_cbk, }; diff --git a/xlators/features/marker/Makefile.am b/xlators/features/marker/Makefile.am index a6ba2de16..a985f42a8 100644 --- a/xlators/features/marker/Makefile.am +++ b/xlators/features/marker/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = src @SYNCDAEMON_SUBDIR@ +SUBDIRS = src CLEANFILES = diff --git a/xlators/features/marker/src/marker.c b/xlators/features/marker/src/marker.c index 82d9066d5..6a2c85691 100644 --- a/xlators/features/marker/src/marker.c +++ b/xlators/features/marker/src/marker.c @@ -185,6 +185,8 @@ marker_local_unref (marker_local_t *local) loc_wipe (&local->loc); loc_wipe (&local->parent_loc); + if (local->xdata) + dict_unref (local->xdata); if (local->oplocal) { marker_local_unref (local->oplocal); @@ -466,11 +468,16 @@ marker_create_frame (xlator_t *this, marker_local_t *local) int32_t marker_xtime_update_marks (xlator_t *this, marker_local_t *local) { + marker_conf_t *priv = NULL; + GF_VALIDATE_OR_GOTO ("marker", this, out); GF_VALIDATE_OR_GOTO (this->name, local, out); - if ((local->pid == GF_CLIENT_PID_GSYNCD) || - (local->pid == GF_CLIENT_PID_DEFRAG)) + priv = this->private; + + if ((local->pid == GF_CLIENT_PID_GSYNCD + && !(priv->feature_enabled & GF_XTIME_GSYNC_FORCE)) + || (local->pid == GF_CLIENT_PID_DEFRAG)) goto out; marker_gettimeofday (local); @@ -833,7 +840,7 @@ marker_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, STACK_WIND (frame, marker_unlink_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, - NULL); + local->xdata); return 0; err: frame->local = NULL; @@ -858,6 +865,8 @@ marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, local = mem_get0 (this->local_pool); local->xflag = xflag; + if (xdata) + local->xdata = dict_ref (xdata); MARKER_INIT_LOCAL (frame, local); ret = loc_copy (&local->loc, loc); @@ -1808,6 +1817,210 @@ err: } +int32_t +marker_fallocate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_TRACE, "%s occurred while " + "fallocating a file ", strerror (op_errno)); + } + + local = (marker_local_t *) frame->local; + + frame->local = NULL; + + STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + + if (op_ret == -1 || local == NULL) + goto out; + + priv = this->private; + + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn (this, &local->loc); + + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks (this, local); +out: + marker_local_unref (local); + + return 0; +} + +int32_t +marker_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + priv = this->private; + + if (priv->feature_enabled == 0) + goto wind; + + local = mem_get0 (this->local_pool); + + MARKER_INIT_LOCAL (frame, local); + + ret = marker_inode_loc_fill (fd->inode, &local->loc); + + if (ret == -1) + goto err; +wind: + STACK_WIND (frame, marker_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; +err: + STACK_UNWIND_STRICT (fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + + +int32_t +marker_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_TRACE, "%s occurred during discard", + strerror (op_errno)); + } + + local = (marker_local_t *) frame->local; + + frame->local = NULL; + + STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + + if (op_ret == -1 || local == NULL) + goto out; + + priv = this->private; + + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn (this, &local->loc); + + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks (this, local); +out: + marker_local_unref (local); + + return 0; +} + +int32_t +marker_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + priv = this->private; + + if (priv->feature_enabled == 0) + goto wind; + + local = mem_get0 (this->local_pool); + + MARKER_INIT_LOCAL (frame, local); + + ret = marker_inode_loc_fill (fd->inode, &local->loc); + + if (ret == -1) + goto err; +wind: + STACK_WIND (frame, marker_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; +err: + STACK_UNWIND_STRICT (discard, frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + +int32_t +marker_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_TRACE, "%s occurred during zerofill", + strerror (op_errno)); + } + + local = (marker_local_t *) frame->local; + + frame->local = NULL; + + STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + + if (op_ret == -1 || local == NULL) + goto out; + + priv = this->private; + + if (priv->feature_enabled & GF_QUOTA) + mq_initiate_quota_txn (this, &local->loc); + + if (priv->feature_enabled & GF_XTIME) + marker_xtime_update_marks (this, local); +out: + marker_local_unref (local); + + return 0; +} + +int32_t +marker_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 0; + marker_local_t *local = NULL; + marker_conf_t *priv = NULL; + + priv = this->private; + + if (priv->feature_enabled == 0) + goto wind; + + local = mem_get0 (this->local_pool); + + MARKER_INIT_LOCAL (frame, local); + + ret = marker_inode_loc_fill (fd->inode, &local->loc); + + if (ret == -1) + goto err; +wind: + STACK_WIND (frame, marker_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; +err: + STACK_UNWIND_STRICT (zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + + /* when a call from the special client is received on * key trusted.glusterfs.volume-mark with value "RESET" * or if the value is 0length, update the change the @@ -2452,7 +2665,7 @@ out: int32_t reconfigure (xlator_t *this, dict_t *options) { - int32_t ret = -1; + int32_t ret = 0; data_t *data = NULL; gf_boolean_t flag = _gf_false; marker_conf_t *priv = NULL; @@ -2493,11 +2706,17 @@ reconfigure (xlator_t *this, dict_t *options) "xtime updation will fail"); } else { priv->feature_enabled |= GF_XTIME; + data = dict_get (options, "gsync-force-xtime"); + if (!data) + goto out; + ret = gf_string2boolean (data->data, &flag); + if (ret == 0 && flag) + priv->feature_enabled |= GF_XTIME_GSYNC_FORCE; } } } out: - return 0; + return ret; } @@ -2553,9 +2772,16 @@ init (xlator_t *this) goto err; priv->feature_enabled |= GF_XTIME; + data = dict_get (options, "gsync-force-xtime"); + if (!data) + goto cont; + ret = gf_string2boolean (data->data, &flag); + if (ret == 0 && flag) + priv->feature_enabled |= GF_XTIME_GSYNC_FORCE; } } + cont: this->local_pool = mem_pool_new (marker_local_t, 128); if (!this->local_pool) { gf_log (this->name, GF_LOG_ERROR, @@ -2617,6 +2843,9 @@ struct xlator_fops fops = { .removexattr = marker_removexattr, .getxattr = marker_getxattr, .readdirp = marker_readdirp, + .fallocate = marker_fallocate, + .discard = marker_discard, + .zerofill = marker_zerofill, }; struct xlator_cbks cbks = { @@ -2628,5 +2857,6 @@ struct volume_options options[] = { {.key = {"timestamp-file"}}, {.key = {"quota"}}, {.key = {"xtime"}}, + {.key = {"gsync-force-xtime"}}, {.key = {NULL}} }; diff --git a/xlators/features/marker/src/marker.h b/xlators/features/marker/src/marker.h index 63491ab37..1a58f8cfc 100644 --- a/xlators/features/marker/src/marker.h +++ b/xlators/features/marker/src/marker.h @@ -28,8 +28,9 @@ #define TIMESTAMP_FILE "timestamp-file" enum { - GF_QUOTA=1, - GF_XTIME=2 + GF_QUOTA = 1, + GF_XTIME = 2, + GF_XTIME_GSYNC_FORCE = 4, }; /*initialize the local variable*/ @@ -110,6 +111,7 @@ struct marker_local{ inode_contribution_t *contri; int xflag; + dict_t *xdata; }; typedef struct marker_local marker_local_t; diff --git a/xlators/features/marker/utils/Makefile.am b/xlators/features/marker/utils/Makefile.am deleted file mode 100644 index 556951d9f..000000000 --- a/xlators/features/marker/utils/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = syncdaemon src - -CLEANFILES = diff --git a/xlators/features/marker/utils/src/Makefile.am b/xlators/features/marker/utils/src/Makefile.am deleted file mode 100644 index 9e410cda6..000000000 --- a/xlators/features/marker/utils/src/Makefile.am +++ /dev/null @@ -1,26 +0,0 @@ -gsyncddir = $(libexecdir)/glusterfs - -gsyncd_PROGRAMS = gsyncd - -gsyncd_SOURCES = gsyncd.c procdiggy.c - -gsyncd_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ - $(GF_GLUSTERFS_LIBS) - -gsyncd_LDFLAGS = $(GF_LDFLAGS) - -noinst_HEADERS = procdiggy.h - -AM_CPPFLAGS = $(GF_CPPFLAGS) \ - -I$(top_srcdir)/libglusterfs/src\ - -DGSYNCD_PREFIX=\"$(libexecdir)/glusterfs\"\ - -DUSE_LIBGLUSTERFS\ - -DSBIN_DIR=\"$(sbindir)\" -DPYTHON=\"$(PYTHON)\" - -AM_CFLAGS = -Wall $(GF_CFLAGS) - - -CLEANFILES = - -$(top_builddir)/libglusterfs/src/libglusterfs.la: - $(MAKE) -C $(top_builddir)/libglusterfs/src/ all diff --git a/xlators/features/marker/utils/src/gsyncd.c b/xlators/features/marker/utils/src/gsyncd.c deleted file mode 100644 index 9c4a5bdff..000000000 --- a/xlators/features/marker/utils/src/gsyncd.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <string.h> -#include <sys/param.h> /* for PATH_MAX */ - -/* NOTE (USE_LIBGLUSTERFS): - * ------------------------ - * When USE_LIBGLUSTERFS debugging sumbol is passed; perform - * glusterfs translator like initialization so that glusterfs - * globals, contexts are valid when glustefs api's are invoked. - * We unconditionally pass then while building gsyncd binary. - */ -#ifdef USE_LIBGLUSTERFS -#include "glusterfs.h" -#include "globals.h" -#endif - -#include "common-utils.h" -#include "run.h" -#include "procdiggy.h" - -#define _GLUSTERD_CALLED_ "_GLUSTERD_CALLED_" -#define _GSYNCD_DISPATCHED_ "_GSYNCD_DISPATCHED_" -#define GSYNCD_CONF "geo-replication/gsyncd.conf" -#define GSYNCD_PY "gsyncd.py" -#define RSYNC "rsync" - -int restricted = 0; - -static int -duplexpand (void **buf, size_t tsiz, size_t *len) -{ - size_t osiz = tsiz * *len; - char *p = realloc (*buf, osiz << 1); - if (!p) { - free(*buf); - return -1; - } - - memset (p + osiz, 0, osiz); - *buf = p; - *len <<= 1; - - return 0; -} - -static int -str2argv (char *str, char ***argv) -{ - char *p = NULL; - char *savetok = NULL; - int argc = 0; - size_t argv_len = 32; - int ret = 0; - - assert (str); - str = strdup (str); - if (!str) - return -1; - - *argv = calloc (argv_len, sizeof (**argv)); - if (!*argv) - goto error; - - while ((p = strtok_r (str, " ", &savetok))) { - str = NULL; - - argc++; - if (argc == argv_len) { - ret = duplexpand ((void *)argv, - sizeof (**argv), - &argv_len); - if (ret == -1) - goto error; - } - (*argv)[argc - 1] = p; - } - - return argc; - - error: - fprintf (stderr, "out of memory\n"); - return -1; -} - -static int -invoke_gsyncd (int argc, char **argv) -{ - char config_file[PATH_MAX] = {0,}; - size_t gluster_workdir_len = 0; - runner_t runner = {0,}; - int i = 0; - int j = 0; - char *nargv[argc + 4]; - char *python = NULL; - - if (restricted) { - size_t len; - /* in restricted mode we forcibly use the system-wide config */ - runinit (&runner); - runner_add_args (&runner, SBIN_DIR"/gluster", - "--log-file=-", "system::", "getwd", - NULL); - runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); - if (runner_start (&runner) == 0 && - fgets (config_file, PATH_MAX, - runner_chio (&runner, STDOUT_FILENO)) != NULL && - (len = strlen (config_file)) && - config_file[len - 1] == '\n' && - runner_end (&runner) == 0) - gluster_workdir_len = len - 1; - - if (gluster_workdir_len) { - if (gluster_workdir_len + 1 + strlen (GSYNCD_CONF) + 1 > - PATH_MAX) - goto error; - config_file[gluster_workdir_len] = '/'; - strcat (config_file, GSYNCD_CONF); - } else - goto error; - - if (setenv ("_GSYNCD_RESTRICTED_", "1", 1) == -1) - goto error; - } - - if (chdir ("/") == -1) - goto error; - - j = 0; - python = getenv("PYTHON"); - if(!python) - python = PYTHON; - nargv[j++] = python; - nargv[j++] = GSYNCD_PREFIX"/python/syncdaemon/"GSYNCD_PY; - for (i = 1; i < argc; i++) - nargv[j++] = argv[i]; - if (config_file[0]) { - nargv[j++] = "-c"; - nargv[j++] = config_file; - } - nargv[j++] = NULL; - - execvp (python, nargv); - - fprintf (stderr, "exec of '%s' failed\n", python); - return 127; - - error: - fprintf (stderr, "gsyncd initializaion failed\n"); - return 1; -} - - -static int -find_gsyncd (pid_t pid, pid_t ppid, char *name, void *data) -{ - char buf[NAME_MAX * 2] = {0,}; - char path[PATH_MAX] = {0,}; - char *p = NULL; - int zeros = 0; - int ret = 0; - int fd = -1; - pid_t *pida = (pid_t *)data; - - if (ppid != pida[0]) - return 0; - - sprintf (path, PROC"/%d/cmdline", pid); - fd = open (path, O_RDONLY); - if (fd == -1) - return 0; - ret = read (fd, buf, sizeof (buf)); - close (fd); - if (ret == -1) - return 0; - for (zeros = 0, p = buf; zeros < 2 && p < buf + ret; p++) - zeros += !*p; - - ret = 0; - switch (zeros) { - case 2: - if ((strcmp (basename (buf), basename (PYTHON)) || - strcmp (basename (buf + strlen (buf) + 1), GSYNCD_PY)) == 0) { - ret = 1; - break; - } - /* fallthrough */ - case 1: - if (strcmp (basename (buf), GSYNCD_PY) == 0) - ret = 1; - } - - if (ret == 1) { - if (pida[1] != -1) { - fprintf (stderr, GSYNCD_PY" sibling is not unique"); - return -1; - } - pida[1] = pid; - } - - return 0; -} - -static int -invoke_rsync (int argc, char **argv) -{ - int i = 0; - char path[PATH_MAX] = {0,}; - pid_t pid = -1; - pid_t ppid = -1; - pid_t pida[] = {-1, -1}; - char *name = NULL; - char buf[PATH_MAX + 1] = {0,}; - int ret = 0; - - assert (argv[argc] == NULL); - - if (argc < 2 || strcmp (argv[1], "--server") != 0) - goto error; - - for (i = 2; i < argc && argv[i][0] == '-'; i++); - - if (!(i == argc - 2 && strcmp (argv[i], ".") == 0 && argv[i + 1][0] == '/')) { - fprintf (stderr, "need an rsync invocation without protected args\n"); - goto error; - } - - /* look up sshd we are spawned from */ - for (pid = getpid () ;; pid = ppid) { - ppid = pidinfo (pid, &name); - if (ppid < 0) { - fprintf (stderr, "sshd ancestor not found\n"); - goto error; - } - if (strcmp (name, "sshd") == 0) { - GF_FREE (name); - break; - } - GF_FREE (name); - } - /* look up "ssh-sibling" gsyncd */ - pida[0] = pid; - ret = prociter (find_gsyncd, pida); - if (ret == -1 || pida[1] == -1) { - fprintf (stderr, "gsyncd sibling not found\n"); - goto error; - } - /* check if rsync target matches gsyncd target */ - sprintf (path, PROC"/%d/cwd", pida[1]); - ret = readlink (path, buf, sizeof (buf)); - if (ret == -1 || ret == sizeof (buf)) - goto error; - if (strcmp (argv[argc - 1], "/") == 0 /* root dir cannot be a target */ || - (strcmp (argv[argc - 1], path) /* match against gluster target */ && - strcmp (argv[argc - 1], buf) /* match against file target */) != 0) { - fprintf (stderr, "rsync target does not match "GEOREP" session\n"); - goto error; - } - - argv[0] = RSYNC; - - execvp (RSYNC, argv); - - fprintf (stderr, "exec of "RSYNC" failed\n"); - return 127; - - error: - fprintf (stderr, "disallowed "RSYNC" invocation\n"); - return 1; -} - - -struct invocable { - char *name; - int (*invoker) (int argc, char **argv); -}; - -struct invocable invocables[] = { - { "rsync", invoke_rsync }, - { "gsyncd", invoke_gsyncd }, - { NULL, NULL} -}; - -int -main (int argc, char **argv) -{ - char *evas = NULL; - struct invocable *i = NULL; - char *b = NULL; - char *sargv = NULL; - -#ifdef USE_LIBGLUSTERFS - glusterfs_ctx_t *ctx = NULL; - - ctx = glusterfs_ctx_new (); - if (!ctx) - return ENOMEM; - - if (glusterfs_globals_init (ctx)) - return 1; - - THIS->ctx = ctx; -#endif - - evas = getenv (_GLUSTERD_CALLED_); - if (evas && strcmp (evas, "1") == 0) - /* OK, we know glusterd called us, no need to look for further config - * ... altough this conclusion should not inherit to our children - */ - unsetenv (_GLUSTERD_CALLED_); - else { - /* we regard all gsyncd invocations unsafe - * that do not come from glusterd and - * therefore restrict it - */ - restricted = 1; - - if (!getenv (_GSYNCD_DISPATCHED_)) { - evas = getenv ("SSH_ORIGINAL_COMMAND"); - if (evas) - sargv = evas; - else { - evas = getenv ("SHELL"); - if (evas && strcmp (basename (evas), "gsyncd") == 0 && - argc == 3 && strcmp (argv[1], "-c") == 0) - sargv = argv[2]; - } - } - - } - - if (!(sargv && restricted)) - return invoke_gsyncd (argc, argv); - - argc = str2argv (sargv, &argv); - if (argc == -1 || setenv (_GSYNCD_DISPATCHED_, "1", 1) == -1) { - fprintf (stderr, "internal error\n"); - return 1; - } - - b = basename (argv[0]); - for (i = invocables; i->name; i++) { - if (strcmp (b, i->name) == 0) - return i->invoker (argc, argv); - } - - fprintf (stderr, "invoking %s in restricted SSH session is not allowed\n", - b); - - return 1; -} diff --git a/xlators/features/marker/utils/src/procdiggy.c b/xlators/features/marker/utils/src/procdiggy.c deleted file mode 100644 index 1eba414c1..000000000 --- a/xlators/features/marker/utils/src/procdiggy.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <string.h> -#include <ctype.h> -#include <sys/param.h> /* for PATH_MAX */ - -#include "common-utils.h" -#include "procdiggy.h" - -pid_t -pidinfo (pid_t pid, char **name) -{ - char buf[NAME_MAX * 2] = {0,}; - FILE *f = NULL; - char path[PATH_MAX] = {0,}; - char *p = NULL; - int ret = 0; - - sprintf (path, PROC"/%d/status", pid); - - f = fopen (path, "r"); - if (!f) - return -1; - - if (name) - *name = NULL; - for (;;) { - size_t len; - memset (buf, 0, sizeof (buf)); - if (fgets (buf, sizeof (buf), f) == NULL || - (len = strlen (buf)) == 0 || - buf[len - 1] != '\n') { - pid = -1; - goto out; - } - buf[len - 1] = '\0'; - - if (name && !*name) { - p = strtail (buf, "Name:"); - if (p) { - while (isspace (*++p)); - *name = gf_strdup (p); - if (!*name) { - pid = -2; - goto out; - } - continue; - } - } - - p = strtail (buf, "PPid:"); - if (p) - break; - } - - while (isspace (*++p)); - ret = gf_string2int (p, &pid); - if (ret == -1) - pid = -1; - - out: - fclose (f); - if (pid == -1 && name && *name) - GF_FREE (name); - if (pid == -2) - fprintf (stderr, "out of memory\n"); - return pid; -} - -int -prociter (int (*proch) (pid_t pid, pid_t ppid, char *tmpname, void *data), - void *data) -{ - char *name = NULL; - DIR *d = NULL; - struct dirent *de = NULL; - pid_t pid = -1; - pid_t ppid = -1; - int ret = 0; - - d = opendir (PROC); - if (!d) - return -1; - while (errno = 0, de = readdir (d)) { - if (gf_string2int (de->d_name, &pid) != -1 && pid >= 0) { - ppid = pidinfo (pid, &name); - switch (ppid) { - case -1: continue; - case -2: ret = -1; break; - } - ret = proch (pid, ppid, name, data); - GF_FREE (name); - if (ret) - break; - } - } - closedir (d); - if (!de && errno) { - fprintf (stderr, "failed to traverse "PROC" (%s)\n", - strerror (errno)); - ret = -1; - } - - return ret; -} diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am deleted file mode 100644 index cc7cee102..000000000 --- a/xlators/features/marker/utils/syncdaemon/Makefile.am +++ /dev/null @@ -1,6 +0,0 @@ -syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon - -syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py libcxattr.py \ - $(top_builddir)/contrib/ipaddr-py/ipaddr.py - -CLEANFILES = diff --git a/xlators/features/marker/utils/syncdaemon/README.md b/xlators/features/marker/utils/syncdaemon/README.md deleted file mode 100644 index d45006932..000000000 --- a/xlators/features/marker/utils/syncdaemon/README.md +++ /dev/null @@ -1,81 +0,0 @@ -gsycnd, the Gluster Syncdaemon -============================== - -REQUIREMENTS ------------- - -_gsyncd_ is a program which can operate either in _master_ or in _slave_ mode. -Requirements are categorized according to this. - -* supported OS is GNU/Linux -* Python >= 2.5, or 2.4 with Ctypes (see below) (both) -* OpenSSH >= 4.0 (master) / SSH2 compliant sshd (eg. openssh) (slave) -* rsync (both) -* glusterfs with marker support (master); glusterfs (optional on slave) -* FUSE; for supported versions consult glusterfs - -INSTALLATION ------------- - -As of now, the supported way of operation is running from the source directory. - -If you use Python 2.4.x, you need to install the [Ctypes module](http://python.net/crew/theller/ctypes/). - -CONFIGURATION -------------- - -gsyncd tunables are a subset of the long command-line options; for listing them, -type - - gsyncd.py --help - -and see the long options up to "--config-file". (The leading double dash should be omitted; -interim underscores and dashes are interchangeable.) The set of options bear some resemblance -to those of glusterfs and rsync. - -The config file format matches the following syntax: - - <option1>: <value1> - <option2>: <value2> - # comment - -By default (unless specified by the option `-c`), gsyncd looks for config file at _conf/gsyncd.conf_ -in the source tree. - -USAGE ------ - -gsyncd is a utilitly for continous mirroring, ie. it mirrors master to slave incrementally. -Assume we have a gluster volume _pop_ at localhost. We try to set up the following mirrors -for it with gysncd: - -1. _/data/mirror_ -2. local gluster volume _yow_ -3. _/data/far_mirror_ at example.com -4. gluster volume _moz_ at example.com - -The respective gsyncd invocations are (demoing some syntax sugaring): - -1. - - gsyncd.py gluster://localhost:pop file:///data/mirror - - or short form - - gsyncd.py :pop /data/mirror - -2. `gsyncd :pop :yow` -3. - - gsyncd.py :pop ssh://example.com:/data/far_mirror - - or short form - - gsyncd.py :pop example.com:/data/far_mirror - -4. `gsyncd.py :pop example.com::moz` - -gsyncd has to be available on both sides; it's location on the remote side has to be specified -via the "--remote-gsyncd" option (or "remote-gsyncd" config file parameter). (This option can also be -used for setting options on the remote side, although the suggested mode of operation is to -set parameters like log file / pid file in the configuration file.) diff --git a/xlators/features/marker/utils/syncdaemon/__codecheck.py b/xlators/features/marker/utils/syncdaemon/__codecheck.py deleted file mode 100644 index e3386afba..000000000 --- a/xlators/features/marker/utils/syncdaemon/__codecheck.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import os.path -import sys -import tempfile -import shutil - -ipd = tempfile.mkdtemp(prefix = 'codecheck-aux') - -try: - # add a fake ipaddr module, we don't want to - # deal with the real one (just test our code) - f = open(os.path.join(ipd, 'ipaddr.py'), 'w') - f.write(""" -class IPAddress(object): - pass -class IPNetwork(list): - pass -""") - f.close() - sys.path.append(ipd) - - fl = os.listdir(os.path.dirname(sys.argv[0]) or '.') - fl.sort() - for f in fl: - if f[-3:] != '.py' or f[0] == '_': - continue - m = f[:-3] - sys.stdout.write('importing %s ...' % m) - __import__(m) - print(' OK.') - - def sys_argv_set(a): - sys.argv = sys.argv[:1] + a - - gsyncd = sys.modules['gsyncd'] - for a in [['--help'], ['--version'], ['--canonicalize-escape-url', '/foo']]: - print('>>> invoking program with args: %s' % ' '.join(a)) - pid = os.fork() - if not pid: - sys_argv_set(a) - gsyncd.main() - _, r = os.waitpid(pid, 0) - if r: - raise RuntimeError('invocation failed') -finally: - shutil.rmtree(ipd) diff --git a/xlators/features/marker/utils/syncdaemon/__init__.py b/xlators/features/marker/utils/syncdaemon/__init__.py deleted file mode 100644 index e69de29bb..000000000 --- a/xlators/features/marker/utils/syncdaemon/__init__.py +++ /dev/null diff --git a/xlators/features/marker/utils/syncdaemon/configinterface.py b/xlators/features/marker/utils/syncdaemon/configinterface.py deleted file mode 100644 index e55bec519..000000000 --- a/xlators/features/marker/utils/syncdaemon/configinterface.py +++ /dev/null @@ -1,224 +0,0 @@ -try: - import ConfigParser -except ImportError: - # py 3 - import configparser as ConfigParser -import re -from string import Template - -from syncdutils import escape, unescape, norm, update_file, GsyncdError - -SECT_ORD = '__section_order__' -SECT_META = '__meta__' -config_version = 2.0 - -re_type = type(re.compile('')) - - -class MultiDict(object): - """a virtual dict-like class which functions as the union of underlying dicts""" - - def __init__(self, *dd): - self.dicts = dd - - def __getitem__(self, key): - val = None - for d in self.dicts: - if d.get(key): - val = d[key] - if not val: - raise KeyError(key) - return val - - -class GConffile(object): - """A high-level interface to ConfigParser which flattens the two-tiered - config layout by implenting automatic section dispatch based on initial - parameters. - - Also ensure section ordering in terms of their time of addition -- a compat - hack for Python < 2.7. - """ - - def _normconfig(self): - """normalize config keys by s/-/_/g""" - for n, s in self.config._sections.items(): - if n.find('__') == 0: - continue - s2 = type(s)() - for k, v in s.items(): - if k.find('__') != 0: - k = norm(k) - s2[k] = v - self.config._sections[n] = s2 - - def __init__(self, path, peers, *dd): - """ - - .path: location of config file - - .config: underlying ConfigParser instance - - .peers: on behalf of whom we flatten .config - (master, or master-slave url pair) - - .auxdicts: template subtituents - """ - self.peers = peers - self.path = path - self.auxdicts = dd - self.config = ConfigParser.RawConfigParser() - self.config.read(path) - self._normconfig() - - def section(self, rx=False): - """get the section name of the section representing .peers in .config""" - peers = self.peers - if not peers: - peers = ['.', '.'] - rx = True - if rx: - st = 'peersrx' - else: - st = 'peers' - return ' '.join([st] + [escape(u) for u in peers]) - - @staticmethod - def parse_section(section): - """retrieve peers sequence encoded by section name - (as urls or regexen, depending on section type) - """ - sl = section.split() - st = sl.pop(0) - sl = [unescape(u) for u in sl] - if st == 'peersrx': - sl = [re.compile(u) for u in sl] - return sl - - def ord_sections(self): - """Return an ordered list of sections. - - Ordering happens based on the auxiliary - SECT_ORD section storing indices for each - section added through the config API. - - To not to go corrupt in case of manually - written config files, we take care to append - also those sections which are not registered - in SECT_ORD. - - Needed for python 2.{4,5,6} where ConfigParser - cannot yet order sections/options internally. - """ - so = {} - if self.config.has_section(SECT_ORD): - so = self.config._sections[SECT_ORD] - so2 = {} - for k, v in so.items(): - if k != '__name__': - so2[k] = int(v) - tv = 0 - if so2: - tv = max(so2.values()) + 1 - ss = [s for s in self.config.sections() if s.find('__') != 0] - for s in ss: - if s in so.keys(): - continue - so2[s] = tv - tv += 1 - def scmp(x, y): - return cmp(*(so2[s] for s in (x, y))) - ss.sort(scmp) - return ss - - def update_to(self, dct, allow_unresolved=False): - """update @dct from key/values of ours. - - key/values are collected from .config by filtering the regexp sections - according to match, and from .section. The values are treated as templates, - which are substituted from .auxdicts and (in case of regexp sections) - match groups. - """ - if not self.peers: - raise GsyncdError('no peers given, cannot select matching options') - def update_from_sect(sect, mud): - for k, v in self.config._sections[sect].items(): - if k == '__name__': - continue - if allow_unresolved: - dct[k] = Template(v).safe_substitute(mud) - else: - dct[k] = Template(v).substitute(mud) - for sect in self.ord_sections(): - sp = self.parse_section(sect) - if isinstance(sp[0], re_type) and len(sp) == len(self.peers): - match = True - mad = {} - for i in range(len(sp)): - m = sp[i].search(self.peers[i]) - if not m: - match = False - break - for j in range(len(m.groups())): - mad['match%d_%d' % (i+1, j+1)] = m.groups()[j] - if match: - update_from_sect(sect, MultiDict(dct, mad, *self.auxdicts)) - if self.config.has_section(self.section()): - update_from_sect(self.section(), MultiDict(dct, *self.auxdicts)) - - def get(self, opt=None): - """print the matching key/value pairs from .config, - or if @opt given, the value for @opt (according to the - logic described in .update_to) - """ - d = {} - self.update_to(d, allow_unresolved = True) - if opt: - opt = norm(opt) - v = d.get(opt) - if v: - print(v) - else: - for k, v in d.iteritems(): - if k == '__name__': - continue - print("%s: %s" % (k, v)) - - def write(self, trfn, opt, *a, **kw): - """update on-disk config transactionally - - @trfn is the transaction function - """ - def mergeconf(f): - self.config = ConfigParser.RawConfigParser() - self.config.readfp(f) - self._normconfig() - if not self.config.has_section(SECT_META): - self.config.add_section(SECT_META) - self.config.set(SECT_META, 'version', config_version) - return trfn(norm(opt), *a, **kw) - def updateconf(f): - self.config.write(f) - update_file(self.path, updateconf, mergeconf) - - def _set(self, opt, val, rx=False): - """set @opt to @val in .section""" - sect = self.section(rx) - if not self.config.has_section(sect): - self.config.add_section(sect) - # regarding SECT_ORD, cf. ord_sections - if not self.config.has_section(SECT_ORD): - self.config.add_section(SECT_ORD) - self.config.set(SECT_ORD, sect, len(self.config._sections[SECT_ORD])) - self.config.set(sect, opt, val) - return True - - def set(self, opt, *a, **kw): - """perform ._set transactionally""" - self.write(self._set, opt, *a, **kw) - - def _delete(self, opt, rx=False): - """delete @opt from .section""" - sect = self.section(rx) - if self.config.has_section(sect): - return self.config.remove_option(sect, opt) - - def delete(self, opt, *a, **kw): - """perform ._delete transactionally""" - self.write(self._delete, opt, *a, **kw) diff --git a/xlators/features/marker/utils/syncdaemon/gconf.py b/xlators/features/marker/utils/syncdaemon/gconf.py deleted file mode 100644 index 146c72a18..000000000 --- a/xlators/features/marker/utils/syncdaemon/gconf.py +++ /dev/null @@ -1,20 +0,0 @@ -import os - -class GConf(object): - """singleton class to store globals - shared between gsyncd modules""" - - ssh_ctl_dir = None - ssh_ctl_args = None - cpid = None - pid_file_owned = False - log_exit = False - permanent_handles = [] - log_metadata = {} - - @classmethod - def setup_ssh_ctl(cls, ctld): - cls.ssh_ctl_dir = ctld - cls.ssh_ctl_args = ["-oControlMaster=auto", "-S", os.path.join(ctld, "gsycnd-ssh-%r@%h:%p")] - -gconf = GConf() diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py deleted file mode 100644 index 387900e6c..000000000 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ /dev/null @@ -1,419 +0,0 @@ -#!/usr/bin/env python - -import os -import os.path -import sys -import time -import logging -import signal -import optparse -import fcntl -import fnmatch -from optparse import OptionParser, SUPPRESS_HELP -from logging import Logger -from errno import EEXIST, ENOENT - -from ipaddr import IPAddress, IPNetwork - -from gconf import gconf -from syncdutils import FreeObject, norm, grabpidfile, finalize, log_raise_exception -from syncdutils import GsyncdError, select, set_term_handler, privileged -from configinterface import GConffile -import resource -from monitor import monitor - -class GLogger(Logger): - """Logger customizations for gsyncd. - - It implements a log format similar to that of glusterfs. - """ - - def makeRecord(self, name, level, *a): - rv = Logger.makeRecord(self, name, level, *a) - rv.nsecs = (rv.created - int(rv.created)) * 1000000 - fr = sys._getframe(4) - callee = fr.f_locals.get('self') - if callee: - ctx = str(type(callee)).split("'")[1].split('.')[-1] - else: - ctx = '<top>' - if not hasattr(rv, 'funcName'): - rv.funcName = fr.f_code.co_name - rv.lvlnam = logging.getLevelName(level)[0] - rv.ctx = ctx - return rv - - @classmethod - def setup(cls, **kw): - lbl = kw.get('label', "") - if lbl: - lbl = '(' + lbl + ')' - lprm = {'datefmt': "%Y-%m-%d %H:%M:%S", - 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} - lprm.update(kw) - lvl = kw.get('level', logging.INFO) - lprm['level'] = lvl - logging.root = cls("root", lvl) - logging.setLoggerClass(cls) - logging.getLogger().handlers = [] - logging.basicConfig(**lprm) - - @classmethod - def _gsyncd_loginit(cls, **kw): - lkw = {} - if gconf.log_level: - lkw['level'] = gconf.log_level - if kw.get('log_file'): - if kw['log_file'] in ('-', '/dev/stderr'): - lkw['stream'] = sys.stderr - elif kw['log_file'] == '/dev/stdout': - lkw['stream'] = sys.stdout - else: - lkw['filename'] = kw['log_file'] - - cls.setup(label=kw.get('label'), **lkw) - - lkw.update({'saved_label': kw.get('label')}) - gconf.log_metadata = lkw - gconf.log_exit = True - -def startup(**kw): - """set up logging, pidfile grabbing, daemonization""" - if getattr(gconf, 'pid_file', None) and kw.get('go_daemon') != 'postconn': - if not grabpidfile(): - sys.stderr.write("pidfile is taken, exiting.\n") - sys.exit(2) - gconf.pid_file_owned = True - - if kw.get('go_daemon') == 'should': - x, y = os.pipe() - gconf.cpid = os.fork() - if gconf.cpid: - os.close(x) - sys.exit() - os.close(y) - os.setsid() - dn = os.open(os.devnull, os.O_RDWR) - for f in (sys.stdin, sys.stdout, sys.stderr): - os.dup2(dn, f.fileno()) - if getattr(gconf, 'pid_file', None): - if not grabpidfile(gconf.pid_file + '.tmp'): - raise GsyncdError("cannot grab temporary pidfile") - os.rename(gconf.pid_file + '.tmp', gconf.pid_file) - # wait for parent to terminate - # so we can start up with - # no messing from the dirty - # ol' bustard - select((x,), (), ()) - os.close(x) - - GLogger._gsyncd_loginit(**kw) - -def main(): - """main routine, signal/exception handling boilerplates""" - gconf.starttime = time.time() - set_term_handler() - GLogger.setup() - excont = FreeObject(exval = 0) - try: - try: - main_i() - except: - log_raise_exception(excont) - finally: - finalize(exval = excont.exval) - -def main_i(): - """internal main routine - - parse command line, decide what action will be taken; - we can either: - - query/manipulate configuration - - format gsyncd urls using gsyncd's url parsing engine - - start service in following modes, in given stages: - - monitor: startup(), monitor() - - master: startup(), connect_remote(), connect(), service_loop() - - slave: startup(), connect(), service_loop() - """ - rconf = {'go_daemon': 'should'} - - def store_abs(opt, optstr, val, parser): - if val and val != '-': - val = os.path.abspath(val) - setattr(parser.values, opt.dest, val) - def store_local(opt, optstr, val, parser): - rconf[opt.dest] = val - def store_local_curry(val): - return lambda o, oo, vx, p: store_local(o, oo, val, p) - def store_local_obj(op, dmake): - return lambda o, oo, vx, p: store_local(o, oo, FreeObject(op=op, **dmake(vx)), p) - - op = OptionParser(usage="%prog [options...] <master> <slave>", version="%prog 0.0.1") - op.add_option('--gluster-command-dir', metavar='DIR', default='') - op.add_option('--gluster-log-file', metavar='LOGF', default=os.devnull, type=str, action='callback', callback=store_abs) - op.add_option('--gluster-log-level', metavar='LVL') - op.add_option('--gluster-params', metavar='PRMS', default='') - op.add_option('--gluster-cli-options', metavar='OPTS', default='--log-file=-') - op.add_option('--mountbroker', metavar='LABEL') - op.add_option('-p', '--pid-file', metavar='PIDF', type=str, action='callback', callback=store_abs) - op.add_option('-l', '--log-file', metavar='LOGF', type=str, action='callback', callback=store_abs) - op.add_option('--log-file-mbr', metavar='LOGF', type=str, action='callback', callback=store_abs) - op.add_option('--state-file', metavar='STATF', type=str, action='callback', callback=store_abs) - op.add_option('--ignore-deletes', default=False, action='store_true') - op.add_option('--use-rsync-xattrs', default=False, action='store_true') - op.add_option('-L', '--log-level', metavar='LVL') - op.add_option('-r', '--remote-gsyncd', metavar='CMD', default=os.path.abspath(sys.argv[0])) - op.add_option('--volume-id', metavar='UUID') - op.add_option('--session-owner', metavar='ID') - op.add_option('-s', '--ssh-command', metavar='CMD', default='ssh') - op.add_option('--rsync-command', metavar='CMD', default='rsync') - op.add_option('--rsync-options', metavar='OPTS', default='--sparse') - op.add_option('--rsync-ssh-options', metavar='OPTS', default='--compress') - op.add_option('--timeout', metavar='SEC', type=int, default=120) - op.add_option('--connection-timeout', metavar='SEC', type=int, default=60, help=SUPPRESS_HELP) - op.add_option('--sync-jobs', metavar='N', type=int, default=3) - op.add_option('--turns', metavar='N', type=int, default=0, help=SUPPRESS_HELP) - op.add_option('--allow-network', metavar='IPS', default='') - op.add_option('--socketdir', metavar='DIR') - op.add_option('--state-socket-unencoded', metavar='SOCKF', type=str, action='callback', callback=store_abs) - op.add_option('--checkpoint', metavar='LABEL', default='') - # tunables for failover/failback mechanism: - # None - gsyncd behaves as normal - # blind - gsyncd works with xtime pairs to identify - # candidates for synchronization - # wrapup - same as normal mode but does not assign - # xtimes to orphaned files - # see crawl() for usage of the above tunables - op.add_option('--special-sync-mode', type=str, help=SUPPRESS_HELP) - - op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local) - # duh. need to specify dest or value will be mapped to None :S - op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True)) - op.add_option('--feedback-fd', dest='feedback_fd', type=int, help=SUPPRESS_HELP, action='callback', callback=store_local) - op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True)) - op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont')) - op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a), - setattr(a[-1].values, 'log_file', '-'), - setattr(a[-1].values, 'log_level', 'DEBUG'))), - - for a in ('check', 'get'): - op.add_option('--config-' + a, metavar='OPT', type=str, dest='config', action='callback', - callback=store_local_obj(a, lambda vx: {'opt': vx})) - op.add_option('--config-get-all', dest='config', action='callback', callback=store_local_obj('get', lambda vx: {'opt': None})) - for m in ('', '-rx', '-glob'): - # call this code 'Pythonic' eh? - # have to define a one-shot local function to be able to inject (a value depending on the) - # iteration variable into the inner lambda - def conf_mod_opt_regex_variant(rx): - op.add_option('--config-set' + m, metavar='OPT VAL', type=str, nargs=2, dest='config', action='callback', - callback=store_local_obj('set', lambda vx: {'opt': vx[0], 'val': vx[1], 'rx': rx})) - op.add_option('--config-del' + m, metavar='OPT', type=str, dest='config', action='callback', - callback=store_local_obj('del', lambda vx: {'opt': vx, 'rx': rx})) - conf_mod_opt_regex_variant(m and m[1:] or False) - - op.add_option('--normalize-url', dest='url_print', action='callback', callback=store_local_curry('normal')) - op.add_option('--canonicalize-url', dest='url_print', action='callback', callback=store_local_curry('canon')) - op.add_option('--canonicalize-escape-url', dest='url_print', action='callback', callback=store_local_curry('canon_esc')) - - tunables = [ norm(o.get_opt_string()[2:]) for o in op.option_list if o.callback in (store_abs, 'store_true', None) and o.get_opt_string() not in ('--version', '--help') ] - remote_tunables = [ 'listen', 'go_daemon', 'timeout', 'session_owner', 'config_file', 'use_rsync_xattrs' ] - rq_remote_tunables = { 'listen': True } - - # precedence for sources of values: 1) commandline, 2) cfg file, 3) defaults - # -- for this to work out we need to tell apart defaults from explicitly set - # options... so churn out the defaults here and call the parser with virgin - # values container. - defaults = op.get_default_values() - opts, args = op.parse_args(values=optparse.Values()) - confdata = rconf.get('config') - if not (len(args) == 2 or \ - (len(args) == 1 and rconf.get('listen')) or \ - (len(args) <= 2 and confdata) or \ - rconf.get('url_print')): - sys.stderr.write("error: incorrect number of arguments\n\n") - sys.stderr.write(op.get_usage() + "\n") - sys.exit(1) - - restricted = os.getenv('_GSYNCD_RESTRICTED_') - - if restricted: - allopts = {} - allopts.update(opts.__dict__) - allopts.update(rconf) - bannedtuns = set(allopts.keys()) - set(remote_tunables) - if bannedtuns: - raise GsyncdError('following tunables cannot be set with restricted SSH invocaton: ' + \ - ', '.join(bannedtuns)) - for k, v in rq_remote_tunables.items(): - if not k in allopts or allopts[k] != v: - raise GsyncdError('tunable %s is not set to value %s required for restricted SSH invocaton' % \ - (k, v)) - - confrx = getattr(confdata, 'rx', None) - if confrx: - # peers are regexen, don't try to parse them - if confrx == 'glob': - args = [ '\A' + fnmatch.translate(a) for a in args ] - canon_peers = args - namedict = {} - else: - rscs = [resource.parse_url(u) for u in args] - dc = rconf.get('url_print') - if dc: - for r in rscs: - print(r.get_url(**{'normal': {}, - 'canon': {'canonical': True}, - 'canon_esc': {'canonical': True, 'escaped': True}}[dc])) - return - local = remote = None - if rscs: - local = rscs[0] - if len(rscs) > 1: - remote = rscs[1] - if not local.can_connect_to(remote): - raise GsyncdError("%s cannot work with %s" % (local.path, remote and remote.path)) - pa = ([], [], []) - urlprms = ({}, {'canonical': True}, {'canonical': True, 'escaped': True}) - for x in rscs: - for i in range(len(pa)): - pa[i].append(x.get_url(**urlprms[i])) - peers, canon_peers, canon_esc_peers = pa - # creating the namedict, a dict representing various ways of referring to / repreenting - # peers to be fillable in config templates - mods = (lambda x: x, lambda x: x[0].upper() + x[1:], lambda x: 'e' + x[0].upper() + x[1:]) - if remote: - rmap = { local: ('local', 'master'), remote: ('remote', 'slave') } - else: - rmap = { local: ('local', 'slave') } - namedict = {} - for i in range(len(rscs)): - x = rscs[i] - for name in rmap[x]: - for j in range(3): - namedict[mods[j](name)] = pa[j][i] - if x.scheme == 'gluster': - namedict[name + 'vol'] = x.volume - if not 'config_file' in rconf: - rconf['config_file'] = os.path.join(os.path.dirname(sys.argv[0]), "conf/gsyncd.conf") - gcnf = GConffile(rconf['config_file'], canon_peers, defaults.__dict__, opts.__dict__, namedict) - - checkpoint_change = False - if confdata: - opt_ok = norm(confdata.opt) in tunables + [None] - if confdata.op == 'check': - if opt_ok: - sys.exit(0) - else: - sys.exit(1) - elif not opt_ok: - raise GsyncdError("not a valid option: " + confdata.opt) - if confdata.op == 'get': - gcnf.get(confdata.opt) - elif confdata.op == 'set': - gcnf.set(confdata.opt, confdata.val, confdata.rx) - elif confdata.op == 'del': - gcnf.delete(confdata.opt, confdata.rx) - # when modifying checkpoint, it's important to make a log - # of that, so in that case we go on to set up logging even - # if its just config invocation - if confdata.opt == 'checkpoint' and confdata.op in ('set', 'del') and \ - not confdata.rx: - checkpoint_change = True - if not checkpoint_change: - return - - gconf.__dict__.update(defaults.__dict__) - gcnf.update_to(gconf.__dict__) - gconf.__dict__.update(opts.__dict__) - gconf.configinterface = gcnf - - if restricted and gconf.allow_network: - ssh_conn = os.getenv('SSH_CONNECTION') - if not ssh_conn: - #legacy env var - ssh_conn = os.getenv('SSH_CLIENT') - if ssh_conn: - allowed_networks = [ IPNetwork(a) for a in gconf.allow_network.split(',') ] - client_ip = IPAddress(ssh_conn.split()[0]) - allowed = False - for nw in allowed_networks: - if client_ip in nw: - allowed = True - break - if not allowed: - raise GsyncdError("client IP address is not allowed") - - ffd = rconf.get('feedback_fd') - if ffd: - fcntl.fcntl(ffd, fcntl.F_SETFD, fcntl.FD_CLOEXEC) - - #normalize loglevel - lvl0 = gconf.log_level - if isinstance(lvl0, str): - lvl1 = lvl0.upper() - lvl2 = logging.getLevelName(lvl1) - # I have _never_ _ever_ seen such an utterly braindead - # error condition - if lvl2 == "Level " + lvl1: - raise GsyncdError('cannot recognize log level "%s"' % lvl0) - gconf.log_level = lvl2 - - if not privileged() and gconf.log_file_mbr: - gconf.log_file = gconf.log_file_mbr - - if checkpoint_change: - try: - GLogger._gsyncd_loginit(log_file=gconf.log_file, label='conf') - if confdata.op == 'set': - logging.info('checkpoint %s set' % confdata.val) - elif confdata.op == 'del': - logging.info('checkpoint info was reset') - except IOError: - if sys.exc_info()[1].errno == ENOENT: - # directory of log path is not present, - # which happens if we get here from - # a peer-multiplexed "config-set checkpoint" - # (as that directory is created only on the - # original node) - pass - else: - raise - return - - go_daemon = rconf['go_daemon'] - be_monitor = rconf.get('monitor') - - if not be_monitor and isinstance(remote, resource.SSH) and \ - go_daemon == 'should': - go_daemon = 'postconn' - log_file = None - else: - log_file = gconf.log_file - if be_monitor: - label = 'monitor' - elif remote: - #master - label = '' - else: - label = 'slave' - startup(go_daemon=go_daemon, log_file=log_file, label=label) - - if be_monitor: - return monitor() - - logging.info("syncing: %s" % " -> ".join(peers)) - resource.Popen.init_errhandler() - if remote: - go_daemon = remote.connect_remote(go_daemon=go_daemon) - if go_daemon: - startup(go_daemon=go_daemon, log_file=gconf.log_file) - # complete remote connection in child - remote.connect_remote(go_daemon='done') - local.connect() - if ffd: - os.close(ffd) - local.service_loop(*[r for r in [remote] if r]) - - -if __name__ == "__main__": - main() diff --git a/xlators/features/marker/utils/syncdaemon/libcxattr.py b/xlators/features/marker/utils/syncdaemon/libcxattr.py deleted file mode 100644 index f0a9d2292..000000000 --- a/xlators/features/marker/utils/syncdaemon/libcxattr.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from ctypes import * -from ctypes.util import find_library - -class Xattr(object): - """singleton that wraps the extended attribues system - interface for python using ctypes - - Just implement it to the degree we need it, in particular - - we need just the l*xattr variants, ie. we never want symlinks to be - followed - - don't need size discovery for getxattr, as we always know the exact - sizes we expect - """ - - libc = CDLL(find_library("libc")) - - @classmethod - def geterrno(cls): - return c_int.in_dll(cls.libc, 'errno').value - - @classmethod - def raise_oserr(cls): - errn = cls.geterrno() - raise OSError(errn, os.strerror(errn)) - - @classmethod - def _query_xattr(cls, path, siz, syscall, *a): - if siz: - buf = create_string_buffer('\0' * siz) - else: - buf = None - ret = getattr(cls.libc, syscall)(*((path,) + a + (buf, siz))) - if ret == -1: - cls.raise_oserr() - if siz: - return buf.raw[:ret] - else: - return ret - - @classmethod - def lgetxattr(cls, path, attr, siz=0): - return cls._query_xattr( path, siz, 'lgetxattr', attr) - - @classmethod - def llistxattr(cls, path, siz=0): - ret = cls._query_xattr(path, siz, 'llistxattr') - if isinstance(ret, str): - ret = ret.split('\0') - return ret - - @classmethod - def lsetxattr(cls, path, attr, val): - ret = cls.libc.lsetxattr(path, attr, val, len(val), 0) - if ret == -1: - cls.raise_oserr() - - @classmethod - def lremovexattr(cls, path, attr): - ret = cls.libc.lremovexattr(path, attr) - if ret == -1: - cls.raise_oserr() - - @classmethod - def llistxattr_buf(cls, path): - """listxattr variant with size discovery""" - size = cls.llistxattr(path) - if size == -1: - cls.raise_oserr() - if size == 0: - return [] - return cls.llistxattr(path, size) diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py deleted file mode 100644 index f903f3059..000000000 --- a/xlators/features/marker/utils/syncdaemon/master.py +++ /dev/null @@ -1,961 +0,0 @@ -import os -import sys -import time -import stat -import random -import signal -import logging -import socket -import errno -import re -from errno import ENOENT, ENODATA, EPIPE -from threading import currentThread, Condition, Lock -from datetime import datetime -try: - from hashlib import md5 as md5 -except ImportError: - # py 2.4 - from md5 import new as md5 - -from gconf import gconf -from syncdutils import FreeObject, Thread, GsyncdError, boolify, \ - escape, unescape, select - -URXTIME = (-1, 0) - -# Utility functions to help us to get to closer proximity -# of the DRY principle (no, don't look for elevated or -# perspectivistic things here) - -def _xtime_now(): - t = time.time() - sec = int(t) - nsec = int((t - sec) * 1000000) - return (sec, nsec) - -def _volinfo_hook_relax_foreign(self): - volinfo_sys = self.get_sys_volinfo() - fgn_vi = volinfo_sys[self.KFGN] - if fgn_vi: - expiry = fgn_vi['timeout'] - int(time.time()) + 1 - logging.info('foreign volume info found, waiting %d sec for expiry' % \ - expiry) - time.sleep(expiry) - volinfo_sys = self.get_sys_volinfo() - self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state, - volinfo_sys) - if self.inter_master: - raise GsyncdError("cannot be intermediate master in special mode") - return (volinfo_sys, state_change) - - -# The API! - -def gmaster_builder(): - """produce the GMaster class variant corresponding - to sync mode""" - this = sys.modules[__name__] - modemixin = gconf.special_sync_mode - if not modemixin: - modemixin = 'normal' - logging.info('setting up master for %s sync mode' % modemixin) - modemixin = getattr(this, modemixin.capitalize() + 'Mixin') - sendmarkmixin = boolify(gconf.use_rsync_xattrs) and SendmarkRsyncMixin or SendmarkNormalMixin - purgemixin = boolify(gconf.ignore_deletes) and PurgeNoopMixin or PurgeNormalMixin - class _GMaster(GMasterBase, modemixin, sendmarkmixin, purgemixin): - pass - return _GMaster - - -# Mixin classes that implement the data format -# and logic particularities of the certain -# sync modes - -class NormalMixin(object): - """normal geo-rep behavior""" - - minus_infinity = URXTIME - - # following staticmethods ideally would be - # methods of an xtime object (in particular, - # implementing the hooks needed for comparison - # operators), but at this point we don't yet - # have a dedicated xtime class - - @staticmethod - def serialize_xtime(xt): - return "%d.%d" % tuple(xt) - - @staticmethod - def deserialize_xtime(xt): - return tuple(int(x) for x in xt.split(".")) - - @staticmethod - def native_xtime(xt): - return xt - - @staticmethod - def xtime_geq(xt0, xt1): - return xt0 >= xt1 - - def make_xtime_opts(self, is_master, opts): - if not 'create' in opts: - opts['create'] = is_master and not self.inter_master - if not 'default_xtime' in opts: - if is_master and self.inter_master: - opts['default_xtime'] = ENODATA - else: - opts['default_xtime'] = URXTIME - - def xtime_low(self, server, path, **opts): - xt = server.xtime(path, self.uuid) - if isinstance(xt, int) and xt != ENODATA: - return xt - if xt == ENODATA or xt < self.volmark: - if opts['create']: - xt = _xtime_now() - server.set_xtime(path, self.uuid, xt) - else: - xt = opts['default_xtime'] - return xt - - def keepalive_payload_hook(self, timo, gap): - # first grab a reference as self.volinfo - # can be changed in main thread - vi = self.volinfo - if vi: - # then have a private copy which we can mod - vi = vi.copy() - vi['timeout'] = int(time.time()) + timo - else: - # send keep-alives more frequently to - # avoid a delay in announcing our volume info - # to slave if it becomes established in the - # meantime - gap = min(10, gap) - return (vi, gap) - - def volinfo_hook(self): - volinfo_sys = self.get_sys_volinfo() - self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state, - volinfo_sys) - return (volinfo_sys, state_change) - - def xtime_reversion_hook(self, path, xtl, xtr): - if xtr > xtl: - raise GsyncdError("timestamp corruption for " + path) - - def need_sync(self, e, xte, xtrd): - return xte > xtrd - - def set_slave_xtime(self, path, mark): - self.slave.server.set_xtime(path, self.uuid, mark) - -class WrapupMixin(NormalMixin): - """a variant that differs from normal in terms - of ignoring non-indexed files""" - - @staticmethod - def make_xtime_opts(is_master, opts): - if not 'create' in opts: - opts['create'] = False - if not 'default_xtime' in opts: - opts['default_xtime'] = URXTIME - - @staticmethod - def keepalive_payload_hook(timo, gap): - return (None, gap) - - def volinfo_hook(self): - return _volinfo_hook_relax_foreign(self) - -class BlindMixin(object): - """Geo-rep flavor using vectored xtime. - - Coordinates are the master, slave uuid pair; - in master coordinate behavior is normal, - in slave coordinate we force synchronization - on any value difference (these are in disjunctive - relation, ie. if either orders the entry to be - synced, it shall be synced. - """ - - minus_infinity = (URXTIME, None) - - @staticmethod - def serialize_xtime(xt): - a = [] - for x in xt: - if not x: - x = ('None', '') - a.extend(x) - return '.'.join(str(n) for n in a) - - @staticmethod - def deserialize_xtime(xt): - a = xt.split(".") - a = (tuple(a[0:2]), tuple(a[3:4])) - b = [] - for p in a: - if p[0] == 'None': - p = None - else: - p = tuple(int(x) for x in p) - b.append(p) - return tuple(b) - - @staticmethod - def native_xtime(xt): - return xt[0] - - @staticmethod - def xtime_geq(xt0, xt1): - return (not xt1[0] or xt0[0] >= xt1[0]) and \ - (not xt1[1] or xt0[1] >= xt1[1]) - - @property - def ruuid(self): - if self.volinfo_r: - return self.volinfo_r['uuid'] - - @staticmethod - def make_xtime_opts(is_master, opts): - if not 'create' in opts: - opts['create'] = is_master - if not 'default_xtime' in opts: - opts['default_xtime'] = URXTIME - - def xtime_low(self, server, path, **opts): - xtd = server.xtime_vec(path, self.uuid, self.ruuid) - if isinstance(xtd, int): - return xtd - xt = (xtd[self.uuid], xtd[self.ruuid]) - if not xt[1] and (not xt[0] or xt[0] < self.volmark): - if opts['create']: - # not expected, but can happen if file originates - # from interrupted gsyncd transfer - logging.warn('have to fix up missing xtime on ' + path) - xt0 = _xtime_now() - server.set_xtime(path, self.uuid, xt0) - else: - xt0 = opts['default_xtime'] - xt = (xt0, xt[1]) - return xt - - @staticmethod - def keepalive_payload_hook(timo, gap): - return (None, gap) - - def volinfo_hook(self): - res = _volinfo_hook_relax_foreign(self) - volinfo_r_new = self.slave.server.native_volume_info() - if volinfo_r_new['retval']: - raise GsyncdError("slave is corrupt") - if getattr(self, 'volinfo_r', None): - if self.volinfo_r['uuid'] != volinfo_r_new['uuid']: - raise GsyncdError("uuid mismatch on slave") - self.volinfo_r = volinfo_r_new - return res - - def xtime_reversion_hook(self, path, xtl, xtr): - if not isinstance(xtr[0], int) and \ - (isinstance(xtl[0], int) or xtr[0] > xtl[0]): - raise GsyncdError("timestamp corruption for " + path) - - def need_sync(self, e, xte, xtrd): - if xte[0]: - if not xtrd[0] or xte[0] > xtrd[0]: - # there is outstanding diff at 0th pos, - # we can short-cut to true - return True - # we arrived to this point by either of these - # two possiblilites: - # - no outstanding difference at 0th pos, - # wanna see 1st pos if he raises veto - # against "no need to sync" proposal - # - no data at 0th pos, 1st pos will have - # to decide (due to xtime assignment, - # in this case 1st pos does carry data - # -- iow, if 1st pos did not have data, - # and 0th neither, 0th would have been - # force-feeded) - if not xte[1]: - # no data, no veto - return False - # the hard work: for 1st pos, - # the conduct is fetch corresponding - # slave data and do a "blind" comparison - # (ie. do not care who is newer, we trigger - # sync on non-identical xitmes) - xtr = self.xtime(e, self.slave) - return isinstance(xtr, int) or xte[1] != xtr[1] - - def set_slave_xtime(self, path, mark): - xtd = {} - for (u, t) in zip((self.uuid, self.ruuid), mark): - if t: - xtd[u] = t - self.slave.server.set_xtime_vec(path, xtd) - - -# Further mixins for certain tunable behaviors - -class SendmarkNormalMixin(object): - - def sendmark_regular(self, *a, **kw): - return self.sendmark(*a, **kw) - -class SendmarkRsyncMixin(object): - - def sendmark_regular(self, *a, **kw): - pass - - -class PurgeNormalMixin(object): - - def purge_missing(self, path, names): - self.slave.server.purge(path, names) - -class PurgeNoopMixin(object): - - def purge_missing(self, path, names): - pass - - - -class GMasterBase(object): - """abstract class impementling master role""" - - KFGN = 0 - KNAT = 1 - - def get_sys_volinfo(self): - """query volume marks on fs root - - err out on multiple foreign masters - """ - fgn_vis, nat_vi = self.master.server.foreign_volume_infos(), \ - self.master.server.native_volume_info() - fgn_vi = None - if fgn_vis: - if len(fgn_vis) > 1: - raise GsyncdError("cannot work with multiple foreign masters") - fgn_vi = fgn_vis[0] - return fgn_vi, nat_vi - - @property - def uuid(self): - if self.volinfo: - return self.volinfo['uuid'] - - @property - def volmark(self): - if self.volinfo: - return self.volinfo['volume_mark'] - - @property - def inter_master(self): - """decide if we are an intermediate master - in a cascading setup - """ - return self.volinfo_state[self.KFGN] and True or False - - def xtime(self, path, *a, **opts): - """get amended xtime - - as of amending, we can create missing xtime, or - determine a valid value if what we get is expired - (as of the volume mark expiry); way of amendig - depends on @opts and on subject of query (master - or slave). - """ - if a: - rsc = a[0] - else: - rsc = self.master - self.make_xtime_opts(rsc == self.master, opts) - return self.xtime_low(rsc.server, path, **opts) - - def __init__(self, master, slave): - self.master = master - self.slave = slave - self.jobtab = {} - self.syncer = Syncer(slave) - # crawls vs. turns: - # - self.crawls is simply the number of crawl() invocations on root - # - one turn is a maximal consecutive sequence of crawls so that each - # crawl in it detects a change to be synced - # - self.turns is the number of turns since start - # - self.total_turns is a limit so that if self.turns reaches it, then - # we exit (for diagnostic purposes) - # so, eg., if the master fs changes unceasingly, self.turns will remain 0. - self.crawls = 0 - self.turns = 0 - self.total_turns = int(gconf.turns) - self.lastreport = {'crawls': 0, 'turns': 0} - self.start = None - self.change_seen = None - self.syncTime=0 - self.lastSyncTime=0 - self.crawlStartTime=0 - self.crawlTime=0 - self.filesSynced=0 - self.bytesSynced=0 - # the authoritative (foreign, native) volinfo pair - # which lets us deduce what to do when we refetch - # the volinfos from system - uuid_preset = getattr(gconf, 'volume_id', None) - self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None) - # the actual volinfo we make use of - self.volinfo = None - self.terminate = False - self.checkpoint_thread = None - - @classmethod - def _checkpt_param(cls, chkpt, prm, xtimish=True): - """use config backend to lookup a parameter belonging to - checkpoint @chkpt""" - cprm = getattr(gconf, 'checkpoint_' + prm, None) - if not cprm: - return - chkpt_mapped, val = cprm.split(':', 1) - if unescape(chkpt_mapped) != chkpt: - return - if xtimish: - val = cls.deserialize_xtime(val) - return val - - @classmethod - def _set_checkpt_param(cls, chkpt, prm, val, xtimish=True): - """use config backend to store a parameter associated - with checkpoint @chkpt""" - if xtimish: - val = cls.serialize_xtime(val) - gconf.configinterface.set('checkpoint_' + prm, "%s:%s" % (escape(chkpt), val)) - - @staticmethod - def humantime(*tpair): - """format xtime-like (sec, nsec) pair to human readable format""" - ts = datetime.fromtimestamp(float('.'.join(str(n) for n in tpair))).\ - strftime("%Y-%m-%d %H:%M:%S") - if len(tpair) > 1: - ts += '.' + str(tpair[1]) - return ts - - def get_extra_info(self): - str_info="\nFile synced : %d" %(self.filesSynced) - str_info+="\nBytes Synced : %d KB" %(self.syncer.bytesSynced) - str_info+="\nSync Time : %f seconds" %(self.syncTime) - self.crawlTime=datetime.now()-self.crawlStartTime - years , days =divmod(self.crawlTime.days,365.25) - years=int(years) - days=int(days) - - date="" - m, s = divmod(self.crawlTime.seconds, 60) - h, m = divmod(m, 60) - - if years!=0 : - date+=str(years)+" year " - if days!=0 : - date+=str(days)+" day " - if h!=0 : - date+=str(h)+" H : " - if m!=0 or h!=0 : - date+=str(m)+" M : " - - date+=str(s)+" S" - self.crawlTime=date - str_info+="\nCrawl Time : %s" %(str(self.crawlTime)) - str_info+="\n\0" - return str_info - - def checkpt_service(self, chan, chkpt, tgt): - """checkpoint service loop - - monitor and verify checkpoint status for @chkpt, and listen - for incoming requests for whom we serve a pretty-formatted - status report""" - if not chkpt: - # dummy loop for the case when there is no checkpt set - while True: - select([chan], [], []) - conn, _ = chan.accept() - conn.send(self.get_extra_info()) - conn.close() - completed = self._checkpt_param(chkpt, 'completed', xtimish=False) - if completed: - completed = tuple(int(x) for x in completed.split('.')) - while True: - s,_,_ = select([chan], [], [], (not completed) and 5 or None) - # either request made and we re-check to not - # give back stale data, or we still hunting for completion - if self.native_xtime(tgt) and self.native_xtime(tgt) < self.volmark: - # indexing has been reset since setting the checkpoint - status = "is invalid" - else: - xtr = self.xtime('.', self.slave) - if isinstance(xtr, int): - raise GsyncdError("slave root directory is unaccessible (%s)", - os.strerror(xtr)) - ncompleted = self.xtime_geq(xtr, tgt) - if completed and not ncompleted: # stale data - logging.warn("completion time %s for checkpoint %s became stale" % \ - (self.humantime(*completed), chkpt)) - completed = None - gconf.confdata.delete('checkpoint-completed') - if ncompleted and not completed: # just reaching completion - completed = "%.6f" % time.time() - self._set_checkpt_param(chkpt, 'completed', completed, xtimish=False) - completed = tuple(int(x) for x in completed.split('.')) - logging.info("checkpoint %s completed" % chkpt) - status = completed and \ - "completed at " + self.humantime(completed[0]) or \ - "not reached yet" - if s: - conn = None - try: - conn, _ = chan.accept() - try: - conn.send(" | checkpoint %s %s %s" % (chkpt, status,self.get_extra_info())) - except: - exc = sys.exc_info()[1] - if (isinstance(exc, OSError) or isinstance(exc, IOError)) and \ - exc.errno == EPIPE: - logging.debug('checkpoint client disconnected') - else: - raise - finally: - if conn: - conn.close() - - def start_checkpoint_thread(self): - """prepare and start checkpoint service""" - if self.checkpoint_thread or not ( - getattr(gconf, 'state_socket_unencoded', None) and getattr(gconf, 'socketdir', None) - ): - return - chan = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - state_socket = os.path.join(gconf.socketdir, md5(gconf.state_socket_unencoded).hexdigest() + ".socket") - try: - os.unlink(state_socket) - except: - if sys.exc_info()[0] == OSError: - pass - chan.bind(state_socket) - chan.listen(1) - checkpt_tgt = None - if gconf.checkpoint: - checkpt_tgt = self._checkpt_param(gconf.checkpoint, 'target') - if not checkpt_tgt: - checkpt_tgt = self.xtime('.') - if isinstance(checkpt_tgt, int): - raise GsyncdError("master root directory is unaccessible (%s)", - os.strerror(checkpt_tgt)) - self._set_checkpt_param(gconf.checkpoint, 'target', checkpt_tgt) - logging.debug("checkpoint target %s has been determined for checkpoint %s" % \ - (repr(checkpt_tgt), gconf.checkpoint)) - t = Thread(target=self.checkpt_service, args=(chan, gconf.checkpoint, checkpt_tgt)) - t.start() - self.checkpoint_thread = t - - def crawl_loop(self): - """start the keep-alive thread and iterate .crawl""" - timo = int(gconf.timeout or 0) - if timo > 0: - def keep_alive(): - while True: - vi, gap = self.keepalive_payload_hook(timo, timo * 0.5) - self.slave.server.keep_alive(vi) - time.sleep(gap) - t = Thread(target=keep_alive) - t.start() - self.lastreport['time'] = time.time() - self.crawlStartTime=datetime.now() - while not self.terminate: - self.crawl() - - def add_job(self, path, label, job, *a, **kw): - """insert @job function to job table at @path with @label""" - if self.jobtab.get(path) == None: - self.jobtab[path] = [] - self.jobtab[path].append((label, a, lambda : job(*a, **kw))) - - def add_failjob(self, path, label): - """invoke .add_job with a job that does nothing just fails""" - logging.debug('salvaged: ' + label) - self.add_job(path, label, lambda: False) - - def wait(self, path, *args): - """perform jobs registered for @path - - Reset jobtab entry for @path, - determine success as the conjuction of - success of all the jobs. In case of - success, call .sendmark on @path - """ - jobs = self.jobtab.pop(path, []) - succeed = True - for j in jobs: - ret = j[-1]() - if not ret: - succeed = False - if succeed: - self.sendmark(path, *args) - return succeed - - def sendmark(self, path, mark, adct=None): - """update slave side xtime for @path to master side xtime - - also can send a setattr payload (see Server.setattr). - """ - if adct: - self.slave.server.setattr(path, adct) - self.set_slave_xtime(path, mark) - - @staticmethod - def volinfo_state_machine(volinfo_state, volinfo_sys): - """compute new volinfo_state from old one and incoming - as of current system state, also indicating if there was a - change regarding which volume mark is the authoritative one - - @volinfo_state, @volinfo_sys are pairs of volume mark dicts - (foreign, native). - - Note this method is marked as static, ie. the computation is - pure, without reliance on any excess implicit state. State - transitions which are deemed as ambiguous or banned will raise - an exception. - - """ - # store the value below "boxed" to emulate proper closures - # (variables of the enclosing scope are available inner functions - # provided they are no reassigned; mutation is OK). - param = FreeObject(relax_mismatch = False, state_change = None, index=-1) - def select_vi(vi0, vi): - param.index += 1 - if vi and (not vi0 or vi0['uuid'] == vi['uuid']): - if not vi0 and not param.relax_mismatch: - param.state_change = param.index - # valid new value found; for the rest, we are graceful about - # uuid mismatch - param.relax_mismatch = True - return vi - if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch: - # uuid mismatch for master candidate, bail out - raise GsyncdError("aborting on uuid change from %s to %s" % \ - (vi0['uuid'], vi['uuid'])) - # fall back to old - return vi0 - newstate = tuple(select_vi(*vip) for vip in zip(volinfo_state, volinfo_sys)) - srep = lambda vi: vi and vi['uuid'][0:8] - logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \ - tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate)) - return newstate, param.state_change - - def crawl(self, path='.', xtl=None): - """crawling... - - Standing around - All the right people - Crawling - Tennis on Tuesday - The ladder is long - It is your nature - You've gotta suntan - Football on Sunday - Society boy - - Recursively walk the master side tree and check if updates are - needed due to xtime differences. One invocation of crawl checks - children of @path and do a recursive enter only on - those directory children where there is an update needed. - - Way of updates depend on file type: - - for symlinks, sync them directy and synchronously - - for regular children, register jobs for @path (cf. .add_job) to start - and wait on their rsync - - for directory children, register a job for @path which waits (.wait) - on jobs for the given child - (other kind of filesystem nodes are not considered) - - Those slave side children which do not exist on master are simply - purged (see Server.purge). - - Behavior is fault tolerant, synchronization is adaptive: if some action fails, - just go on relentlessly, adding a fail job (see .add_failjob) which will prevent - the .sendmark on @path, so when the next crawl will arrive to @path it will not - see it as up-to-date and will try to sync it again. While this semantics can be - supported by funky design principles (http://c2.com/cgi/wiki?LazinessImpatienceHubris), - the ultimate reason which excludes other possibilities is simply transience: we cannot - assert that the file systems (master / slave) underneath do not change and actions - taken upon some condition will not lose their context by the time they are performed. - """ - if path == '.': - if self.start: - self.crawls += 1 - logging.debug("... crawl #%d done, took %.6f seconds" % \ - (self.crawls, time.time() - self.start)) - time.sleep(1) - self.start = time.time() - should_display_info = self.start - self.lastreport['time'] >= 60 - if should_display_info: - logging.info("completed %d crawls, %d turns", - self.crawls - self.lastreport['crawls'], - self.turns - self.lastreport['turns']) - self.lastreport.update(crawls = self.crawls, - turns = self.turns, - time = self.start) - volinfo_sys, state_change = self.volinfo_hook() - if self.inter_master: - self.volinfo = volinfo_sys[self.KFGN] - else: - self.volinfo = volinfo_sys[self.KNAT] - if state_change == self.KFGN or (state_change == self.KNAT and not self.inter_master): - logging.info('new master is %s', self.uuid) - if self.volinfo: - logging.info("%s master with volume id %s ..." % \ - (self.inter_master and "intermediate" or "primary", - self.uuid)) - if state_change == self.KFGN: - gconf.configinterface.set('volume_id', self.uuid) - if self.volinfo: - if self.volinfo['retval']: - raise GsyncdError ("master is corrupt") - self.start_checkpoint_thread() - else: - if should_display_info or self.crawls == 0: - if self.inter_master: - logging.info("waiting for being synced from %s ..." % \ - self.volinfo_state[self.KFGN]['uuid']) - else: - logging.info("waiting for volume info ...") - return - logging.debug("entering " + path) - if not xtl: - xtl = self.xtime(path) - if isinstance(xtl, int): - self.add_failjob(path, 'no-local-node') - return - xtr = self.xtime(path, self.slave) - if isinstance(xtr, int): - if xtr != ENOENT: - self.slave.server.purge(path) - try: - self.slave.server.mkdir(path) - except OSError: - self.add_failjob(path, 'no-remote-node') - return - xtr = self.minus_infinity - else: - self.xtime_reversion_hook(path, xtl, xtr) - if xtl == xtr: - if path == '.' and self.change_seen: - self.turns += 1 - self.change_seen = False - if self.total_turns: - logging.info("finished turn #%s/%s" % \ - (self.turns, self.total_turns)) - if self.turns == self.total_turns: - logging.info("reached turn limit") - self.terminate = True - return - if path == '.': - self.change_seen = True - try: - dem = self.master.server.entries(path) - except OSError: - self.add_failjob(path, 'local-entries-fail') - return - random.shuffle(dem) - try: - des = self.slave.server.entries(path) - except OSError: - self.slave.server.purge(path) - try: - self.slave.server.mkdir(path) - des = self.slave.server.entries(path) - except OSError: - self.add_failjob(path, 'remote-entries-fail') - return - dd = set(des) - set(dem) - if dd: - self.purge_missing(path, dd) - chld = [] - for e in dem: - e = os.path.join(path, e) - xte = self.xtime(e) - if isinstance(xte, int): - logging.warn("irregular xtime for %s: %s" % (e, errno.errorcode[xte])) - elif self.need_sync(e, xte, xtr): - chld.append((e, xte)) - def indulgently(e, fnc, blame=None): - if not blame: - blame = path - try: - return fnc(e) - except (IOError, OSError): - ex = sys.exc_info()[1] - if ex.errno == ENOENT: - logging.warn("salvaged ENOENT for " + e) - self.add_failjob(blame, 'by-indulgently') - return False - else: - raise - for e, xte in chld: - st = indulgently(e, lambda e: os.lstat(e)) - if st == False: - continue - mo = st.st_mode - adct = {'own': (st.st_uid, st.st_gid)} - if stat.S_ISLNK(mo): - if indulgently(e, lambda e: self.slave.server.symlink(os.readlink(e), e)) == False: - continue - self.sendmark(e, xte, adct) - elif stat.S_ISREG(mo): - logging.debug("syncing %s ..." % e) - pb = self.syncer.add(e) - timeA=datetime.now() - def regjob(e, xte, pb): - if pb.wait(): - logging.debug("synced " + e) - self.sendmark_regular(e, xte) - - timeB=datetime.now() - self.lastSyncTime=timeB-timeA - self.syncTime=(self.syncTime+self.lastSyncTime.microseconds)/(10.0**6) - self.filesSynced=self.filesSynced+1 - return True - else: - logging.warn("failed to sync " + e) - self.add_job(path, 'reg', regjob, e, xte, pb) - elif stat.S_ISDIR(mo): - adct['mode'] = mo - if indulgently(e, lambda e: (self.add_job(path, 'cwait', self.wait, e, xte, adct), - self.crawl(e, xte), - True)[-1], blame=e) == False: - continue - else: - # ignore fifos, sockets and special files - pass - if path == '.': - self.wait(path, xtl) - -class BoxClosedErr(Exception): - pass - -class PostBox(list): - """synchronized collection for storing things thought of as "requests" """ - - def __init__(self, *a): - list.__init__(self, *a) - # too bad Python stdlib does not have read/write locks... - # it would suffivce to grab the lock in .append as reader, in .close as writer - self.lever = Condition() - self.open = True - self.done = False - - def wait(self): - """wait on requests to be processed""" - self.lever.acquire() - if not self.done: - self.lever.wait() - self.lever.release() - return self.result - - def wakeup(self, data): - """wake up requestors with the result""" - self.result = data - self.lever.acquire() - self.done = True - self.lever.notifyAll() - self.lever.release() - - def append(self, e): - """post a request""" - self.lever.acquire() - if not self.open: - raise BoxClosedErr - list.append(self, e) - self.lever.release() - - def close(self): - """prohibit the posting of further requests""" - self.lever.acquire() - self.open = False - self.lever.release() - -class Syncer(object): - """a staged queue to relay rsync requests to rsync workers - - By "staged queue" its meant that when a consumer comes to the - queue, it takes _all_ entries, leaving the queue empty. - (I don't know if there is an official term for this pattern.) - - The queue uses a PostBox to accumulate incoming items. - When a consumer (rsync worker) comes, a new PostBox is - set up and the old one is passed on to the consumer. - - Instead of the simplistic scheme of having one big lock - which synchronizes both the addition of new items and - PostBox exchanges, use a separate lock to arbitrate consumers, - and rely on PostBox's synchronization mechanisms take - care about additions. - - There is a corner case racy situation, producers vs. consumers, - which is not handled by this scheme: namely, when the PostBox - exchange occurs in between being passed to the producer for posting - and the post placement. But that's what Postbox.close is for: - such a posting will find the PostBox closed, in which case - the producer can re-try posting against the actual PostBox of - the queue. - - To aid accumlation of items in the PostBoxen before grabbed - by an rsync worker, the worker goes to sleep a bit after - each completed syncjob. - """ - - def __init__(self, slave): - """spawn worker threads""" - self.slave = slave - self.lock = Lock() - self.pb = PostBox() - self.bytesSynced=0 - for i in range(int(gconf.sync_jobs)): - t = Thread(target=self.syncjob) - t.start() - - def syncjob(self): - """the life of a worker""" - while True: - pb = None - while True: - self.lock.acquire() - if self.pb: - pb, self.pb = self.pb, PostBox() - self.lock.release() - if pb: - break - time.sleep(0.5) - pb.close() - po = self.slave.rsync(pb) - if po.returncode == 0: - regEx=re.search('\ *total\ *transferred\ *file\ *size:\ *(\d+)\ *bytes\ *',po.stdout.read(),re.IGNORECASE) - if regEx: - self.bytesSynced+=(int(regEx.group(1)))/1024 - ret = True - elif po.returncode in (23, 24): - # partial transfer (cf. rsync(1)), that's normal - ret = False - else: - po.errfail() - pb.wakeup(ret) - - def add(self, e): - while True: - pb = self.pb - try: - pb.append(e) - return pb - except BoxClosedErr: - pass diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py deleted file mode 100644 index b8956dcc2..000000000 --- a/xlators/features/marker/utils/syncdaemon/monitor.py +++ /dev/null @@ -1,129 +0,0 @@ -import os -import sys -import time -import signal -import logging -from gconf import gconf -from syncdutils import update_file, select, waitpid, set_term_handler - -class Monitor(object): - """class which spawns and manages gsyncd workers""" - - def __init__(self): - self.state = None - - def set_state(self, state): - """set the state that can be used by external agents - like glusterd for status reporting""" - if state == self.state: - return - self.state = state - logging.info('new state: %s' % state) - if getattr(gconf, 'state_file', None): - update_file(gconf.state_file, lambda f: f.write(state + '\n')) - - def monitor(self): - """the monitor loop - - Basic logic is a blantantly simple blunt heuristics: - if spawned client survives 60 secs, it's considered OK. - This servers us pretty well as it's not vulneralbe to - any kind of irregular behavior of the child... - - ... well, except for one: if children is hung up on - waiting for some event, it can survive aeons, still - will be defunct. So we tweak the above logic to - expect the worker to send us a signal within 60 secs - (in the form of closing its end of a pipe). The worker - does this when it's done with the setup stage - ready to enter the service loop (note it's the setup - stage which is vulnerable to hangs -- the full - blown worker blows up on EPIPE if the net goes down, - due to the keep-alive thread) - """ - def sigcont_handler(*a): - """ - Re-init logging and send group kill signal - """ - md = gconf.log_metadata - logging.shutdown() - lcls = logging.getLoggerClass() - lcls.setup(label=md.get('saved_label'), **md) - pid = os.getpid() - os.kill(-pid, signal.SIGUSR1) - signal.signal(signal.SIGUSR1, lambda *a: ()) - signal.signal(signal.SIGCONT, sigcont_handler) - - argv = sys.argv[:] - for o in ('-N', '--no-daemon', '--monitor'): - while o in argv: - argv.remove(o) - argv.extend(('-N', '-p', '')) - argv.insert(0, os.path.basename(sys.executable)) - - self.set_state('starting...') - ret = 0 - def nwait(p, o=0): - p2, r = waitpid(p, o) - if not p2: - return - return r - def exit_signalled(s): - """ child teminated due to receipt of SIGUSR1 """ - return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1)) - def exit_status(s): - if os.WIFEXITED(s): - return os.WEXITSTATUS(s) - return 1 - conn_timeout = int(gconf.connection_timeout) - while ret in (0, 1): - logging.info('-' * conn_timeout) - logging.info('starting gsyncd worker') - pr, pw = os.pipe() - cpid = os.fork() - if cpid == 0: - os.close(pr) - os.execv(sys.executable, argv + ['--feedback-fd', str(pw)]) - os.close(pw) - t0 = time.time() - so = select((pr,), (), (), conn_timeout)[0] - os.close(pr) - if so: - ret = nwait(cpid, os.WNOHANG) - if ret != None: - logging.debug("worker died before establishing connection") - else: - logging.debug("worker seems to be connected (?? racy check)") - while time.time() < t0 + conn_timeout: - ret = nwait(cpid, os.WNOHANG) - if ret != None: - logging.debug("worker died in startup phase") - break - time.sleep(1) - else: - logging.debug("worker not confirmed in %d sec, aborting it" % \ - conn_timeout) - # relax one SIGTERM by setting a handler that sets back - # standard handler - set_term_handler(lambda *a: set_term_handler()) - # give a chance to graceful exit - os.kill(-os.getpid(), signal.SIGTERM) - time.sleep(1) - os.kill(cpid, signal.SIGKILL) - ret = nwait(cpid) - if ret == None: - self.set_state('OK') - ret = nwait(cpid) - if exit_signalled(ret): - ret = 0 - else: - ret = exit_status(ret) - if ret in (0,1): - self.set_state('faulty') - time.sleep(10) - self.set_state('inconsistent') - return ret - -def monitor(): - """oh yeah, actually Monitor is used as singleton, too""" - return Monitor().monitor() diff --git a/xlators/features/marker/utils/syncdaemon/repce.py b/xlators/features/marker/utils/syncdaemon/repce.py deleted file mode 100644 index 755fb61df..000000000 --- a/xlators/features/marker/utils/syncdaemon/repce.py +++ /dev/null @@ -1,225 +0,0 @@ -import os -import sys -import time -import logging -from threading import Condition -try: - import thread -except ImportError: - # py 3 - import _thread as thread -try: - from Queue import Queue -except ImportError: - # py 3 - from queue import Queue -try: - import cPickle as pickle -except ImportError: - # py 3 - import pickle - -from syncdutils import Thread, select - -pickle_proto = -1 -repce_version = 1.0 - -def ioparse(i, o): - if isinstance(i, int): - i = os.fdopen(i) - # rely on duck typing for recognizing - # streams as that works uniformly - # in py2 and py3 - if hasattr(o, 'fileno'): - o = o.fileno() - return (i, o) - -def send(out, *args): - """pickle args and write out wholly in one syscall - - ie. not use the ability of pickle to dump directly to - a stream, as that would potentially mess up messages - by interleaving them - """ - os.write(out, pickle.dumps(args, pickle_proto)) - -def recv(inf): - """load an object from input stream""" - return pickle.load(inf) - - -class RepceServer(object): - """RePCe is Hungarian for canola, http://hu.wikipedia.org/wiki/Repce - - ... also our homebrewed RPC backend where the transport layer is - reduced to a pair of filehandles. - - This is the server component. - """ - - def __init__(self, obj, i, o, wnum=6): - """register a backend object .obj to which incoming messages - are dispatched, also incoming/outcoming streams - """ - self.obj = obj - self.inf, self.out = ioparse(i, o) - self.wnum = wnum - self.q = Queue() - - def service_loop(self): - """fire up worker threads, get messages and dispatch among them""" - for i in range(self.wnum): - t = Thread(target=self.worker) - t.start() - try: - while True: - self.q.put(recv(self.inf)) - except EOFError: - logging.info("terminating on reaching EOF.") - - def worker(self): - """life of a worker - - Get message, extract its id, method name and arguments - (kwargs not supported), call method on .obj. - Send back message id + return value. - If method call throws an exception, rescue it, and send - back the exception as result (with flag marking it as - exception). - """ - while True: - in_data = self.q.get(True) - rid = in_data[0] - rmeth = in_data[1] - exc = False - if rmeth == '__repce_version__': - res = repce_version - else: - try: - res = getattr(self.obj, rmeth)(*in_data[2:]) - except: - res = sys.exc_info()[1] - exc = True - logging.exception("call failed: ") - send(self.out, rid, exc, res) - - -class RepceJob(object): - """class representing message status we can use - for waiting on reply""" - - def __init__(self, cbk): - """ - - .rid: (process-wise) unique id - - .cbk: what we do upon receiving reply - """ - self.rid = (os.getpid(), thread.get_ident(), time.time()) - self.cbk = cbk - self.lever = Condition() - self.done = False - - def __repr__(self): - return ':'.join([str(x) for x in self.rid]) - - def wait(self): - self.lever.acquire() - if not self.done: - self.lever.wait() - self.lever.release() - return self.result - - def wakeup(self, data): - self.result = data - self.lever.acquire() - self.done = True - self.lever.notify() - self.lever.release() - - -class RepceClient(object): - """RePCe is Hungarian for canola, http://hu.wikipedia.org/wiki/Repce - - ... also our homebrewed RPC backend where the transport layer is - reduced to a pair of filehandles. - - This is the client component. - """ - - def __init__(self, i, o): - self.inf, self.out = ioparse(i, o) - self.jtab = {} - t = Thread(target = self.listen) - t.start() - - def listen(self): - while True: - select((self.inf,), (), ()) - rid, exc, res = recv(self.inf) - rjob = self.jtab.pop(rid) - if rjob.cbk: - rjob.cbk(rjob, [exc, res]) - - def push(self, meth, *args, **kw): - """wrap arguments in a RepceJob, send them to server - and return the RepceJob - - @cbk to pass on RepceJob can be given as kwarg. - """ - cbk = kw.get('cbk') - if not cbk: - def cbk(rj, res): - if res[0]: - raise res[1] - rjob = RepceJob(cbk) - self.jtab[rjob.rid] = rjob - logging.debug("call %s %s%s ..." % (repr(rjob), meth, repr(args))) - send(self.out, rjob.rid, meth, *args) - return rjob - - def __call__(self, meth, *args): - """RePCe client is callabe, calling it implements a synchronous remote call - - We do a .push with a cbk which does a wakeup upon receiving anwser, then wait - on the RepceJob. - """ - rjob = self.push(meth, *args, **{'cbk': lambda rj, res: rj.wakeup(res)}) - exc, res = rjob.wait() - if exc: - logging.error('call %s (%s) failed on peer with %s' % (repr(rjob), meth, str(type(res).__name__))) - raise res - logging.debug("call %s %s -> %s" % (repr(rjob), meth, repr(res))) - return res - - class mprx(object): - """method proxy, standard trick to implement rubyesque method_missing - in Python - - A class is a closure factory, you know what I mean, or go read some SICP. - """ - def __init__(self, ins, meth): - self.ins = ins - self.meth = meth - - def __call__(self, *a): - return self.ins(self.meth, *a) - - def __getattr__(self, meth): - """this implements transparent method dispatch to remote object, - so that you don't need to call the RepceClient instance like - - rclient('how_old_are_you_if_born_in', 1979) - - but you can make it into an ordinary method call like - - rclient.how_old_are_you_if_born_in(1979) - """ - return self.mprx(self, meth) - - def __version__(self): - """used in handshake to verify compatibility""" - d = {'proto': self('__repce_version__')} - try: - d['object'] = self('version') - except AttributeError: - pass - return d diff --git a/xlators/features/marker/utils/syncdaemon/resource.py b/xlators/features/marker/utils/syncdaemon/resource.py deleted file mode 100644 index 73102fbcb..000000000 --- a/xlators/features/marker/utils/syncdaemon/resource.py +++ /dev/null @@ -1,972 +0,0 @@ -import re -import os -import sys -import stat -import time -import fcntl -import errno -import struct -import socket -import logging -import tempfile -import threading -import subprocess -from errno import EEXIST, ENOENT, ENODATA, ENOTDIR, ELOOP, EISDIR -from select import error as SelectError - -from gconf import gconf -import repce -from repce import RepceServer, RepceClient -from master import gmaster_builder -import syncdutils -from syncdutils import GsyncdError, select, privileged, boolify - -UrlRX = re.compile('\A(\w+)://([^ *?[]*)\Z') -HostRX = re.compile('[a-z\d](?:[a-z\d.-]*[a-z\d])?', re.I) -UserRX = re.compile("[\w!\#$%&'*+-\/=?^_`{|}~]+") - -def sup(x, *a, **kw): - """a rubyesque "super" for python ;) - - invoke caller method in parent class with given args. - """ - return getattr(super(type(x), x), sys._getframe(1).f_code.co_name)(*a, **kw) - -def desugar(ustr): - """transform sugared url strings to standard <scheme>://<urlbody> form - - parsing logic enforces the constraint that sugared forms should contatin - a ':' or a '/', which ensures that sugared urls do not conflict with - gluster volume names. - """ - m = re.match('([^:]*):(.*)', ustr) - if m: - if not m.groups()[0]: - return "gluster://localhost" + ustr - elif '@' in m.groups()[0] or re.search('[:/]', m.groups()[1]): - return "ssh://" + ustr - else: - return "gluster://" + ustr - else: - if ustr[0] != '/': - raise GsyncdError("cannot resolve sugared url '%s'" % ustr) - ap = os.path.normpath(ustr) - if ap.startswith('//'): - ap = ap[1:] - return "file://" + ap - -def gethostbyname(hnam): - """gethostbyname wrapper""" - try: - return socket.gethostbyname(hnam) - except socket.gaierror: - ex = sys.exc_info()[1] - raise GsyncdError("failed to resolve %s: %s" % \ - (hnam, ex.strerror)) - -def parse_url(ustr): - """instantiate an url object by scheme-to-class dispatch - - The url classes taken into consideration are the ones in - this module whose names are full-caps. - """ - m = UrlRX.match(ustr) - if not m: - ustr = desugar(ustr) - m = UrlRX.match(ustr) - if not m: - raise GsyncdError("malformed url") - sch, path = m.groups() - this = sys.modules[__name__] - if not hasattr(this, sch.upper()): - raise GsyncdError("unknown url scheme " + sch) - return getattr(this, sch.upper())(path) - - -class _MetaXattr(object): - """singleton class, a lazy wrapper around the - libcxattr module - - libcxattr (a heavy import due to ctypes) is - loaded only when when the single - instance is tried to be used. - - This reduces runtime for those invocations - which do not need filesystem manipulation - (eg. for config, url parsing) - """ - - def __getattr__(self, meth): - from libcxattr import Xattr as LXattr - xmeth = [ m for m in dir(LXattr) if m[0] != '_' ] - if not meth in xmeth: - return - for m in xmeth: - setattr(self, m, getattr(LXattr, m)) - return getattr(self, meth) - -Xattr = _MetaXattr() - - -class Popen(subprocess.Popen): - """customized subclass of subprocess.Popen with a ring - buffer for children error output""" - - @classmethod - def init_errhandler(cls): - """start the thread which handles children's error output""" - cls.errstore = {} - def tailer(): - while True: - errstore = cls.errstore.copy() - try: - poe, _ ,_ = select([po.stderr for po in errstore], [], [], 1) - except (ValueError, SelectError): - continue - for po in errstore: - if po.stderr not in poe: - continue - po.lock.acquire() - try: - if po.on_death_row: - continue - la = errstore[po] - try: - fd = po.stderr.fileno() - except ValueError: # file is already closed - continue - l = os.read(fd, 1024) - if not l: - continue - tots = len(l) - for lx in la: - tots += len(lx) - while tots > 1<<20 and la: - tots -= len(la.pop(0)) - la.append(l) - finally: - po.lock.release() - t = syncdutils.Thread(target = tailer) - t.start() - cls.errhandler = t - - @classmethod - def fork(cls): - """fork wrapper that restarts errhandler thread in child""" - pid = os.fork() - if not pid: - cls.init_errhandler() - return pid - - def __init__(self, args, *a, **kw): - """customizations for subprocess.Popen instantiation - - - 'close_fds' is taken to be the default - - if child's stderr is chosen to be managed, - register it with the error handler thread - """ - self.args = args - if 'close_fds' not in kw: - kw['close_fds'] = True - self.lock = threading.Lock() - self.on_death_row = False - try: - sup(self, args, *a, **kw) - except: - ex = sys.exc_info()[1] - if not isinstance(ex, OSError): - raise - raise GsyncdError("""execution of "%s" failed with %s (%s)""" % \ - (args[0], errno.errorcode[ex.errno], os.strerror(ex.errno))) - if kw.get('stderr') == subprocess.PIPE: - assert(getattr(self, 'errhandler', None)) - self.errstore[self] = [] - - def errlog(self): - """make a log about child's failure event""" - filling = "" - if self.elines: - filling = ", saying:" - logging.error("""command "%s" returned with %s%s""" % \ - (" ".join(self.args), repr(self.returncode), filling)) - lp = '' - def logerr(l): - logging.error(self.args[0] + "> " + l) - for l in self.elines: - ls = l.split('\n') - ls[0] = lp + ls[0] - lp = ls.pop() - for ll in ls: - logerr(ll) - if lp: - logerr(lp) - - def errfail(self): - """fail nicely if child did not terminate with success""" - self.errlog() - syncdutils.finalize(exval = 1) - - def terminate_geterr(self, fail_on_err = True): - """kill child, finalize stderr harvesting (unregister - from errhandler, set up .elines), fail on error if - asked for - """ - self.lock.acquire() - try: - self.on_death_row = True - finally: - self.lock.release() - elines = self.errstore.pop(self) - if self.poll() == None: - self.terminate() - if self.poll() == None: - time.sleep(0.1) - self.kill() - self.wait() - while True: - if not select([self.stderr],[],[],0.1)[0]: - break - b = os.read(self.stderr.fileno(), 1024) - if b: - elines.append(b) - else: - break - self.stderr.close() - self.elines = elines - if fail_on_err and self.returncode != 0: - self.errfail() - - -class Server(object): - """singleton implemening those filesystem access primitives - which are needed for geo-replication functionality - - (Singleton in the sense it's a class which has only static - and classmethods and is used directly, without instantiation.) - """ - - GX_NSPACE = (privileged() and "trusted" or "system") + ".glusterfs" - NTV_FMTSTR = "!" + "B"*19 + "II" - FRGN_XTRA_FMT = "I" - FRGN_FMTSTR = NTV_FMTSTR + FRGN_XTRA_FMT - - def _pathguard(f): - """decorator method that checks - the path argument of the decorated - functions to make sure it does not - point out of the managed tree - """ - - fc = getattr(f, 'func_code', None) - if not fc: - # python 3 - fc = f.__code__ - pi = list(fc.co_varnames).index('path') - def ff(*a): - path = a[pi] - ps = path.split('/') - if path[0] == '/' or '..' in ps: - raise ValueError('unsafe path') - return f(*a) - return ff - - @staticmethod - @_pathguard - def entries(path): - """directory entries in an array""" - # prevent symlinks being followed - if not stat.S_ISDIR(os.lstat(path).st_mode): - raise OSError(ENOTDIR, os.strerror(ENOTDIR)) - return os.listdir(path) - - @classmethod - @_pathguard - def purge(cls, path, entries=None): - """force-delete subtrees - - If @entries is not specified, delete - the whole subtree under @path (including - @path). - - Otherwise, @entries should be a - a sequence of children of @path, and - the effect is identical with a joint - @entries-less purge on them, ie. - - for e in entries: - cls.purge(os.path.join(path, e)) - """ - me_also = entries == None - if not entries: - try: - # if it's a symlink, prevent - # following it - try: - os.unlink(path) - return - except OSError: - ex = sys.exc_info()[1] - if ex.errno == EISDIR: - entries = os.listdir(path) - else: - raise - except OSError: - ex = sys.exc_info()[1] - if ex.errno in (ENOTDIR, ENOENT, ELOOP): - try: - os.unlink(path) - return - except OSError: - ex = sys.exc_info()[1] - if ex.errno == ENOENT: - return - raise - else: - raise - for e in entries: - cls.purge(os.path.join(path, e)) - if me_also: - os.rmdir(path) - - @classmethod - @_pathguard - def _create(cls, path, ctor): - """path creation backend routine""" - try: - ctor(path) - except OSError: - ex = sys.exc_info()[1] - if ex.errno == EEXIST: - cls.purge(path) - return ctor(path) - raise - - @classmethod - @_pathguard - def mkdir(cls, path): - cls._create(path, os.mkdir) - - @classmethod - @_pathguard - def symlink(cls, lnk, path): - cls._create(path, lambda p: os.symlink(lnk, p)) - - @classmethod - @_pathguard - def xtime(cls, path, uuid): - """query xtime extended attribute - - Return xtime of @path for @uuid as a pair of integers. - "Normal" errors due to non-existent @path or extended attribute - are tolerated and errno is returned in such a case. - """ - - try: - return struct.unpack('!II', Xattr.lgetxattr(path, '.'.join([cls.GX_NSPACE, uuid, 'xtime']), 8)) - except OSError: - ex = sys.exc_info()[1] - if ex.errno in (ENOENT, ENODATA, ENOTDIR): - return ex.errno - else: - raise - - @classmethod - def xtime_vec(cls, path, *uuids): - """vectored version of @xtime - - accepts a list of uuids and returns a dictionary - with uuid as key(s) and xtime as value(s) - """ - xt = {} - for uuid in uuids: - xtu = cls.xtime(path, uuid) - if xtu == ENODATA: - xtu = None - if isinstance(xtu, int): - return xtu - xt[uuid] = xtu - return xt - - @classmethod - @_pathguard - def set_xtime(cls, path, uuid, mark): - """set @mark as xtime for @uuid on @path""" - Xattr.lsetxattr(path, '.'.join([cls.GX_NSPACE, uuid, 'xtime']), struct.pack('!II', *mark)) - - @classmethod - def set_xtime_vec(cls, path, mark_dct): - """vectored (or dictered) version of set_xtime - - ignore values that match @ignore - """ - for u,t in mark_dct.items(): - cls.set_xtime(path, u, t) - - @staticmethod - @_pathguard - def setattr(path, adct): - """set file attributes - - @adct is a dict, where 'own', 'mode' and 'times' - keys are looked for and values used to perform - chown, chmod or utimes on @path. - """ - own = adct.get('own') - if own: - os.lchown(path, *own) - mode = adct.get('mode') - if mode: - os.chmod(path, stat.S_IMODE(mode)) - times = adct.get('times') - if times: - os.utime(path, times) - - @staticmethod - def pid(): - return os.getpid() - - last_keep_alive = 0 - @classmethod - def keep_alive(cls, dct): - """process keepalive messages. - - Return keep-alive counter (number of received keep-alive - messages). - - Now the "keep-alive" message can also have a payload which is - used to set a foreign volume-mark on the underlying file system. - """ - if dct: - key = '.'.join([cls.GX_NSPACE, 'volume-mark', dct['uuid']]) - val = struct.pack(cls.FRGN_FMTSTR, - *(dct['version'] + - tuple(int(x,16) for x in re.findall('(?:[\da-f]){2}', dct['uuid'])) + - (dct['retval'],) + dct['volume_mark'][0:2] + (dct['timeout'],))) - Xattr.lsetxattr('.', key, val) - cls.last_keep_alive += 1 - return cls.last_keep_alive - - @staticmethod - def version(): - """version used in handshake""" - return 1.0 - - -class SlaveLocal(object): - """mix-in class to implement some factes of a slave server - - ("mix-in" is sort of like "abstract class", ie. it's not - instantiated just included in the ancesty DAG. I use "mix-in" - to indicate that it's not used as an abstract base class, - rather just taken in to implement additional functionality - on the basis of the assumed availability of certain interfaces.) - """ - - def can_connect_to(self, remote): - """determine our position in the connectibility matrix""" - return not remote - - def service_loop(self): - """start a RePCe server serving self's server - - stop servicing if a timeout is configured and got no - keep-alime in that inteval - """ - - if boolify(gconf.use_rsync_xattrs) and not privileged(): - raise GsyncdError("using rsync for extended attributes is not supported") - - repce = RepceServer(self.server, sys.stdin, sys.stdout, int(gconf.sync_jobs)) - t = syncdutils.Thread(target=lambda: (repce.service_loop(), - syncdutils.finalize())) - t.start() - logging.info("slave listening") - if gconf.timeout and int(gconf.timeout) > 0: - while True: - lp = self.server.last_keep_alive - time.sleep(int(gconf.timeout)) - if lp == self.server.last_keep_alive: - logging.info("connection inactive for %d seconds, stopping" % int(gconf.timeout)) - break - else: - select((), (), ()) - -class SlaveRemote(object): - """mix-in class to implement an interface to a remote slave""" - - def connect_remote(self, rargs=[], **opts): - """connects to a remote slave - - Invoke an auxiliary utility (slave gsyncd, possibly wrapped) - which sets up the connection and set up a RePCe client to - communicate throuh its stdio. - """ - slave = opts.get('slave', self.url) - extra_opts = [] - so = getattr(gconf, 'session_owner', None) - if so: - extra_opts += ['--session-owner', so] - if boolify(gconf.use_rsync_xattrs): - extra_opts.append('--use-rsync-xattrs') - po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + \ - ['-N', '--listen', '--timeout', str(gconf.timeout), slave], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - gconf.transport = po - return self.start_fd_client(po.stdout, po.stdin, **opts) - - def start_fd_client(self, i, o, **opts): - """set up RePCe client, handshake with server - - It's cut out as a separate method to let - subclasses hook into client startup - """ - self.server = RepceClient(i, o) - rv = self.server.__version__() - exrv = {'proto': repce.repce_version, 'object': Server.version()} - da0 = (rv, exrv) - da1 = ({}, {}) - for i in range(2): - for k, v in da0[i].iteritems(): - da1[i][k] = int(v) - if da1[0] != da1[1]: - raise GsyncdError("RePCe major version mismatch: local %s, remote %s" % (exrv, rv)) - - def rsync(self, files, *args): - """invoke rsync""" - if not files: - raise GsyncdError("no files to sync") - logging.debug("files: " + ", ".join(files)) - argv = gconf.rsync_command.split() + \ - ['-aR0', '--files-from=-', '--super','--stats', '--numeric-ids', '--no-implied-dirs'] + \ - gconf.rsync_options.split() + (boolify(gconf.use_rsync_xattrs) and ['--xattrs'] or []) + \ - ['.'] + list(args) - po = Popen(argv, stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE) - for f in files: - po.stdin.write(f) - po.stdin.write('\0') - - po.stdin.close() - po.wait() - po.terminate_geterr(fail_on_err = False) - - return po - - -class AbstractUrl(object): - """abstract base class for url scheme classes""" - - def __init__(self, path, pattern): - m = re.search(pattern, path) - if not m: - raise GsyncdError("malformed path") - self.path = path - return m.groups() - - @property - def scheme(self): - return type(self).__name__.lower() - - def canonical_path(self): - return self.path - - def get_url(self, canonical=False, escaped=False): - """format self's url in various styles""" - if canonical: - pa = self.canonical_path() - else: - pa = self.path - u = "://".join((self.scheme, pa)) - if escaped: - u = syncdutils.escape(u) - return u - - @property - def url(self): - return self.get_url() - - - ### Concrete resource classes ### - - -class FILE(AbstractUrl, SlaveLocal, SlaveRemote): - """scheme class for file:// urls - - can be used to represent a file slave server - on slave side, or interface to a remote file - file server on master side - """ - - class FILEServer(Server): - """included server flavor""" - pass - - server = FILEServer - - def __init__(self, path): - sup(self, path, '^/') - - def connect(self): - """inhibit the resource beyond""" - os.chdir(self.path) - - def rsync(self, files): - return sup(self, files, self.path) - - -class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): - """scheme class for gluster:// urls - - can be used to represent a gluster slave server - on slave side, or interface to a remote gluster - slave on master side, or to represent master - (slave-ish features come from the mixins, master - functionality is outsourced to GMaster from master) - """ - - class GLUSTERServer(Server): - "server enhancements for a glusterfs backend""" - - @classmethod - def _attr_unpack_dict(cls, xattr, extra_fields = ''): - """generic volume mark fetching/parsing backed""" - fmt_string = cls.NTV_FMTSTR + extra_fields - buf = Xattr.lgetxattr('.', xattr, struct.calcsize(fmt_string)) - vm = struct.unpack(fmt_string, buf) - m = re.match('(.{8})(.{4})(.{4})(.{4})(.{12})', "".join(['%02x' % x for x in vm[2:18]])) - uuid = '-'.join(m.groups()) - volinfo = { 'version': vm[0:2], - 'uuid' : uuid, - 'retval' : vm[18], - 'volume_mark': vm[19:21], - } - if extra_fields: - return volinfo, vm[-len(extra_fields):] - else: - return volinfo - - @classmethod - def foreign_volume_infos(cls): - """return list of valid (not expired) foreign volume marks""" - dict_list = [] - xattr_list = Xattr.llistxattr_buf('.') - for ele in xattr_list: - if ele.find('.'.join([cls.GX_NSPACE, 'volume-mark', ''])) == 0: - d, x = cls._attr_unpack_dict(ele, cls.FRGN_XTRA_FMT) - now = int(time.time()) - if x[0] > now: - logging.debug("volinfo[%s] expires: %d (%d sec later)" % \ - (d['uuid'], x[0], x[0] - now)) - d['timeout'] = x[0] - dict_list.append(d) - else: - try: - Xattr.lremovexattr('.', ele) - except OSError: - pass - return dict_list - - @classmethod - def native_volume_info(cls): - """get the native volume mark of the underlying gluster volume""" - try: - return cls._attr_unpack_dict('.'.join([cls.GX_NSPACE, 'volume-mark'])) - except OSError: - ex = sys.exc_info()[1] - if ex.errno != ENODATA: - raise - - server = GLUSTERServer - - def __init__(self, path): - self.host, self.volume = sup(self, path, '^(%s):(.+)' % HostRX.pattern) - - def canonical_path(self): - return ':'.join([gethostbyname(self.host), self.volume]) - - def can_connect_to(self, remote): - """determine our position in the connectibility matrix""" - return True - - class Mounter(object): - """Abstract base class for mounter backends""" - - def __init__(self, params): - self.params = params - self.mntpt = None - - @classmethod - def get_glusterprog(cls): - return os.path.join(gconf.gluster_command_dir, cls.glusterprog) - - def umount_l(self, d): - """perform lazy umount""" - po = Popen(self.make_umount_argv(d), stderr=subprocess.PIPE) - po.wait() - return po - - @classmethod - def make_umount_argv(cls, d): - raise NotImplementedError - - def make_mount_argv(self, *a): - raise NotImplementedError - - def cleanup_mntpt(self, *a): - pass - - def handle_mounter(self, po): - po.wait() - - def inhibit(self, *a): - """inhibit a gluster filesystem - - Mount glusterfs over a temporary mountpoint, - change into the mount, and lazy unmount the - filesystem. - """ - - mpi, mpo = os.pipe() - mh = Popen.fork() - if mh: - os.close(mpi) - fcntl.fcntl(mpo, fcntl.F_SETFD, fcntl.FD_CLOEXEC) - d = None - margv = self.make_mount_argv(*a) - if self.mntpt: - # mntpt is determined pre-mount - d = self.mntpt - os.write(mpo, d + '\0') - po = Popen(margv, **self.mountkw) - self.handle_mounter(po) - po.terminate_geterr() - logging.debug('auxiliary glusterfs mount in place') - if not d: - # mntpt is determined during mount - d = self.mntpt - os.write(mpo, d + '\0') - os.write(mpo, 'M') - t = syncdutils.Thread(target=lambda: os.chdir(d)) - t.start() - tlim = gconf.starttime + int(gconf.connection_timeout) - while True: - if not t.isAlive(): - break - if time.time() >= tlim: - syncdutils.finalize(exval = 1) - time.sleep(1) - os.close(mpo) - _, rv = syncdutils.waitpid(mh, 0) - if rv: - rv = (os.WIFEXITED(rv) and os.WEXITSTATUS(rv) or 0) - \ - (os.WIFSIGNALED(rv) and os.WTERMSIG(rv) or 0) - logging.warn('stale mount possibly left behind on ' + d) - raise GsyncdError("cleaning up temp mountpoint %s failed with status %d" % \ - (d, rv)) - else: - rv = 0 - try: - os.setsid() - os.close(mpo) - mntdata = '' - while True: - c = os.read(mpi, 1) - if not c: - break - mntdata += c - if mntdata: - mounted = False - if mntdata[-1] == 'M': - mntdata = mntdata[:-1] - assert(mntdata) - mounted = True - assert(mntdata[-1] == '\0') - mntpt = mntdata[:-1] - assert(mntpt) - if mounted: - po = self.umount_l(mntpt) - po.terminate_geterr(fail_on_err = False) - if po.returncode != 0: - po.errlog() - rv = po.returncode - self.cleanup_mntpt(mntpt) - except: - logging.exception('mount cleanup failure:') - rv = 200 - os._exit(rv) - logging.debug('auxiliary glusterfs mount prepared') - - class DirectMounter(Mounter): - """mounter backend which calls mount(8), umount(8) directly""" - - mountkw = {'stderr': subprocess.PIPE} - glusterprog = 'glusterfs' - - @staticmethod - def make_umount_argv(d): - return ['umount', '-l', d] - - def make_mount_argv(self): - self.mntpt = tempfile.mkdtemp(prefix = 'gsyncd-aux-mount-') - return [self.get_glusterprog()] + ['--' + p for p in self.params] + [self.mntpt] - - def cleanup_mntpt(self, mntpt = None): - if not mntpt: - mntpt = self.mntpt - os.rmdir(mntpt) - - class MountbrokerMounter(Mounter): - """mounter backend using the mountbroker gluster service""" - - mountkw = {'stderr': subprocess.PIPE, 'stdout': subprocess.PIPE} - glusterprog = 'gluster' - - @classmethod - def make_cli_argv(cls): - return [cls.get_glusterprog()] + gconf.gluster_cli_options.split() + ['system::'] - - @classmethod - def make_umount_argv(cls, d): - return cls.make_cli_argv() + ['umount', d, 'lazy'] - - def make_mount_argv(self, label): - return self.make_cli_argv() + \ - ['mount', label, 'user-map-root=' + syncdutils.getusername()] + self.params - - def handle_mounter(self, po): - self.mntpt = po.stdout.readline()[:-1] - po.stdout.close() - sup(self, po) - if po.returncode != 0: - # if cli terminated with error due to being - # refused by glusterd, what it put - # out on stdout is a diagnostic message - logging.error('glusterd answered: %s' % self.mntpt) - - def connect(self): - """inhibit the resource beyond - - Choose mounting backend (direct or mountbroker), - set up glusterfs parameters and perform the mount - with given backend - """ - - label = getattr(gconf, 'mountbroker', None) - if not label and not privileged(): - label = syncdutils.getusername() - mounter = label and self.MountbrokerMounter or self.DirectMounter - params = gconf.gluster_params.split() + \ - (gconf.gluster_log_level and ['log-level=' + gconf.gluster_log_level] or []) + \ - ['log-file=' + gconf.gluster_log_file, 'volfile-server=' + self.host, - 'volfile-id=' + self.volume, 'client-pid=-1'] - mounter(params).inhibit(*[l for l in [label] if l]) - - def connect_remote(self, *a, **kw): - sup(self, *a, **kw) - self.slavedir = "/proc/%d/cwd" % self.server.pid() - - def service_loop(self, *args): - """enter service loop - - - if slave given, instantiate GMaster and - pass control to that instance, which implements - master behavior - - else do that's what's inherited - """ - if args: - gmaster_builder()(self, args[0]).crawl_loop() - else: - sup(self, *args) - - def rsync(self, files): - return sup(self, files, self.slavedir) - - -class SSH(AbstractUrl, SlaveRemote): - """scheme class for ssh:// urls - - interface to remote slave on master side - implementing an ssh based proxy - """ - - def __init__(self, path): - self.remote_addr, inner_url = sup(self, path, - '^((?:%s@)?%s):(.+)' % tuple([ r.pattern for r in (UserRX, HostRX) ])) - self.inner_rsc = parse_url(inner_url) - - def canonical_path(self): - m = re.match('([^@]+)@(.+)', self.remote_addr) - if m: - u, h = m.groups() - else: - u, h = syncdutils.getusername(), self.remote_addr - remote_addr = '@'.join([u, gethostbyname(h)]) - return ':'.join([remote_addr, self.inner_rsc.get_url(canonical=True)]) - - def can_connect_to(self, remote): - """determine our position in the connectibility matrix""" - return False - - def start_fd_client(self, *a, **opts): - """customizations for client startup - - - be a no-op if we are to daemonize (client startup is deferred - to post-daemon stage) - - determine target url for rsync after consulting server - """ - if opts.get('deferred'): - return a - sup(self, *a) - ityp = type(self.inner_rsc) - if ityp == FILE: - slavepath = self.inner_rsc.path - elif ityp == GLUSTER: - slavepath = "/proc/%d/cwd" % self.server.pid() - else: - raise NotImplementedError - self.slaveurl = ':'.join([self.remote_addr, slavepath]) - - def connect_remote(self, go_daemon=None): - """connect to inner slave url through outer ssh url - - Wrap the connecting utility in ssh. - - Much care is put into daemonizing: in that case - ssh is started before daemonization, but - RePCe client is to be created after that (as ssh - interactive password auth would be defeated by - a daemonized ssh, while client should be present - only in the final process). In that case the action - is taken apart to two parts, this method is ivoked - once pre-daemon, once post-daemon. Use @go_daemon - to deiced what part to perform. - - [NB. ATM gluster product does not makes use of interactive - authentication.] - """ - if go_daemon == 'done': - return self.start_fd_client(*self.fd_pair) - gconf.setup_ssh_ctl(tempfile.mkdtemp(prefix='gsyncd-aux-ssh-')) - deferred = go_daemon == 'postconn' - ret = sup(self, gconf.ssh_command.split() + gconf.ssh_ctl_args + [self.remote_addr], slave=self.inner_rsc.url, deferred=deferred) - if deferred: - # send a message to peer so that we can wait for - # the answer from which we know connection is - # established and we can proceed with daemonization - # (doing that too early robs the ssh passwd prompt...) - # However, we'd better not start the RepceClient - # before daemonization (that's not preserved properly - # in daemon), we just do a an ad-hoc linear put/get. - i, o = ret - inf = os.fdopen(i) - repce.send(o, None, '__repce_version__') - select((inf,), (), ()) - repce.recv(inf) - # hack hack hack: store a global reference to the file - # to save it from getting GC'd which implies closing it - gconf.permanent_handles.append(inf) - self.fd_pair = (i, o) - return 'should' - - def rsync(self, files): - return sup(self, files, '-e', " ".join(gconf.ssh_command.split() + gconf.ssh_ctl_args), - *(gconf.rsync_ssh_options.split() + [self.slaveurl])) diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py deleted file mode 100644 index 0764c0790..000000000 --- a/xlators/features/marker/utils/syncdaemon/syncdutils.py +++ /dev/null @@ -1,288 +0,0 @@ -import os -import sys -import pwd -import time -import fcntl -import shutil -import logging -from threading import Lock, Thread as baseThread -from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED, EINTR, errorcode -from signal import signal, SIGTERM, SIGKILL -from time import sleep -import select as oselect -from os import waitpid as owaitpid -try: - from cPickle import PickleError -except ImportError: - # py 3 - from pickle import PickleError - -from gconf import gconf - -try: - # py 3 - from urllib import parse as urllib -except ImportError: - import urllib - -def escape(s): - """the chosen flavor of string escaping, used all over - to turn whatever data to creatable representation""" - return urllib.quote_plus(s) - -def unescape(s): - """inverse of .escape""" - return urllib.unquote_plus(s) - -def norm(s): - if s: - return s.replace('-', '_') - -def update_file(path, updater, merger = lambda f: True): - """update a file in a transaction-like manner""" - - fr = fw = None - try: - fd = os.open(path, os.O_CREAT|os.O_RDWR) - try: - fr = os.fdopen(fd, 'r+b') - except: - os.close(fd) - raise - fcntl.lockf(fr, fcntl.LOCK_EX) - if not merger(fr): - return - - tmpp = path + '.tmp.' + str(os.getpid()) - fd = os.open(tmpp, os.O_CREAT|os.O_EXCL|os.O_WRONLY) - try: - fw = os.fdopen(fd, 'wb', 0) - except: - os.close(fd) - raise - updater(fw) - os.fsync(fd) - os.rename(tmpp, path) - finally: - for fx in (fr, fw): - if fx: - fx.close() - -def grabfile(fname, content=None): - """open @fname + contest for its fcntl lock - - @content: if given, set the file content to it - """ - # damn those messy open() mode codes - fd = os.open(fname, os.O_CREAT|os.O_RDWR) - f = os.fdopen(fd, 'r+b', 0) - try: - fcntl.lockf(f, fcntl.LOCK_EX|fcntl.LOCK_NB) - except: - ex = sys.exc_info()[1] - f.close() - if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN): - # cannot grab, it's taken - return - raise - if content: - try: - f.truncate() - f.write(content) - except: - f.close() - raise - gconf.permanent_handles.append(f) - return f - -def grabpidfile(fname=None, setpid=True): - """.grabfile customization for pid files""" - if not fname: - fname = gconf.pid_file - content = None - if setpid: - content = str(os.getpid()) + '\n' - return grabfile(fname, content=content) - -final_lock = Lock() - -def finalize(*a, **kw): - """all those messy final steps we go trough upon termination - - Do away with pidfile, ssh control dir and logging. - """ - final_lock.acquire() - if getattr(gconf, 'pid_file', None): - rm_pidf = gconf.pid_file_owned - if gconf.cpid: - # exit path from parent branch of daemonization - rm_pidf = False - while True: - f = grabpidfile(setpid=False) - if not f: - # child has already taken over pidfile - break - if os.waitpid(gconf.cpid, os.WNOHANG)[0] == gconf.cpid: - # child has terminated - rm_pidf = True - break; - time.sleep(0.1) - if rm_pidf: - try: - os.unlink(gconf.pid_file) - except: - ex = sys.exc_info()[1] - if ex.errno == ENOENT: - pass - else: - raise - if gconf.ssh_ctl_dir and not gconf.cpid: - shutil.rmtree(gconf.ssh_ctl_dir) - if getattr(gconf, 'state_socket', None): - try: - os.unlink(gconf.state_socket) - except: - if sys.exc_info()[0] == OSError: - pass - if gconf.log_exit: - logging.info("exiting.") - sys.stdout.flush() - sys.stderr.flush() - os._exit(kw.get('exval', 0)) - -def log_raise_exception(excont): - """top-level exception handler - - Try to some fancy things to cover up we face with an error. - Translate some weird sounding but well understood exceptions - into human-friendly lingo - """ - is_filelog = False - for h in logging.getLogger().handlers: - fno = getattr(getattr(h, 'stream', None), 'fileno', None) - if fno and not os.isatty(fno()): - is_filelog = True - - exc = sys.exc_info()[1] - if isinstance(exc, SystemExit): - excont.exval = exc.code or 0 - raise - else: - logtag = None - if isinstance(exc, GsyncdError): - if is_filelog: - logging.error(exc.args[0]) - sys.stderr.write('failure: ' + exc.args[0] + '\n') - elif isinstance(exc, PickleError) or isinstance(exc, EOFError) or \ - ((isinstance(exc, OSError) or isinstance(exc, IOError)) and \ - exc.errno == EPIPE): - logging.error('connection to peer is broken') - if hasattr(gconf, 'transport'): - gconf.transport.wait() - if gconf.transport.returncode == 127: - logging.warn("!!!!!!!!!!!!!") - logging.warn('!!! getting "No such file or directory" errors ' - "is most likely due to MISCONFIGURATION, please consult " - "http://access.redhat.com/knowledge/docs/en-US/Red_Hat_Storage/2.0/html/Administration_Guide/chap-User_Guide-Geo_Rep-Preparation-Settingup_Environment.html") - logging.warn("!!!!!!!!!!!!!") - gconf.transport.terminate_geterr() - elif isinstance(exc, OSError) and exc.errno in (ENOTCONN, ECONNABORTED): - logging.error('glusterfs session went down [%s]', errorcode[exc.errno]) - else: - logtag = "FAIL" - if not logtag and logging.getLogger().isEnabledFor(logging.DEBUG): - logtag = "FULL EXCEPTION TRACE" - if logtag: - logging.exception(logtag + ": ") - sys.stderr.write("failed with %s.\n" % type(exc).__name__) - excont.exval = 1 - sys.exit(excont.exval) - - -class FreeObject(object): - """wildcard class for which any attribute can be set""" - - def __init__(self, **kw): - for k,v in kw.items(): - setattr(self, k, v) - -class Thread(baseThread): - """thread class flavor for gsyncd - - - always a daemon thread - - force exit for whole program if thread - function coughs up an exception - """ - def __init__(self, *a, **kw): - tf = kw.get('target') - if tf: - def twrap(*aa): - excont = FreeObject(exval = 0) - try: - tf(*aa) - except: - try: - log_raise_exception(excont) - finally: - finalize(exval = excont.exval) - kw['target'] = twrap - baseThread.__init__(self, *a, **kw) - self.setDaemon(True) - -class GsyncdError(Exception): - pass - -def getusername(uid = None): - if uid == None: - uid = os.geteuid() - return pwd.getpwuid(uid).pw_name - -def privileged(): - return os.geteuid() == 0 - -def boolify(s): - """ - Generic string to boolean converter - - return - - Quick return if string 's' is of type bool - - True if it's in true_list - - False if it's in false_list - - Warn if it's not present in either and return False - """ - true_list = ['true', 'yes', '1', 'on'] - false_list = ['false', 'no', '0', 'off'] - - if isinstance(s, bool): - return s - - rv = False - lstr = s.lower() - if lstr in true_list: - rv = True - elif not lstr in false_list: - logging.warn("Unknown string (%s) in string to boolean conversion defaulting to False\n" % (s)) - - return rv - -def eintr_wrap(func, exc, *a): - """ - wrapper around syscalls resilient to interrupt caused - by signals - """ - while True: - try: - return func(*a) - except exc: - ex = sys.exc_info()[1] - if not ex.args[0] == EINTR: - raise - -def select(*a): - return eintr_wrap(oselect.select, oselect.error, *a) - -def waitpid (*a): - return eintr_wrap(owaitpid, OSError, *a) - -def set_term_handler(hook=lambda *a: finalize(*a, **{'exval': 1})): - signal(SIGTERM, hook) diff --git a/xlators/features/qemu-block/Makefile.am b/xlators/features/qemu-block/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/xlators/features/qemu-block/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/features/qemu-block/src/Makefile.am b/xlators/features/qemu-block/src/Makefile.am new file mode 100644 index 000000000..08a7b62a0 --- /dev/null +++ b/xlators/features/qemu-block/src/Makefile.am @@ -0,0 +1,155 @@ +if ENABLE_QEMU_BLOCK +xlator_LTLIBRARIES = qemu-block.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +qemu_block_la_LDFLAGS = -module -avoid-version +qemu_block_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GLIB_LIBS) -lz -lrt + +qemu_block_la_SOURCES_qemu = \ + $(CONTRIBDIR)/qemu/qemu-coroutine.c \ + $(CONTRIBDIR)/qemu/qemu-coroutine-lock.c \ + $(CONTRIBDIR)/qemu/qemu-coroutine-sleep.c \ + $(CONTRIBDIR)/qemu/coroutine-ucontext.c \ + $(CONTRIBDIR)/qemu/block.c \ + $(CONTRIBDIR)/qemu/nop-symbols.c + +qemu_block_la_SOURCES_qemu_util = \ + $(CONTRIBDIR)/qemu/util/aes.c \ + $(CONTRIBDIR)/qemu/util/bitmap.c \ + $(CONTRIBDIR)/qemu/util/bitops.c \ + $(CONTRIBDIR)/qemu/util/cutils.c \ + $(CONTRIBDIR)/qemu/util/error.c \ + $(CONTRIBDIR)/qemu/util/hbitmap.c \ + $(CONTRIBDIR)/qemu/util/iov.c \ + $(CONTRIBDIR)/qemu/util/module.c \ + $(CONTRIBDIR)/qemu/util/oslib-posix.c \ + $(CONTRIBDIR)/qemu/util/qemu-option.c \ + $(CONTRIBDIR)/qemu/util/qemu-error.c \ + $(CONTRIBDIR)/qemu/util/qemu-thread-posix.c \ + $(CONTRIBDIR)/qemu/util/unicode.c \ + $(CONTRIBDIR)/qemu/util/hexdump.c + +qemu_block_la_SOURCES_qemu_block = \ + $(CONTRIBDIR)/qemu/block/snapshot.c \ + $(CONTRIBDIR)/qemu/block/qcow2-cache.c \ + $(CONTRIBDIR)/qemu/block/qcow2-cluster.c \ + $(CONTRIBDIR)/qemu/block/qcow2-refcount.c \ + $(CONTRIBDIR)/qemu/block/qcow2-snapshot.c \ + $(CONTRIBDIR)/qemu/block/qcow2.c \ + $(CONTRIBDIR)/qemu/block/qed-check.c \ + $(CONTRIBDIR)/qemu/block/qed-cluster.c \ + $(CONTRIBDIR)/qemu/block/qed-gencb.c \ + $(CONTRIBDIR)/qemu/block/qed-l2-cache.c \ + $(CONTRIBDIR)/qemu/block/qed-table.c \ + $(CONTRIBDIR)/qemu/block/qed.c + +qemu_block_la_SOURCES_qemu_qobject = \ + $(CONTRIBDIR)/qemu/qobject/json-lexer.c \ + $(CONTRIBDIR)/qemu/qobject/json-parser.c \ + $(CONTRIBDIR)/qemu/qobject/json-streamer.c \ + $(CONTRIBDIR)/qemu/qobject/qbool.c \ + $(CONTRIBDIR)/qemu/qobject/qdict.c \ + $(CONTRIBDIR)/qemu/qobject/qerror.c \ + $(CONTRIBDIR)/qemu/qobject/qfloat.c \ + $(CONTRIBDIR)/qemu/qobject/qint.c \ + $(CONTRIBDIR)/qemu/qobject/qjson.c \ + $(CONTRIBDIR)/qemu/qobject/qlist.c \ + $(CONTRIBDIR)/qemu/qobject/qstring.c + +qemu_block_la_SOURCES = \ + $(qemu_block_la_SOURCES_qemu) \ + $(qemu_block_la_SOURCES_qemu_util) \ + $(qemu_block_la_SOURCES_qemu_block) \ + $(qemu_block_la_SOURCES_qemu_qobject) \ + bdrv-xlator.c \ + coroutine-synctask.c \ + bh-syncop.c \ + monitor-logging.c \ + clock-timer.c \ + qemu-block.c \ + qb-coroutines.c + +noinst_HEADERS_qemu = \ + $(CONTRIBDIR)/qemu/config-host.h \ + $(CONTRIBDIR)/qemu/qapi-types.h \ + $(CONTRIBDIR)/qemu/qmp-commands.h \ + $(CONTRIBDIR)/qemu/trace/generated-tracers.h \ + $(CONTRIBDIR)/qemu/include/config.h \ + $(CONTRIBDIR)/qemu/include/glib-compat.h \ + $(CONTRIBDIR)/qemu/include/qemu-common.h \ + $(CONTRIBDIR)/qemu/include/trace.h \ + $(CONTRIBDIR)/qemu/include/block/coroutine.h \ + $(CONTRIBDIR)/qemu/include/block/aio.h \ + $(CONTRIBDIR)/qemu/include/block/block.h \ + $(CONTRIBDIR)/qemu/include/block/block_int.h \ + $(CONTRIBDIR)/qemu/include/block/blockjob.h \ + $(CONTRIBDIR)/qemu/include/block/coroutine.h \ + $(CONTRIBDIR)/qemu/include/block/coroutine_int.h \ + $(CONTRIBDIR)/qemu/include/block/snapshot.h \ + $(CONTRIBDIR)/qemu/include/exec/cpu-common.h \ + $(CONTRIBDIR)/qemu/include/exec/hwaddr.h \ + $(CONTRIBDIR)/qemu/include/exec/poison.h \ + $(CONTRIBDIR)/qemu/include/fpu/softfloat.h \ + $(CONTRIBDIR)/qemu/include/migration/migration.h \ + $(CONTRIBDIR)/qemu/include/migration/qemu-file.h \ + $(CONTRIBDIR)/qemu/include/migration/vmstate.h \ + $(CONTRIBDIR)/qemu/include/monitor/monitor.h \ + $(CONTRIBDIR)/qemu/include/monitor/readline.h \ + $(CONTRIBDIR)/qemu/include/qapi/error.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/json-lexer.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/json-parser.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/json-streamer.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qbool.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qdict.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qerror.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qfloat.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qint.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qjson.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qlist.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qobject.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/qstring.h \ + $(CONTRIBDIR)/qemu/include/qapi/qmp/types.h \ + $(CONTRIBDIR)/qemu/include/qemu/aes.h \ + $(CONTRIBDIR)/qemu/include/qemu/atomic.h \ + $(CONTRIBDIR)/qemu/include/qemu/bitmap.h \ + $(CONTRIBDIR)/qemu/include/qemu/bitops.h \ + $(CONTRIBDIR)/qemu/include/qemu/bswap.h \ + $(CONTRIBDIR)/qemu/include/qemu/compiler.h \ + $(CONTRIBDIR)/qemu/include/qemu/error-report.h \ + $(CONTRIBDIR)/qemu/include/qemu/event_notifier.h \ + $(CONTRIBDIR)/qemu/include/qemu/hbitmap.h \ + $(CONTRIBDIR)/qemu/include/qemu/host-utils.h \ + $(CONTRIBDIR)/qemu/include/qemu/iov.h \ + $(CONTRIBDIR)/qemu/include/qemu/main-loop.h \ + $(CONTRIBDIR)/qemu/include/qemu/module.h \ + $(CONTRIBDIR)/qemu/include/qemu/notify.h \ + $(CONTRIBDIR)/qemu/include/qemu/option.h \ + $(CONTRIBDIR)/qemu/include/qemu/option_int.h \ + $(CONTRIBDIR)/qemu/include/qemu/osdep.h \ + $(CONTRIBDIR)/qemu/include/qemu/queue.h \ + $(CONTRIBDIR)/qemu/include/qemu/sockets.h \ + $(CONTRIBDIR)/qemu/include/qemu/thread-posix.h \ + $(CONTRIBDIR)/qemu/include/qemu/thread.h \ + $(CONTRIBDIR)/qemu/include/qemu/timer.h \ + $(CONTRIBDIR)/qemu/include/qemu/typedefs.h \ + $(CONTRIBDIR)/qemu/include/sysemu/sysemu.h \ + $(CONTRIBDIR)/qemu/include/sysemu/os-posix.h \ + $(CONTRIBDIR)/qemu/block/qcow2.h \ + $(CONTRIBDIR)/qemu/block/qed.h + +noinst_HEADERS = \ + $(noinst_HEADERS_qemu) \ + qemu-block.h \ + qemu-block-memory-types.h \ + qb-coroutines.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(CONTRIBDIR)/qemu \ + -I$(CONTRIBDIR)/qemu/include \ + -DGLUSTER_XLATOR + +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) $(GLIB_CFLAGS) + +CLEANFILES = + +endif diff --git a/xlators/features/qemu-block/src/bdrv-xlator.c b/xlators/features/qemu-block/src/bdrv-xlator.c new file mode 100644 index 000000000..106c59775 --- /dev/null +++ b/xlators/features/qemu-block/src/bdrv-xlator.c @@ -0,0 +1,397 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "inode.h" +#include "syncop.h" +#include "qemu-block.h" +#include "block/block_int.h" + +typedef struct BDRVGlusterState { + inode_t *inode; +} BDRVGlusterState; + +static QemuOptsList runtime_opts = { + .name = "gluster", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "GFID of file", + }, + { /* end of list */ } + }, +}; + +inode_t * +qb_inode_from_filename (const char *filename) +{ + const char *iptr = NULL; + inode_t *inode = NULL; + + iptr = filename + 17; + sscanf (iptr, "%p", &inode); + + return inode; +} + + +int +qb_inode_to_filename (inode_t *inode, char *filename, int size) +{ + return snprintf (filename, size, "gluster://inodep:%p", inode); +} + + +static fd_t * +fd_from_bs (BlockDriverState *bs) +{ + BDRVGlusterState *s = bs->opaque; + + return fd_anonymous (s->inode); +} + + +static int +qemu_gluster_open (BlockDriverState *bs, QDict *options, int bdrv_flags) +{ + inode_t *inode = NULL; + BDRVGlusterState *s = bs->opaque; + QemuOpts *opts = NULL; + Error *local_err = NULL; + const char *filename = NULL; + char gfid_str[128]; + int ret; + qb_conf_t *conf = THIS->private; + + opts = qemu_opts_create_nofail(&runtime_opts); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (error_is_set(&local_err)) { + qerror_report_err(local_err); + error_free(local_err); + return -EINVAL; + } + + filename = qemu_opt_get(opts, "filename"); + + /* + * gfid:<gfid> format means we're opening a backing image. + */ + ret = sscanf(filename, "gluster://gfid:%s", gfid_str); + if (ret) { + loc_t loc = {0,}; + struct iatt buf = {0,}; + uuid_t gfid; + + uuid_parse(gfid_str, gfid); + + loc.inode = inode_find(conf->root_inode->table, gfid); + if (!loc.inode) { + loc.inode = inode_new(conf->root_inode->table); + uuid_copy(loc.inode->gfid, gfid); + } + + uuid_copy(loc.gfid, loc.inode->gfid); + ret = syncop_lookup(FIRST_CHILD(THIS), &loc, NULL, &buf, NULL, + NULL); + if (ret) { + loc_wipe(&loc); + return -errno; + } + + s->inode = inode_ref(loc.inode); + loc_wipe(&loc); + } else { + inode = qb_inode_from_filename (filename); + if (!inode) + return -EINVAL; + + s->inode = inode_ref(inode); + } + + return 0; +} + + +static int +qemu_gluster_create (const char *filename, QEMUOptionParameter *options) +{ + uint64_t total_size = 0; + inode_t *inode = NULL; + fd_t *fd = NULL; + struct iatt stat = {0, }; + int ret = 0; + + inode = qb_inode_from_filename (filename); + if (!inode) + return -EINVAL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + total_size = options->value.n / BDRV_SECTOR_SIZE; + } + options++; + } + + fd = fd_anonymous (inode); + if (!fd) + return -ENOMEM; + + ret = syncop_fstat (FIRST_CHILD(THIS), fd, &stat); + if (ret) { + fd_unref (fd); + return -errno; + } + + if (stat.ia_size) { + /* format ONLY if the filesize is 0 bytes */ + fd_unref (fd); + return -EFBIG; + } + + if (total_size) { + ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, total_size); + if (ret) { + fd_unref (fd); + return -errno; + } + } + + fd_unref (fd); + return 0; +} + + +static int +qemu_gluster_co_readv (BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + fd_t *fd = NULL; + off_t offset = 0; + size_t size = 0; + struct iovec *iov = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + + fd = fd_from_bs (bs); + if (!fd) + return -EIO; + + offset = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + + ret = syncop_readv (FIRST_CHILD(THIS), fd, size, offset, 0, + &iov, &count, &iobref); + if (ret < 0) { + ret = -errno; + goto out; + } + + iov_copy (qiov->iov, qiov->niov, iov, count); /* *choke!* */ + +out: + GF_FREE (iov); + if (iobref) + iobref_unref (iobref); + fd_unref (fd); + return ret; +} + + +static int +qemu_gluster_co_writev (BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + fd_t *fd = NULL; + off_t offset = 0; + size_t size = 0; + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + struct iovec iov = {0, }; + int ret = -ENOMEM; + + fd = fd_from_bs (bs); + if (!fd) + return -EIO; + + offset = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + + iobuf = iobuf_get2 (THIS->ctx->iobuf_pool, size); + if (!iobuf) + goto out; + + iobref = iobref_new (); + if (!iobref) { + iobuf_unref (iobuf); + goto out; + } + + iobref_add (iobref, iobuf); + + iov_unload (iobuf_ptr (iobuf), qiov->iov, qiov->niov); /* *choke!* */ + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = size; + + ret = syncop_writev (FIRST_CHILD(THIS), fd, &iov, 1, offset, iobref, 0); + if (ret < 0) + ret = -errno; + +out: + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + fd_unref (fd); + return ret; +} + + +static int +qemu_gluster_co_flush (BlockDriverState *bs) +{ + fd_t *fd = NULL; + int ret = 0; + + fd = fd_from_bs (bs); + + ret = syncop_flush (FIRST_CHILD(THIS), fd); + + fd_unref (fd); + + return ret; +} + + +static int +qemu_gluster_co_fsync (BlockDriverState *bs) +{ + fd_t *fd = NULL; + int ret = 0; + + fd = fd_from_bs (bs); + + ret = syncop_fsync (FIRST_CHILD(THIS), fd, 0); + + fd_unref (fd); + + return ret; +} + + +static int +qemu_gluster_truncate (BlockDriverState *bs, int64_t offset) +{ + fd_t *fd = NULL; + int ret = 0; + + fd = fd_from_bs (bs); + + ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, offset); + + fd_unref (fd); + + if (ret < 0) + return ret; + + return ret; +} + + +static int64_t +qemu_gluster_getlength (BlockDriverState *bs) +{ + fd_t *fd = NULL; + int ret = 0; + struct iatt iatt = {0, }; + + fd = fd_from_bs (bs); + + ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt); + if (ret < 0) + return -1; + + return iatt.ia_size; +} + + +static int64_t +qemu_gluster_allocated_file_size (BlockDriverState *bs) +{ + fd_t *fd = NULL; + int ret = 0; + struct iatt iatt = {0, }; + + fd = fd_from_bs (bs); + + ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt); + if (ret < 0) + return -1; + + return iatt.ia_blocks * 512; +} + + +static void +qemu_gluster_close (BlockDriverState *bs) +{ + BDRVGlusterState *s = NULL; + + s = bs->opaque; + + inode_unref (s->inode); + + return; +} + + +static QEMUOptionParameter qemu_gluster_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; + + +static BlockDriver bdrv_gluster = { + .format_name = "gluster", + .protocol_name = "gluster", + .instance_size = sizeof(BDRVGlusterState), + .bdrv_file_open = qemu_gluster_open, + .bdrv_close = qemu_gluster_close, + .bdrv_create = qemu_gluster_create, + .bdrv_getlength = qemu_gluster_getlength, + .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_os = qemu_gluster_co_flush, + .bdrv_co_flush_to_disk = qemu_gluster_co_fsync, + .bdrv_truncate = qemu_gluster_truncate, + .create_options = qemu_gluster_create_options, +}; + + +static void bdrv_gluster_init(void) +{ + bdrv_register(&bdrv_gluster); +} + + +block_init(bdrv_gluster_init); diff --git a/xlators/features/qemu-block/src/bh-syncop.c b/xlators/features/qemu-block/src/bh-syncop.c new file mode 100644 index 000000000..e8686f6d4 --- /dev/null +++ b/xlators/features/qemu-block/src/bh-syncop.c @@ -0,0 +1,48 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "syncop.h" +#include "qemu-block-memory-types.h" + +#include "block/aio.h" + +void +qemu_bh_schedule (QEMUBH *bh) +{ + return; +} + +void +qemu_bh_cancel (QEMUBH *bh) +{ + return; +} + +void +qemu_bh_delete (QEMUBH *bh) +{ + +} + +QEMUBH * +qemu_bh_new (QEMUBHFunc *cb, void *opaque) +{ + return NULL; +} diff --git a/xlators/features/qemu-block/src/clock-timer.c b/xlators/features/qemu-block/src/clock-timer.c new file mode 100644 index 000000000..fcbec6ad1 --- /dev/null +++ b/xlators/features/qemu-block/src/clock-timer.c @@ -0,0 +1,60 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "syncop.h" +#include "qemu-block-memory-types.h" + +#include "qemu/timer.h" + +QEMUClock *vm_clock; +int use_rt_clock = 0; + +QEMUTimer *qemu_new_timer (QEMUClock *clock, int scale, + QEMUTimerCB *cb, void *opaque) +{ + return NULL; +} + +int64_t qemu_get_clock_ns (QEMUClock *clock) +{ + return 0; +} + +void qemu_mod_timer (QEMUTimer *ts, int64_t expire_time) +{ + return; +} + +void qemu_free_timer (QEMUTimer *ts) +{ + +} + +void qemu_del_timer (QEMUTimer *ts) +{ + +} + +bool qemu_aio_wait() +{ + synctask_wake (synctask_get()); + synctask_yield (synctask_get()); + return 0; +} diff --git a/xlators/features/qemu-block/src/coroutine-synctask.c b/xlators/features/qemu-block/src/coroutine-synctask.c new file mode 100644 index 000000000..e43988a95 --- /dev/null +++ b/xlators/features/qemu-block/src/coroutine-synctask.c @@ -0,0 +1,116 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "syncop.h" +#include "qemu-block-memory-types.h" + +#include "qemu-block.h" + +/* + * This code serves as the bridge from the main glusterfs context to the qemu + * coroutine context via synctask. We create a single threaded syncenv with a + * single synctask responsible for processing a queue of coroutines. The qemu + * code invoked from within the synctask function handlers uses the ucontext + * coroutine implementation and scheduling logic internal to qemu. This + * effectively donates a thread of execution to qemu and its internal coroutine + * management. + * + * NOTE: The existence of concurrent synctasks has proven quite racy with regard + * to qemu coroutine management, particularly related to the lifecycle + * differences with top-level synctasks and internally created coroutines and + * interactions with qemu-internal queues (and locks, in turn). We explicitly + * disallow this scenario, via the queue, until it is more well supported. + */ + +static struct { + struct list_head queue; + gf_lock_t lock; + struct synctask *task; +} qb_co; + +static void +init_qbco() +{ + INIT_LIST_HEAD(&qb_co.queue); + LOCK_INIT(&qb_co.lock); +} + +static int +synctask_nop_cbk (int ret, call_frame_t *frame, void *opaque) +{ + return 0; +} + +static int +qb_synctask_wrap (void *opaque) +{ + qb_local_t *qb_local, *tmp; + + LOCK(&qb_co.lock); + + while (!list_empty(&qb_co.queue)) { + list_for_each_entry_safe(qb_local, tmp, &qb_co.queue, list) { + list_del_init(&qb_local->list); + break; + } + + UNLOCK(&qb_co.lock); + + qb_local->synctask_fn(qb_local); + /* qb_local is now unwound and gone! */ + + LOCK(&qb_co.lock); + } + + qb_co.task = NULL; + + UNLOCK(&qb_co.lock); + + return 0; +} + +int +qb_coroutine (call_frame_t *frame, synctask_fn_t fn) +{ + qb_local_t *qb_local = NULL; + qb_conf_t *qb_conf = NULL; + static int init = 0; + + qb_local = frame->local; + qb_local->synctask_fn = fn; + qb_conf = frame->this->private; + + if (!init) { + init = 1; + init_qbco(); + } + + LOCK(&qb_co.lock); + + if (!qb_co.task) + qb_co.task = synctask_create(qb_conf->env, qb_synctask_wrap, + synctask_nop_cbk, frame, NULL); + + list_add_tail(&qb_local->list, &qb_co.queue); + + UNLOCK(&qb_co.lock); + + return 0; +} diff --git a/xlators/features/qemu-block/src/monitor-logging.c b/xlators/features/qemu-block/src/monitor-logging.c new file mode 100644 index 000000000..d37c37f0f --- /dev/null +++ b/xlators/features/qemu-block/src/monitor-logging.c @@ -0,0 +1,50 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "qemu-block-memory-types.h" + +#include "block/block_int.h" + +Monitor *cur_mon; + +int +monitor_cur_is_qmp() +{ + /* No QMP support here */ + return 0; +} + +void +monitor_set_error (Monitor *mon, QError *qerror) +{ + /* NOP here */ + return; +} + + +void +monitor_vprintf(Monitor *mon, const char *fmt, va_list ap) +{ + char buf[4096]; + + vsnprintf(buf, sizeof(buf), fmt, ap); + + gf_log (THIS->name, GF_LOG_ERROR, "%s", buf); +} diff --git a/xlators/features/qemu-block/src/qb-coroutines.c b/xlators/features/qemu-block/src/qb-coroutines.c new file mode 100644 index 000000000..7c52adb21 --- /dev/null +++ b/xlators/features/qemu-block/src/qb-coroutines.c @@ -0,0 +1,662 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "inode.h" +#include "call-stub.h" +#include "defaults.h" +#include "qemu-block-memory-types.h" +#include "qemu-block.h" +#include "qb-coroutines.h" + + +int +qb_format_and_resume (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + char filename[64]; + char base_filename[128]; + int use_base = 0; + qb_inode_t *qb_inode = NULL; + Error *local_err = NULL; + fd_t *fd = NULL; + dict_t *xattr = NULL; + qb_conf_t *qb_conf = NULL; + int ret = -1; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + qb_conf = frame->this->private; + + qb_inode_to_filename (inode, filename, 64); + + qb_inode = qb_inode_ctx_get (frame->this, inode); + + /* + * See if the caller specified a backing image. + */ + if (!uuid_is_null(qb_inode->backing_gfid) || qb_inode->backing_fname) { + loc_t loc = {0,}; + char gfid_str[64]; + struct iatt buf; + + if (!uuid_is_null(qb_inode->backing_gfid)) { + loc.inode = inode_find(qb_conf->root_inode->table, + qb_inode->backing_gfid); + if (!loc.inode) { + loc.inode = inode_new(qb_conf->root_inode->table); + uuid_copy(loc.inode->gfid, + qb_inode->backing_gfid); + } + uuid_copy(loc.gfid, loc.inode->gfid); + } else if (qb_inode->backing_fname) { + loc.inode = inode_new(qb_conf->root_inode->table); + loc.name = qb_inode->backing_fname; + loc.parent = inode_parent(inode, NULL, NULL); + loc_path(&loc, loc.name); + } + + /* + * Lookup the backing image. Verify existence and/or get the + * gfid if we don't already have it. + */ + ret = syncop_lookup(FIRST_CHILD(frame->this), &loc, NULL, &buf, + NULL, NULL); + GF_FREE(qb_inode->backing_fname); + if (ret) { + loc_wipe(&loc); + ret = errno; + goto err; + } + + uuid_copy(qb_inode->backing_gfid, buf.ia_gfid); + loc_wipe(&loc); + + /* + * We pass the filename of the backing image into the qemu block + * subsystem as the associated gfid. This is embedded into the + * clone image and passed along to the gluster bdrv backend when + * the block subsystem needs to operate on the backing image on + * behalf of the clone. + */ + uuid_unparse(qb_inode->backing_gfid, gfid_str); + snprintf(base_filename, sizeof(base_filename), + "gluster://gfid:%s", gfid_str); + use_base = 1; + } + + bdrv_img_create (filename, qb_inode->fmt, + use_base ? base_filename : NULL, 0, 0, qb_inode->size, + 0, &local_err, true); + + if (error_is_set (&local_err)) { + gf_log (frame->this->name, GF_LOG_ERROR, "%s", + error_get_pretty (local_err)); + error_free (local_err); + QB_STUB_UNWIND (stub, -1, EIO); + return 0; + } + + fd = fd_anonymous (inode); + if (!fd) { + gf_log (frame->this->name, GF_LOG_ERROR, + "could not create anonymous fd for %s", + uuid_utoa (inode->gfid)); + QB_STUB_UNWIND (stub, -1, ENOMEM); + return 0; + } + + xattr = dict_new (); + if (!xattr) { + gf_log (frame->this->name, GF_LOG_ERROR, + "could not allocate xattr dict for %s", + uuid_utoa (inode->gfid)); + QB_STUB_UNWIND (stub, -1, ENOMEM); + fd_unref (fd); + return 0; + } + + ret = dict_set_str (xattr, qb_conf->qb_xattr_key, local->fmt); + if (ret) { + gf_log (frame->this->name, GF_LOG_ERROR, + "could not dict_set for %s", + uuid_utoa (inode->gfid)); + QB_STUB_UNWIND (stub, -1, ENOMEM); + fd_unref (fd); + dict_unref (xattr); + return 0; + } + + ret = syncop_fsetxattr (FIRST_CHILD(THIS), fd, xattr, 0); + if (ret) { + ret = errno; + gf_log (frame->this->name, GF_LOG_ERROR, + "failed to setxattr for %s", + uuid_utoa (inode->gfid)); + QB_STUB_UNWIND (stub, -1, ret); + fd_unref (fd); + dict_unref (xattr); + return 0; + } + + fd_unref (fd); + dict_unref (xattr); + + QB_STUB_UNWIND (stub, 0, 0); + + return 0; + +err: + QB_STUB_UNWIND(stub, -1, ret); + return 0; +} + + +static BlockDriverState * +qb_bs_create (inode_t *inode, const char *fmt) +{ + char filename[64]; + BlockDriverState *bs = NULL; + BlockDriver *drv = NULL; + int op_errno = 0; + int ret = 0; + + bs = bdrv_new (uuid_utoa (inode->gfid)); + if (!bs) { + op_errno = ENOMEM; + gf_log (THIS->name, GF_LOG_ERROR, + "could not allocate @bdrv for gfid:%s", + uuid_utoa (inode->gfid)); + goto err; + } + + drv = bdrv_find_format (fmt); + if (!drv) { + op_errno = EINVAL; + gf_log (THIS->name, GF_LOG_ERROR, + "Unknown file format: %s for gfid:%s", + fmt, uuid_utoa (inode->gfid)); + goto err; + } + + qb_inode_to_filename (inode, filename, 64); + + ret = bdrv_open (bs, filename, NULL, BDRV_O_RDWR, drv); + if (ret < 0) { + op_errno = -ret; + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to bdrv_open() gfid:%s (%s)", + uuid_utoa (inode->gfid), strerror (op_errno)); + goto err; + } + + return bs; +err: + errno = op_errno; + return NULL; +} + + +int +qb_co_open (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + qb_inode->refcnt++; + + QB_STUB_RESUME (stub); + + return 0; +} + + +int +qb_co_writev (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + QEMUIOVector qiov = {0, }; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + qemu_iovec_init_external (&qiov, stub->args.vector, stub->args.count); + + ret = bdrv_pwritev (qb_inode->bs, stub->args.offset, &qiov); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +int +qb_co_readv (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + struct iovec iov = {0, }; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + if (stub->args.offset >= qb_inode->size) { + QB_STUB_UNWIND (stub, 0, 0); + return 0; + } + + iobuf = iobuf_get2 (frame->this->ctx->iobuf_pool, stub->args.size); + if (!iobuf) { + QB_STUB_UNWIND (stub, -1, ENOMEM); + return 0; + } + + iobref = iobref_new (); + if (!iobref) { + QB_STUB_UNWIND (stub, -1, ENOMEM); + iobuf_unref (iobuf); + return 0; + } + + if (iobref_add (iobref, iobuf) < 0) { + iobuf_unref (iobuf); + iobref_unref (iobref); + QB_STUB_UNWIND (stub, -1, ENOMEM); + return 0; + } + + ret = bdrv_pread (qb_inode->bs, stub->args.offset, iobuf_ptr (iobuf), + stub->args.size); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + iobref_unref (iobref); + return 0; + } + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = ret; + + stub->args_cbk.vector = iov_dup (&iov, 1); + stub->args_cbk.count = 1; + stub->args_cbk.iobref = iobref; + + QB_STUB_UNWIND (stub, ret, 0); + + return 0; +} + + +int +qb_co_fsync (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + ret = bdrv_flush (qb_inode->bs); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +static void +qb_update_size_xattr (xlator_t *this, fd_t *fd, const char *fmt, off_t offset) +{ + char val[QB_XATTR_VAL_MAX]; + qb_conf_t *qb_conf = NULL; + dict_t *xattr = NULL; + + qb_conf = this->private; + + snprintf (val, QB_XATTR_VAL_MAX, "%s:%llu", + fmt, (long long unsigned) offset); + + xattr = dict_new (); + if (!xattr) + return; + + if (dict_set_str (xattr, qb_conf->qb_xattr_key, val) != 0) { + dict_unref (xattr); + return; + } + + syncop_fsetxattr (FIRST_CHILD(this), fd, xattr, 0); + dict_unref (xattr); +} + + +int +qb_co_truncate (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + int ret = 0; + off_t offset = 0; + xlator_t *this = NULL; + + this = THIS; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + syncop_fstat (FIRST_CHILD(this), local->fd, &stub->args_cbk.prestat); + stub->args_cbk.prestat.ia_size = qb_inode->size; + + ret = bdrv_truncate (qb_inode->bs, stub->args.offset); + if (ret < 0) + goto out; + + offset = bdrv_getlength (qb_inode->bs); + + qb_inode->size = offset; + + syncop_fstat (FIRST_CHILD(this), local->fd, &stub->args_cbk.poststat); + stub->args_cbk.poststat.ia_size = qb_inode->size; + + qb_update_size_xattr (this, local->fd, qb_inode->fmt, qb_inode->size); + +out: + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +int +qb_co_close (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + BlockDriverState *bs = NULL; + + local = opaque; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (THIS, inode); + + if (!--qb_inode->refcnt) { + bs = qb_inode->bs; + qb_inode->bs = NULL; + bdrv_delete (bs); + } + + frame = local->frame; + frame->local = NULL; + qb_local_free (THIS, local); + STACK_DESTROY (frame->root); + + return 0; +} + + +int +qb_snapshot_create (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + QEMUSnapshotInfo sn; + struct timeval tv = {0, }; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + memset (&sn, 0, sizeof (sn)); + pstrcpy (sn.name, sizeof(sn.name), local->name); + gettimeofday (&tv, NULL); + sn.date_sec = tv.tv_sec; + sn.date_nsec = tv.tv_usec * 1000; + + ret = bdrv_snapshot_create (qb_inode->bs, &sn); + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +int +qb_snapshot_delete (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + ret = bdrv_snapshot_delete (qb_inode->bs, local->name); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} + + +int +qb_snapshot_goto (void *opaque) +{ + qb_local_t *local = NULL; + call_frame_t *frame = NULL; + call_stub_t *stub = NULL; + inode_t *inode = NULL; + qb_inode_t *qb_inode = NULL; + int ret = 0; + + local = opaque; + frame = local->frame; + stub = local->stub; + inode = local->inode; + + qb_inode = qb_inode_ctx_get (frame->this, inode); + if (!qb_inode->bs) { + /* FIXME: we need locks around this when + enabling multithreaded syncop/coroutine + for qemu-block + */ + + qb_inode->bs = qb_bs_create (inode, qb_inode->fmt); + if (!qb_inode->bs) { + QB_STUB_UNWIND (stub, -1, errno); + return 0; + } + } + + ret = bdrv_snapshot_goto (qb_inode->bs, local->name); + + if (ret < 0) { + QB_STUB_UNWIND (stub, -1, -ret); + } else { + QB_STUB_UNWIND (stub, ret, 0); + } + + return 0; +} diff --git a/xlators/features/qemu-block/src/qb-coroutines.h b/xlators/features/qemu-block/src/qb-coroutines.h new file mode 100644 index 000000000..583319f3b --- /dev/null +++ b/xlators/features/qemu-block/src/qb-coroutines.h @@ -0,0 +1,30 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __QB_COROUTINES_H +#define __QB_COROUTINES_H + +#include "syncop.h" +#include "call-stub.h" +#include "block/block_int.h" +#include "monitor/monitor.h" + +int qb_format_and_resume (void *opaque); +int qb_snapshot_create (void *opaque); +int qb_snapshot_delete (void *opaque); +int qb_snapshot_goto (void *opaque); +int qb_co_open (void *opaque); +int qb_co_close (void *opaque); +int qb_co_writev (void *opaque); +int qb_co_readv (void *opaque); +int qb_co_fsync (void *opaque); +int qb_co_truncate (void *opaque); + +#endif /* __QB_COROUTINES_H */ diff --git a/xlators/features/qemu-block/src/qemu-block-memory-types.h b/xlators/features/qemu-block/src/qemu-block-memory-types.h new file mode 100644 index 000000000..267b3893f --- /dev/null +++ b/xlators/features/qemu-block/src/qemu-block-memory-types.h @@ -0,0 +1,25 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __QB_MEM_TYPES_H__ +#define __QB_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_qb_mem_types_ { + gf_qb_mt_qb_conf_t = gf_common_mt_end + 1, + gf_qb_mt_qb_inode_t, + gf_qb_mt_qb_local_t, + gf_qb_mt_coroutinesynctask_t, + gf_qb_mt_end +}; +#endif + diff --git a/xlators/features/qemu-block/src/qemu-block.c b/xlators/features/qemu-block/src/qemu-block.c new file mode 100644 index 000000000..48bbf3140 --- /dev/null +++ b/xlators/features/qemu-block/src/qemu-block.c @@ -0,0 +1,1140 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "inode.h" +#include "call-stub.h" +#include "defaults.h" +#include "qemu-block-memory-types.h" +#include "qemu-block.h" +#include "qb-coroutines.h" + + +qb_inode_t * +__qb_inode_ctx_get (xlator_t *this, inode_t *inode) +{ + uint64_t value = 0; + qb_inode_t *qb_inode = NULL; + + __inode_ctx_get (inode, this, &value); + qb_inode = (qb_inode_t *)(unsigned long) value; + + return qb_inode; +} + + +qb_inode_t * +qb_inode_ctx_get (xlator_t *this, inode_t *inode) +{ + qb_inode_t *qb_inode = NULL; + + LOCK (&inode->lock); + { + qb_inode = __qb_inode_ctx_get (this, inode); + } + UNLOCK (&inode->lock); + + return qb_inode; +} + + +qb_inode_t * +qb_inode_ctx_del (xlator_t *this, inode_t *inode) +{ + uint64_t value = 0; + qb_inode_t *qb_inode = NULL; + + inode_ctx_del (inode, this, &value); + qb_inode = (qb_inode_t *)(unsigned long) value; + + return qb_inode; +} + + +int +qb_inode_cleanup (xlator_t *this, inode_t *inode, int warn) +{ + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_del (this, inode); + + if (!qb_inode) + return 0; + + if (warn) + gf_log (this->name, GF_LOG_WARNING, + "inode %s no longer block formatted", + uuid_utoa (inode->gfid)); + + /* free (qb_inode->bs); */ + + GF_FREE (qb_inode); + + return 0; +} + + +int +qb_iatt_fixup (xlator_t *this, inode_t *inode, struct iatt *iatt) +{ + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, inode); + if (!qb_inode) + return 0; + + iatt->ia_size = qb_inode->size; + + return 0; +} + + +int +qb_format_extract (xlator_t *this, char *format, inode_t *inode) +{ + char *s, *save; + uint64_t size = 0; + char fmt[QB_XATTR_VAL_MAX+1] = {0, }; + qb_inode_t *qb_inode = NULL; + char *formatstr = NULL; + uuid_t gfid = {0,}; + char gfid_str[64] = {0,}; + int ret; + + strncpy(fmt, format, QB_XATTR_VAL_MAX); + + s = strtok_r(fmt, ":", &save); + if (!s) + goto invalid; + formatstr = gf_strdup(s); + + s = strtok_r(NULL, ":", &save); + if (!s) + goto invalid; + if (gf_string2bytesize (s, &size)) + goto invalid; + if (!size) + goto invalid; + + s = strtok_r(NULL, "\0", &save); + if (s && !strncmp(s, "<gfid:", strlen("<gfid:"))) { + /* + * Check for valid gfid backing image specifier. + */ + if (strlen(s) + 1 > sizeof(gfid_str)) + goto invalid; + ret = sscanf(s, "<gfid:%[^>]s", gfid_str); + if (ret == 1) { + ret = uuid_parse(gfid_str, gfid); + if (ret < 0) + goto invalid; + } + } + + qb_inode = qb_inode_ctx_get (this, inode); + if (!qb_inode) + qb_inode = GF_CALLOC (1, sizeof (*qb_inode), + gf_qb_mt_qb_inode_t); + if (!qb_inode) { + GF_FREE(formatstr); + return ENOMEM; + } + + strncpy(qb_inode->fmt, formatstr, QB_XATTR_VAL_MAX); + qb_inode->size = size; + + /* + * If a backing gfid was not specified, interpret any remaining bytes + * associated with a backing image as a filename local to the parent + * directory. The format processing will validate further. + */ + if (!uuid_is_null(gfid)) + uuid_copy(qb_inode->backing_gfid, gfid); + else if (s) + qb_inode->backing_fname = gf_strdup(s); + + inode_ctx_set (inode, this, (void *)&qb_inode); + + GF_FREE(formatstr); + + return 0; + +invalid: + GF_FREE(formatstr); + + gf_log (this->name, GF_LOG_WARNING, + "invalid format '%s' in inode %s", format, + uuid_utoa (inode->gfid)); + return EINVAL; +} + + +void +qb_local_free (xlator_t *this, qb_local_t *local) +{ + if (local->inode) + inode_unref (local->inode); + if (local->fd) + fd_unref (local->fd); + GF_FREE (local); +} + + +int +qb_local_init (call_frame_t *frame) +{ + qb_local_t *qb_local = NULL; + + qb_local = GF_CALLOC (1, sizeof (*qb_local), gf_qb_mt_qb_local_t); + if (!qb_local) + return -1; + INIT_LIST_HEAD(&qb_local->list); + + qb_local->frame = frame; + frame->local = qb_local; + + return 0; +} + + +int +qb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) +{ + char *format = NULL; + qb_conf_t *conf = NULL; + + conf = this->private; + + if (op_ret == -1) + goto out; + + /* + * Cache the root inode for dealing with backing images. The format + * coroutine and the gluster qemu backend driver both use the root inode + * table to verify and/or redirect I/O to the backing image via + * anonymous fd's. + */ + if (!conf->root_inode && __is_root_gfid(inode->gfid)) + conf->root_inode = inode_ref(inode); + + if (!xdata) + goto out; + + if (dict_get_str (xdata, conf->qb_xattr_key, &format)) + goto out; + + if (!format) { + qb_inode_cleanup (this, inode, 1); + goto out; + } + + op_errno = qb_format_extract (this, format, inode); + if (op_errno) + op_ret = -1; + + qb_iatt_fixup (this, inode, buf); +out: + QB_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, + xdata, postparent); + return 0; +} + + +int +qb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + qb_conf_t *conf = NULL; + + conf = this->private; + + xdata = xdata ? dict_ref (xdata) : dict_new (); + + if (!xdata) + goto enomem; + + if (dict_set_int32 (xdata, conf->qb_xattr_key, 0)) + goto enomem; + + STACK_WIND (frame, qb_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + dict_unref (xdata); + return 0; +enomem: + QB_STACK_UNWIND (lookup, frame, -1, ENOMEM, 0, 0, 0, 0); + if (xdata) + dict_unref (xdata); + return 0; +} + + +int +qb_setxattr_format (call_frame_t *frame, xlator_t *this, call_stub_t *stub, + dict_t *xattr, inode_t *inode) +{ + char *format = NULL; + int op_errno = 0; + qb_local_t *qb_local = NULL; + data_t *data = NULL; + qb_inode_t *qb_inode; + + if (!(data = dict_get (xattr, "trusted.glusterfs.block-format"))) { + QB_STUB_RESUME (stub); + return 0; + } + + format = alloca (data->len + 1); + memcpy (format, data->data, data->len); + format[data->len] = 0; + + op_errno = qb_format_extract (this, format, inode); + if (op_errno) { + QB_STUB_UNWIND (stub, -1, op_errno); + return 0; + } + qb_inode = qb_inode_ctx_get(this, inode); + + qb_local = frame->local; + + qb_local->stub = stub; + qb_local->inode = inode_ref (inode); + + snprintf(qb_local->fmt, QB_XATTR_VAL_MAX, "%s:%lu", qb_inode->fmt, + qb_inode->size); + + qb_coroutine (frame, qb_format_and_resume); + + return 0; +} + + +int +qb_setxattr_snapshot_create (call_frame_t *frame, xlator_t *this, + call_stub_t *stub, dict_t *xattr, inode_t *inode) +{ + qb_local_t *qb_local = NULL; + char *name = NULL; + data_t *data = NULL; + + if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) { + QB_STUB_RESUME (stub); + return 0; + } + + name = alloca (data->len + 1); + memcpy (name, data->data, data->len); + name[data->len] = 0; + + qb_local = frame->local; + + qb_local->stub = stub; + qb_local->inode = inode_ref (inode); + strncpy (qb_local->name, name, 128); + + qb_coroutine (frame, qb_snapshot_create); + + return 0; +} + + +int +qb_setxattr_snapshot_delete (call_frame_t *frame, xlator_t *this, + call_stub_t *stub, dict_t *xattr, inode_t *inode) +{ + qb_local_t *qb_local = NULL; + char *name = NULL; + data_t *data = NULL; + + if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) { + QB_STUB_RESUME (stub); + return 0; + } + + name = alloca (data->len + 1); + memcpy (name, data->data, data->len); + name[data->len] = 0; + + qb_local = frame->local; + + qb_local->stub = stub; + qb_local->inode = inode_ref (inode); + strncpy (qb_local->name, name, 128); + + qb_coroutine (frame, qb_snapshot_delete); + + return 0; +} + +int +qb_setxattr_snapshot_goto (call_frame_t *frame, xlator_t *this, + call_stub_t *stub, dict_t *xattr, inode_t *inode) +{ + qb_local_t *qb_local = NULL; + char *name = NULL; + data_t *data = NULL; + + if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) { + QB_STUB_RESUME (stub); + return 0; + } + + name = alloca (data->len + 1); + memcpy (name, data->data, data->len); + name[data->len] = 0; + + qb_local = frame->local; + + qb_local->stub = stub; + qb_local->inode = inode_ref (inode); + strncpy (qb_local->name, name, 128); + + qb_coroutine (frame, qb_snapshot_goto); + + return 0; +} + + +int +qb_setxattr_common (call_frame_t *frame, xlator_t *this, call_stub_t *stub, + dict_t *xattr, inode_t *inode) +{ + data_t *data = NULL; + + if ((data = dict_get (xattr, "trusted.glusterfs.block-format"))) { + qb_setxattr_format (frame, this, stub, xattr, inode); + return 0; + } + + if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) { + qb_setxattr_snapshot_create (frame, this, stub, xattr, inode); + return 0; + } + + if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) { + qb_setxattr_snapshot_delete (frame, this, stub, xattr, inode); + return 0; + } + + if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) { + qb_setxattr_snapshot_goto (frame, this, stub, xattr, inode); + return 0; + } + + QB_STUB_RESUME (stub); + + return 0; +} + + +int +qb_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, + int flags, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + if (qb_local_init (frame) != 0) + goto enomem; + + stub = fop_setxattr_stub (frame, default_setxattr_resume, loc, xattr, + flags, xdata); + if (!stub) + goto enomem; + + qb_setxattr_common (frame, this, stub, xattr, loc->inode); + + return 0; +enomem: + QB_STACK_UNWIND (setxattr, frame, -1, ENOMEM, 0); + return 0; +} + + +int +qb_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) +{ + call_stub_t *stub = NULL; + + if (qb_local_init (frame) != 0) + goto enomem; + + stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr, + flags, xdata); + if (!stub) + goto enomem; + + qb_setxattr_common (frame, this, stub, xattr, fd->inode); + + return 0; +enomem: + QB_STACK_UNWIND (fsetxattr, frame, -1, ENOMEM, 0); + return 0; +} + + +int +qb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + call_stub_t *stub = NULL; + qb_local_t *qb_local = NULL; + + qb_local = frame->local; + + if (op_ret < 0) + goto unwind; + + if (!qb_inode_ctx_get (this, qb_local->inode)) + goto unwind; + + stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); + if (!stub) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + qb_local->stub = stub; + + qb_coroutine (frame, qb_co_open); + + return 0; +unwind: + QB_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata); + return 0; +} + + +int +qb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, loc->inode); + if (!qb_inode) { + STACK_WIND (frame, default_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, + xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (loc->inode); + qb_local->fd = fd_ref (fd); + + STACK_WIND (frame, qb_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; +enomem: + QB_STACK_UNWIND (open, frame, -1, ENOMEM, 0, 0); + return 0; +} + + +int +qb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_writev_stub (frame, NULL, fd, vector, count, + offset, flags, iobref, xdata); + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_writev); + + return 0; +enomem: + QB_STACK_UNWIND (writev, frame, -1, ENOMEM, 0, 0, 0); + return 0; +} + + +int +qb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, fd, size, offset, + flags, xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_readv_stub (frame, NULL, fd, size, offset, + flags, xdata); + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_readv); + + return 0; +enomem: + QB_STACK_UNWIND (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0); + return 0; +} + + +int +qb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int dsync, + dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, fd, dsync, xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_fsync_stub (frame, NULL, fd, dsync, xdata); + + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_fsync); + + return 0; +enomem: + QB_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; +} + + +int +qb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, fd, xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_flush_stub (frame, NULL, fd, xdata); + + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_fsync); + + return 0; +enomem: + QB_STACK_UNWIND (flush, frame, -1, ENOMEM, 0); + return 0; +} + +static int32_t +qb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + qb_conf_t *conf = this->private; + gf_dirent_t *entry; + char *format; + + list_for_each_entry(entry, &entries->list, list) { + if (!entry->inode || !entry->dict) + continue; + + format = NULL; + if (dict_get_str(entry->dict, conf->qb_xattr_key, &format)) + continue; + + if (!format) { + qb_inode_cleanup(this, entry->inode, 1); + continue; + } + + if (qb_format_extract(this, format, entry->inode)) + continue; + + qb_iatt_fixup(this, entry->inode, &entry->d_stat); + } + + STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +static int32_t +qb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + qb_conf_t *conf = this->private; + + xdata = xdata ? dict_ref(xdata) : dict_new(); + if (!xdata) + goto enomem; + + if (dict_set_int32 (xdata, conf->qb_xattr_key, 0)) + goto enomem; + + STACK_WIND(frame, qb_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + + dict_unref(xdata); + return 0; + +enomem: + QB_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL); + if (xdata) + dict_unref(xdata); + return 0; +} + +int +qb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, loc->inode); + if (!qb_inode) { + STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, + xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (loc->inode); + qb_local->fd = fd_anonymous (loc->inode); + + qb_local->stub = fop_truncate_stub (frame, NULL, loc, offset, xdata); + + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_truncate); + + return 0; +enomem: + QB_STACK_UNWIND (truncate, frame, -1, ENOMEM, 0, 0, 0); + return 0; +} + + +int +qb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + qb_local_t *qb_local = NULL; + qb_inode_t *qb_inode = NULL; + + qb_inode = qb_inode_ctx_get (this, fd->inode); + if (!qb_inode) { + STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, + xdata); + return 0; + } + + if (qb_local_init (frame) != 0) + goto enomem; + + qb_local = frame->local; + + qb_local->inode = inode_ref (fd->inode); + qb_local->fd = fd_ref (fd); + + qb_local->stub = fop_ftruncate_stub (frame, NULL, fd, offset, xdata); + + if (!qb_local->stub) + goto enomem; + + qb_coroutine (frame, qb_co_truncate); + + return 0; +enomem: + QB_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, 0, 0, 0); + return 0; +} + + +int +qb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata) +{ + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (inode) { + qb_iatt_fixup (this, inode, iatt); + inode_unref (inode); + } + + QB_STACK_UNWIND (stat, frame, op_ret, op_errno, iatt, xdata); + + return 0; +} + +int +qb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + if (qb_inode_ctx_get (this, loc->inode)) + frame->local = inode_ref (loc->inode); + + STACK_WIND (frame, qb_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +} + + +int +qb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata) +{ + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (inode) { + qb_iatt_fixup (this, inode, iatt); + inode_unref (inode); + } + + QB_STACK_UNWIND (fstat, frame, op_ret, op_errno, iatt, xdata); + + return 0; +} + + +int +qb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + if (qb_inode_ctx_get (this, fd->inode)) + frame->local = inode_ref (fd->inode); + + STACK_WIND (frame, qb_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + return 0; +} + + +int +qb_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, struct iatt *post, + dict_t *xdata) +{ + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (inode) { + qb_iatt_fixup (this, inode, pre); + qb_iatt_fixup (this, inode, post); + inode_unref (inode); + } + + QB_STACK_UNWIND (setattr, frame, op_ret, op_errno, pre, post, xdata); + + return 0; +} + + +int +qb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int valid, dict_t *xdata) +{ + if (qb_inode_ctx_get (this, loc->inode)) + frame->local = inode_ref (loc->inode); + + STACK_WIND (frame, qb_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, buf, valid, xdata); + return 0; +} + + +int +qb_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, struct iatt *post, + dict_t *xdata) +{ + inode_t *inode = NULL; + + inode = frame->local; + frame->local = NULL; + + if (inode) { + qb_iatt_fixup (this, inode, pre); + qb_iatt_fixup (this, inode, post); + inode_unref (inode); + } + + QB_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, pre, post, xdata); + + return 0; +} + + +int +qb_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, + int valid, dict_t *xdata) +{ + if (qb_inode_ctx_get (this, fd->inode)) + frame->local = inode_ref (fd->inode); + + STACK_WIND (frame, qb_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetattr, fd, buf, valid, xdata); + return 0; +} + + +int +qb_forget (xlator_t *this, inode_t *inode) +{ + return qb_inode_cleanup (this, inode, 0); +} + + +int +qb_release (xlator_t *this, fd_t *fd) +{ + call_frame_t *frame = NULL; + + frame = create_frame (this, this->ctx->pool); + if (!frame) { + gf_log (this->name, GF_LOG_ERROR, + "Could not allocate frame. " + "Leaking QEMU BlockDriverState"); + return -1; + } + + if (qb_local_init (frame) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Could not allocate local. " + "Leaking QEMU BlockDriverState"); + STACK_DESTROY (frame->root); + return -1; + } + + if (qb_coroutine (frame, qb_co_close) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "Could not allocate coroutine. " + "Leaking QEMU BlockDriverState"); + qb_local_free (this, frame->local); + frame->local = NULL; + STACK_DESTROY (frame->root); + } + + return 0; +} + +int +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + ret = xlator_mem_acct_init (this, gf_qb_mt_end + 1); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init " + "failed"); + return ret; +} + + +int +reconfigure (xlator_t *this, dict_t *options) +{ + return 0; +} + + +int +init (xlator_t *this) +{ + qb_conf_t *conf = NULL; + int32_t ret = -1; + static int bdrv_inited = 0; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: qemu-block (%s) not configured with exactly " + "one child", this->name); + goto out; + } + + conf = GF_CALLOC (1, sizeof (*conf), gf_qb_mt_qb_conf_t); + if (!conf) + goto out; + + /* configure 'option window-size <size>' */ + GF_OPTION_INIT ("default-password", conf->default_password, str, out); + + /* qemu coroutines use "co_mutex" for synchronizing among themselves. + However "co_mutex" itself is not threadsafe if the coroutine framework + is multithreaded (which usually is not). However synctasks are + fundamentally multithreaded, so for now create a syncenv which has + scaling limits set to max 1 thread so that the qemu coroutines can + execute "safely". + + Future work: provide an implementation of "co_mutex" which is + threadsafe and use the global multithreaded ctx->env syncenv. + */ + conf->env = syncenv_new (0, 1, 1); + + this->private = conf; + + ret = 0; + + snprintf (conf->qb_xattr_key, QB_XATTR_KEY_MAX, QB_XATTR_KEY_FMT, + this->name); + + cur_mon = (void *) 1; + + if (!bdrv_inited) { + bdrv_init (); + bdrv_inited = 1; + } + +out: + if (ret) + GF_FREE (conf); + + return ret; +} + + +void +fini (xlator_t *this) +{ + qb_conf_t *conf = NULL; + + conf = this->private; + + this->private = NULL; + + if (conf->root_inode) + inode_unref(conf->root_inode); + GF_FREE (conf); + + return; +} + + +struct xlator_fops fops = { + .lookup = qb_lookup, + .fsetxattr = qb_fsetxattr, + .setxattr = qb_setxattr, + .open = qb_open, + .writev = qb_writev, + .readv = qb_readv, + .fsync = qb_fsync, + .truncate = qb_truncate, + .ftruncate = qb_ftruncate, + .stat = qb_stat, + .fstat = qb_fstat, + .setattr = qb_setattr, + .fsetattr = qb_fsetattr, + .flush = qb_flush, +/* + .getxattr = qb_getxattr, + .fgetxattr = qb_fgetxattr +*/ + .readdirp = qb_readdirp, +}; + + +struct xlator_cbks cbks = { + .forget = qb_forget, + .release = qb_release, +}; + + +struct xlator_dumpops dumpops = { +}; + + +struct volume_options options[] = { + { .key = {"default-password"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "", + .description = "Default password for the AES encrypted block images." + }, + { .key = {NULL} }, +}; diff --git a/xlators/features/qemu-block/src/qemu-block.h b/xlators/features/qemu-block/src/qemu-block.h new file mode 100644 index 000000000..c95f2799a --- /dev/null +++ b/xlators/features/qemu-block/src/qemu-block.h @@ -0,0 +1,109 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __QEMU_BLOCK_H +#define __QEMU_BLOCK_H + +#include "syncop.h" +#include "call-stub.h" +#include "block/block_int.h" +#include "monitor/monitor.h" + +/* QB_XATTR_KEY_FMT is the on-disk xattr stored in the inode which + indicates that the file must be "interpreted" by the block format + logic. The value of the key is of the pattern: + + "format:virtual_size" + + e.g + + "qcow2:20GB" or "qed:100GB" + + The format and virtual size are colon separated. The format is + a case sensitive string which qemu recognizes. virtual_size is + specified as a size which glusterfs recognizes as size (i.e., + value accepted by gf_string2bytesize()) +*/ +#define QB_XATTR_KEY_FMT "trusted.glusterfs.%s.format" + +#define QB_XATTR_KEY_MAX 64 + +#define QB_XATTR_VAL_MAX 64 + + +typedef struct qb_inode { + char fmt[QB_XATTR_VAL_MAX]; /* this is only the format, not "format:size" */ + size_t size; /* virtual size in bytes */ + BlockDriverState *bs; + int refcnt; + uuid_t backing_gfid; + char *backing_fname; +} qb_inode_t; + + +typedef struct qb_conf { + Monitor *mon; + struct syncenv *env; + char qb_xattr_key[QB_XATTR_KEY_MAX]; + char *default_password; + inode_t *root_inode; +} qb_conf_t; + + +typedef struct qb_local { + call_frame_t *frame; /* backpointer */ + call_stub_t *stub; + inode_t *inode; + fd_t *fd; + char fmt[QB_XATTR_VAL_MAX+1]; + char name[256]; + synctask_fn_t synctask_fn; + struct list_head list; +} qb_local_t; + +void qb_local_free (xlator_t *this, qb_local_t *local); +int qb_coroutine (call_frame_t *frame, synctask_fn_t fn); +inode_t *qb_inode_from_filename (const char *filename); +int qb_inode_to_filename (inode_t *inode, char *filename, int size); +int qb_format_extract (xlator_t *this, char *format, inode_t *inode); + +qb_inode_t *qb_inode_ctx_get (xlator_t *this, inode_t *inode); + +#define QB_STACK_UNWIND(typ, frame, args ...) do { \ + qb_local_t *__local = frame->local; \ + xlator_t *__this = frame->this; \ + \ + frame->local = NULL; \ + STACK_UNWIND_STRICT (typ, frame, args); \ + if (__local) \ + qb_local_free (__this, __local); \ + } while (0) + +#define QB_STUB_UNWIND(stub, op_ret, op_errno) do { \ + qb_local_t *__local = stub->frame->local; \ + xlator_t *__this = stub->frame->this; \ + \ + stub->frame->local = NULL; \ + call_unwind_error (stub, op_ret, op_errno); \ + if (__local) \ + qb_local_free (__this, __local); \ + } while (0) + +#define QB_STUB_RESUME(stub_errno) do { \ + qb_local_t *__local = stub->frame->local; \ + xlator_t *__this = stub->frame->this; \ + \ + stub->frame->local = NULL; \ + call_resume (stub); \ + if (__local) \ + qb_local_free (__this, __local); \ + } while (0) + +#endif /* !__QEMU_BLOCK_H */ diff --git a/xlators/features/quiesce/src/quiesce.c b/xlators/features/quiesce/src/quiesce.c index 73eb91947..24c7dc6ed 100644 --- a/xlators/features/quiesce/src/quiesce.c +++ b/xlators/features/quiesce/src/quiesce.c @@ -111,7 +111,7 @@ void gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub) { quiesce_priv_t *priv = NULL; - struct timeval timeout = {0,}; + struct timespec timeout = {0,}; priv = this->private; if (!priv) { @@ -129,7 +129,7 @@ gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub) if (!priv->timer) { timeout.tv_sec = 20; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; priv->timer = gf_timer_call_after (this->ctx, timeout, @@ -2492,7 +2492,7 @@ notify (xlator_t *this, int event, void *data, ...) { int ret = 0; quiesce_priv_t *priv = NULL; - struct timeval timeout = {0,}; + struct timespec timeout = {0,}; priv = this->private; if (!priv) @@ -2525,7 +2525,7 @@ notify (xlator_t *this, int event, void *data, ...) if (priv->timer) break; timeout.tv_sec = 20; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; priv->timer = gf_timer_call_after (this->ctx, timeout, diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c index 4ea54cca8..c527e7ca7 100644 --- a/xlators/features/quota/src/quota.c +++ b/xlators/features/quota/src/quota.c @@ -2969,6 +2969,177 @@ err: return 0; } +int32_t +quota_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_int = 0; + quota_inode_ctx_t *ctx = NULL; + quota_local_t *local = NULL; + quota_dentry_t *dentry = NULL; + int64_t delta = 0; + + local = frame->local; + + if ((op_ret < 0) || (local == NULL)) { + goto out; + } + + ret = inode_ctx_get (local->loc.inode, this, &ctx_int); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get the context", local->loc.path); + goto out; + } + + ctx = (quota_inode_ctx_t *)(unsigned long) ctx_int; + + if (ctx == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "quota context not set in %s (gfid:%s)", + local->loc.path, uuid_utoa (local->loc.inode->gfid)); + goto out; + } + + LOCK (&ctx->lock); + { + ctx->buf = *postbuf; + } + UNLOCK (&ctx->lock); + + list_for_each_entry (dentry, &ctx->parents, next) { + delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512; + quota_update_size (this, local->loc.inode, + dentry->name, dentry->par, delta); + } + +out: + QUOTA_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int32_t +quota_fallocate_helper(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata) +{ + quota_local_t *local = NULL; + int32_t op_errno = EINVAL; + + local = frame->local; + if (local == NULL) { + gf_log (this->name, GF_LOG_WARNING, "local is NULL"); + goto unwind; + } + + if (local->op_ret == -1) { + op_errno = local->op_errno; + goto unwind; + } + + STACK_WIND (frame, quota_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; + +unwind: + QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +quota_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret = -1, op_errno = EINVAL; + int32_t parents = 0; + quota_local_t *local = NULL; + quota_inode_ctx_t *ctx = NULL; + quota_priv_t *priv = NULL; + call_stub_t *stub = NULL; + quota_dentry_t *dentry = NULL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO ("quota", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + local = quota_local_new (); + if (local == NULL) { + goto unwind; + } + + frame->local = local; + local->loc.inode = inode_ref (fd->inode); + + ret = quota_inode_ctx_get (fd->inode, -1, this, NULL, NULL, &ctx, 0); + if (ctx == NULL) { + gf_log (this->name, GF_LOG_WARNING, + "quota context not set in inode (gfid:%s)", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + stub = fop_fallocate_stub(frame, quota_fallocate_helper, fd, mode, offset, len, + xdata); + if (stub == NULL) { + op_errno = ENOMEM; + goto unwind; + } + + priv = this->private; + GF_VALIDATE_OR_GOTO (this->name, priv, unwind); + + LOCK (&ctx->lock); + { + list_for_each_entry (dentry, &ctx->parents, next) { + parents++; + } + } + UNLOCK (&ctx->lock); + + /* + * Note that by using len as the delta we're assuming the range from + * offset to offset+len has not already been allocated. This can result + * in ENOSPC errors attempting to allocate an already allocated range. + */ + local->delta = len; + local->stub = stub; + local->link_count = parents; + + list_for_each_entry (dentry, &ctx->parents, next) { + ret = quota_check_limit (frame, fd->inode, this, dentry->name, + dentry->par); + if (ret == -1) { + break; + } + } + + stub = NULL; + + LOCK (&local->lock); + { + local->link_count = 0; + if (local->validate_count == 0) { + stub = local->stub; + local->stub = NULL; + } + } + UNLOCK (&local->lock); + + if (stub != NULL) { + call_resume (stub); + } + + return 0; + +unwind: + QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + int32_t mem_acct_init (xlator_t *this) @@ -3304,6 +3475,7 @@ struct xlator_fops fops = { .removexattr = quota_removexattr, .fremovexattr = quota_fremovexattr, .readdirp = quota_readdirp, + .fallocate = quota_fallocate, }; struct xlator_cbks cbks = { diff --git a/xlators/lib/src/libxlator.c b/xlators/lib/src/libxlator.c index 624a929d0..9e5357255 100644 --- a/xlators/lib/src/libxlator.c +++ b/xlators/lib/src/libxlator.c @@ -11,6 +11,34 @@ #include "libxlator.h" +int marker_xtime_default_gauge[] = { + [MCNT_FOUND] = 1, + [MCNT_NOTFOUND] = -1, + [MCNT_ENODATA] = -1, + [MCNT_ENOTCONN] = -1, + [MCNT_ENOENT] = -1, + [MCNT_EOTHER] = -1, +}; + +int marker_uuid_default_gauge[] = { + [MCNT_FOUND] = 1, + [MCNT_NOTFOUND] = 0, + [MCNT_ENODATA] = 0, + [MCNT_ENOTCONN] = 0, + [MCNT_ENOENT] = 0, + [MCNT_EOTHER] = 0, +}; + +static int marker_idx_errno_map[] = { + [MCNT_FOUND] = EINVAL, + [MCNT_NOTFOUND] = EINVAL, + [MCNT_ENOENT] = ENOENT, + [MCNT_ENOTCONN] = ENOTCONN, + [MCNT_ENODATA] = ENODATA, + [MCNT_EOTHER] = EINVAL, + [MCNT_MAX] = 0, +}; + /*Copy the contents of oldtimebuf to newtimbuf*/ static void update_timebuf (uint32_t *oldtimbuf, uint32_t *newtimebuf) @@ -47,22 +75,61 @@ match_uuid_local (const char *name, char *uuid) static void marker_local_incr_errcount (xl_marker_local_t *local, int op_errno) { + marker_result_idx_t i = -1; + if (!local) return; switch (op_errno) { case ENODATA: - local->enodata_count++; + i = MCNT_ENODATA; break; case ENOENT: - local->enoent_count++; + i = MCNT_ENOENT; break; case ENOTCONN: - local->enotconn_count++; + i = MCNT_ENOTCONN; break; default: + i = MCNT_EOTHER; break; } + + local->count[i]++; +} + +static int +evaluate_marker_results (int *gauge, int *count) +{ + int i = 0; + int op_errno = 0; + gf_boolean_t sane = _gf_true; + + /* check if the policy of the gauge is violated; + * if yes, try to get the best errno, ie. look + * for the first position where there is a more + * specific kind of vioilation than the generic EINVAL + */ + for (i = 0; i < MCNT_MAX; i++) { + if (sane) { + if ((gauge[i] > 0 && count[i] < gauge[i]) || + (gauge[i] < 0 && count[i] >= -gauge[i])) { + sane = _gf_false; + /* generic action: adopt corresponding errno */ + op_errno = marker_idx_errno_map[i]; + } + } else { + /* already insane; trying to get a more informative + * errno by checking subsequent counters + */ + if (count[i] > 0) + op_errno = marker_idx_errno_map[i]; + } + if (op_errno && op_errno != EINVAL) + break; + } + + return op_errno; } /* Aggregate all the <volid>.xtime attrs of the cluster and send the max*/ @@ -98,14 +165,10 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { callcnt = --local->call_count; - if (local->esomerr) - goto unlock; - vol_uuid = local->vol_uuid; if (op_ret) { marker_local_incr_errcount (local, op_errno); - local->esomerr = op_errno; goto unlock; } @@ -119,11 +182,11 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (dict_get_ptr (dict, marker_xattr, (void **)&net_timebuf)) { gf_log (this->name, GF_LOG_WARNING, "Unable to get <uuid>.xtime attr"); - local->noxtime_count++; + local->count[MCNT_NOTFOUND]++; goto unlock; } - if (local->has_xtime) { + if (local->count[MCNT_FOUND]) { get_hosttime (net_timebuf, host_timebuf); if ( (host_timebuf[0]>local->host_timebuf[0]) || (host_timebuf[0] == local->host_timebuf[0] && @@ -135,7 +198,7 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } else { get_hosttime (net_timebuf, local->host_timebuf); update_timebuf (net_timebuf, local->net_timebuf); - local->has_xtime = _gf_true; + local->count[MCNT_FOUND]++; } } @@ -147,7 +210,7 @@ unlock: op_errno = 0; need_unwind = 1; - if (local->has_xtime) { + if (local->count[MCNT_FOUND]) { if (!dict) dict = dict_new(); @@ -160,17 +223,9 @@ unlock: } } - if (local->noxtime_count) - goto out; - - if (local->enodata_count || local->enotconn_count || - local->enoent_count) { + op_errno = evaluate_marker_results (local->gauge, local->count); + if (op_errno) op_ret = -1; - op_errno = local->enodata_count? ENODATA: - local->enotconn_count? ENOTCONN: - local->enoent_count? ENOENT: - local->esomerr; - } } out: @@ -228,7 +283,7 @@ cluster_markeruuid_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (ret) goto unlock; - if (marker_has_volinfo (local)) { + if (local->count[MCNT_FOUND]) { if ((local->volmark->major != volmark->major) || (local->volmark->minor != volmark->minor)) { op_ret = -1; @@ -257,6 +312,7 @@ cluster_markeruuid_cbk (call_frame_t *frame, void *cookie, xlator_t *this, uuid_unparse (volmark->uuid, vol_uuid); if (volmark->retval) local->retval = volmark->retval; + local->count[MCNT_FOUND]++; } } unlock: @@ -267,7 +323,7 @@ unlock: op_errno = 0; need_unwind = 1; - if (marker_has_volinfo (local)) { + if (local->count[MCNT_FOUND]) { if (!dict) dict = dict_new(); @@ -277,11 +333,10 @@ unlock: op_ret = -1; op_errno = ENOMEM; } - } else { - op_ret = -1; - op_errno = local->enotconn_count? ENOTCONN: - local->enoent_count? ENOENT:EINVAL; } + op_errno = evaluate_marker_results (local->gauge, local->count); + if (op_errno) + op_ret = -1; } out: @@ -303,7 +358,7 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc, const char *name, void *xl_local, xlator_specf_unwind_t xl_specf_getxattr_unwind, xlator_t **sub_volumes, int count, int type, - char *vol_uuid) + int *gauge, char *vol_uuid) { int i = 0; xl_marker_local_t *local = NULL; @@ -326,6 +381,7 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc, local->call_count = count; local->xl_specf_unwind = xl_specf_getxattr_unwind; local->vol_uuid = vol_uuid; + memcpy (local->gauge, gauge, sizeof (local->gauge)); frame->local = local; @@ -357,3 +413,58 @@ err: return -1; } + +int +gf_get_min_stime (xlator_t *this, dict_t *dst, char *key, data_t *value) +{ + int ret = -1; + uint32_t *net_timebuf = NULL; + uint32_t *value_timebuf = NULL; + uint32_t host_timebuf[2] = {0,}; + uint32_t host_value_timebuf[2] = {0,}; + + /* stime should be minimum of all the other nodes */ + ret = dict_get_bin (dst, key, (void **)&net_timebuf); + if (ret < 0) { + net_timebuf = GF_CALLOC (1, sizeof (int64_t), + gf_common_mt_char); + if (!net_timebuf) + goto out; + + ret = dict_set_bin (dst, key, net_timebuf, sizeof (int64_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "key=%s: dict set failed", key); + goto error; + } + } + + value_timebuf = data_to_bin (value); + if (!value_timebuf) { + gf_log (this->name, GF_LOG_WARNING, + "key=%s: getting value of stime failed", key); + ret = -1; + goto out; + } + + get_hosttime (value_timebuf, host_value_timebuf); + get_hosttime (net_timebuf, host_timebuf); + + /* can't use 'min()' macro here as we need to compare two fields + in the array, selectively */ + if ((host_value_timebuf[0] > host_timebuf[0]) || + ((host_value_timebuf[0] == host_timebuf[0]) && + (host_value_timebuf[1] > host_timebuf[1]))) { + update_timebuf (value_timebuf, net_timebuf); + } + + ret = 0; +out: + return ret; +error: + /* To be used only when net_timebuf is not set in the dict */ + if (net_timebuf) + GF_FREE (net_timebuf); + + return ret; +} diff --git a/xlators/lib/src/libxlator.h b/xlators/lib/src/libxlator.h index 7e75c9482..1d5e1657f 100644 --- a/xlators/lib/src/libxlator.h +++ b/xlators/lib/src/libxlator.h @@ -48,6 +48,69 @@ struct volume_mark { uint32_t usec; }__attribute__ ((__packed__)); + +/* + * The enumerated type here + * is used to index two kind + * of integer arrays: + * - gauges + * - counters + + * A counter is used internally, + * in getxattr callbacks, to count + * the results, categorized as + * the enum names suggest. So values + * in the counter are always non-negative. + + * Gauges are part of the API. + * The caller passes one to the + * top-level aggregator function, + * cluster_getmarkerattr(). The gauge + * defines an evaluation policy for the + * counter. That is, at the + * end of the aggregation process + * the gauge is matched against the + * counter, and the policy + * represented by the gauge decides + * whether to return with success or failure, + * and in latter case, what particular failure + * case (errno). + + * The rules are the following: for some index i, + * - if gauge[i] == 0, no requirement is set + * against counter[i]; + * - if gauge[i] > 0, counter[i] >= gauge[i] + * is required; + * - if gauge[i] < 0, counter[i] < |gauge[i]| + * is required. + + * If the requirement is not met, then i is mapped + * to the respective errno (MCNT_ENOENT -> ENOENT), + * or in lack of that, EINVAL. + + * Cf. evaluate_marker_results() and marker_idx_errno_map[] + * in libxlator.c + + * We provide two default gauges, one inteded for xtime + * aggregation, other for volume mark aggregation. The + * policies they represent agree with the hard-coded + * one prior to gauges. Cf. marker_xtime_default_gauge + * and marker_uuid_default_gauge in libxlator.c + */ + +typedef enum { + MCNT_FOUND, + MCNT_NOTFOUND, + MCNT_ENODATA, + MCNT_ENOTCONN, + MCNT_ENOENT, + MCNT_EOTHER, + MCNT_MAX +} marker_result_idx_t; + +extern int marker_xtime_default_gauge[]; +extern int marker_uuid_default_gauge[]; + struct marker_str { struct volume_mark *volmark; data_t *data; @@ -55,13 +118,8 @@ struct marker_str { uint32_t host_timebuf[2]; uint32_t net_timebuf[2]; int32_t call_count; - unsigned has_xtime:1; - int32_t enoent_count; - int32_t enotconn_count; - int32_t enodata_count; - int32_t noxtime_count; - - int esomerr; + int gauge[MCNT_MAX]; + int count[MCNT_MAX]; xlator_specf_unwind_t xl_specf_unwind; void *xl_local; @@ -71,15 +129,6 @@ struct marker_str { typedef struct marker_str xl_marker_local_t; -static inline gf_boolean_t -marker_has_volinfo (xl_marker_local_t *marker) -{ - if (marker->volmark) - return _gf_true; - else - return _gf_false; -} - int32_t cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *dict, dict_t *xdata); @@ -93,12 +142,12 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc, const char *name, void *xl_local, xlator_specf_unwind_t xl_specf_getxattr_unwind, xlator_t **sub_volumes, int count, int type, - char *vol_uuid); + int *gauge, char *vol_uuid); int match_uuid_local (const char *name, char *uuid); - - +int +gf_get_min_stime (xlator_t *this, dict_t *dst, char *key, data_t *value); #endif /* !_LIBXLATOR_H */ diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am index a6f49ae01..933c44019 100644 --- a/xlators/mgmt/glusterd/src/Makefile.am +++ b/xlators/mgmt/glusterd/src/Makefile.am @@ -11,7 +11,9 @@ glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \ glusterd-volgen.c glusterd-rebalance.c glusterd-quota.c \ glusterd-geo-rep.c glusterd-replace-brick.c glusterd-log-ops.c \ glusterd-volume-ops.c glusterd-brick-ops.c glusterd-mountbroker.c \ - glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c + glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c \ + glusterd-locks.c glusterd-snapshot.c glusterd-mgmt-handler.c \ + glusterd-mgmt.c glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ $(top_builddir)/rpc/xdr/src/libgfxdr.la \ @@ -21,7 +23,8 @@ glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ noinst_HEADERS = glusterd.h glusterd-utils.h glusterd-op-sm.h \ glusterd-sm.h glusterd-store.h glusterd-mem-types.h \ glusterd-pmap.h glusterd-volgen.h glusterd-mountbroker.h \ - glusterd-syncop.h glusterd-hooks.h + glusterd-syncop.h glusterd-hooks.h glusterd-locks.h \ + glusterd-mgmt.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(rpclibdir) -I$(CONTRIBDIR)/rbtree \ diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 912a6a798..596503c21 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -546,6 +546,72 @@ glusterd_handle_add_brick (rpcsvc_request_t *req) return glusterd_big_locked_handler (req, __glusterd_handle_add_brick); } +static int +subvol_matcher_init (int **subvols, int count) +{ + int ret = -1; + + *subvols = GF_CALLOC (count, sizeof(int), gf_gld_mt_int); + if (*subvols) + ret = 0; + + return ret; +} + +static void +subvol_matcher_update (int *subvols, glusterd_volinfo_t *volinfo, + glusterd_brickinfo_t *brickinfo) +{ + glusterd_brickinfo_t *tmp = NULL; + int32_t sub_volume = 0; + int pos = 0; + + list_for_each_entry (tmp, &volinfo->bricks, brick_list) { + + if (strcmp (tmp->hostname, brickinfo->hostname) || + strcmp (tmp->path, brickinfo->path)) { + pos++; + continue; + } + gf_log (THIS->name, GF_LOG_DEBUG, LOGSTR_FOUND_BRICK, + brickinfo->hostname, brickinfo->path, + volinfo->volname); + sub_volume = (pos / volinfo->dist_leaf_count); + subvols[sub_volume]++; + break; + } + +} + +static int +subvol_matcher_verify (int *subvols, glusterd_volinfo_t *volinfo, char *err_str, + size_t err_len, char *vol_type) +{ + int i = 0; + int ret = 0; + + do { + + if (subvols[i] % volinfo->dist_leaf_count == 0) { + continue; + } else { + ret = -1; + snprintf (err_str, err_len, + "Bricks not from same subvol for %s", vol_type); + gf_log (THIS->name, GF_LOG_ERROR, "%s", err_str); + break; + } + } while (++i < volinfo->subvol_count); + + return ret; +} + +static void +subvol_matcher_destroy (int *subvols) +{ + GF_FREE (subvols); +} + int __glusterd_handle_remove_brick (rpcsvc_request_t *req) { @@ -559,10 +625,7 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req) int i = 1; glusterd_volinfo_t *volinfo = NULL; glusterd_brickinfo_t *brickinfo = NULL; - int32_t pos = 0; - int32_t sub_volume = 0; - int32_t sub_volume_start = 0; - int32_t sub_volume_end = 0; + int *subvols = NULL; glusterd_brickinfo_t *tmp = NULL; char err_str[2048] = {0}; gf_cli_rsp rsp = {0,}; @@ -681,17 +744,6 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req) goto out; } - /*Do not allow remove-brick if the volume is a replicate volume*/ - if ((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) && - (volinfo->brick_count == volinfo->replica_count)) { - snprintf (err_str, sizeof(err_str), - "Removing brick from a replicate volume " - "is not allowed"); - gf_log (this->name, GF_LOG_ERROR, "%s", err_str); - ret = -1; - goto out; - } - if (!replica_count && (volinfo->type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) && (volinfo->brick_count == volinfo->dist_leaf_count)) { @@ -738,6 +790,14 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req) } strcpy (brick_list, " "); + + if ((volinfo->type != GF_CLUSTER_TYPE_NONE) && + (volinfo->subvol_count > 1)) { + ret = subvol_matcher_init (&subvols, volinfo->subvol_count); + if (ret) + goto out; + } + while ( i <= count) { snprintf (key, sizeof (key), "brick%d", i); ret = dict_get_str (dict, key, &brick); @@ -805,38 +865,18 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req) goto out; } - pos = 0; - list_for_each_entry (tmp, &volinfo->bricks, brick_list) { - - if (strcmp (tmp->hostname,brickinfo->hostname) || - strcmp (tmp->path, brickinfo->path)) { - pos++; - continue; - } + /* Find which subvolume the brick belongs to */ + subvol_matcher_update (subvols, volinfo, brickinfo); + } - gf_log (this->name, GF_LOG_DEBUG, LOGSTR_FOUND_BRICK, - brickinfo->hostname, brickinfo->path, - volinfo->volname); - if (!sub_volume && (volinfo->dist_leaf_count > 1)) { - sub_volume = (pos / volinfo->dist_leaf_count) + 1; - sub_volume_start = (volinfo->dist_leaf_count * - (sub_volume - 1)); - sub_volume_end = (volinfo->dist_leaf_count * - sub_volume) - 1; - } else { - if (pos < sub_volume_start || - pos >sub_volume_end) { - ret = -1; - snprintf (err_str, sizeof (err_str), - "Bricks not from same subvol " - "for %s", vol_type); - gf_log (this->name, GF_LOG_ERROR, - "%s", err_str); - goto out; - } - } - break; - } + /* Check if the bricks belong to the same subvolumes.*/ + if ((volinfo->type != GF_CLUSTER_TYPE_NONE) && + (volinfo->subvol_count > 1)) { + ret = subvol_matcher_verify (subvols, volinfo, + err_str, sizeof(err_str), + vol_type); + if (ret) + goto out; } ret = glusterd_op_begin_synctask (req, GD_OP_REMOVE_BRICK, dict); @@ -859,6 +899,7 @@ out: } GF_FREE (brick_list); + subvol_matcher_destroy (subvols); free (cli_req.dict.dict_val); //its malloced by xdr return ret; @@ -871,23 +912,121 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req) __glusterd_handle_remove_brick); } +static int +_glusterd_restart_gsync_session (dict_t *this, char *key, + data_t *value, void *data) +{ + char *slave = NULL; + char *slave_buf = NULL; + char *path_list = NULL; + char *slave_vol = NULL; + char *slave_ip = NULL; + char *conf_path = NULL; + char **errmsg = NULL; + int ret = -1; + glusterd_gsync_status_temp_t *param = NULL; + gf_boolean_t is_running = _gf_false; + + param = (glusterd_gsync_status_temp_t *)data; + + GF_ASSERT (param); + GF_ASSERT (param->volinfo); + + slave = strchr(value->data, ':'); + if (slave) { + slave++; + slave_buf = gf_strdup (slave); + if (!slave_buf) { + gf_log ("", GF_LOG_ERROR, + "Failed to gf_strdup"); + ret = -1; + goto out; + } + } + else + return 0; + + ret = dict_set_dynstr (param->rsp_dict, "slave", slave_buf); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to store slave"); + if (slave_buf) + GF_FREE(slave_buf); + goto out; + } + + ret = glusterd_get_slave_details_confpath (param->volinfo, + param->rsp_dict, + &slave_ip, &slave_vol, + &conf_path, errmsg); + if (ret) { + if (*errmsg) + gf_log ("", GF_LOG_ERROR, "%s", *errmsg); + else + gf_log ("", GF_LOG_ERROR, + "Unable to fetch slave or confpath details."); + goto out; + } + + /* In cases that gsyncd is not running, we will not invoke it + * because of add-brick. */ + ret = glusterd_check_gsync_running_local (param->volinfo->volname, + slave, conf_path, + &is_running); + if (ret) { + gf_log ("", GF_LOG_ERROR, "gsync running validation failed."); + goto out; + } + if (_gf_false == is_running) { + gf_log ("", GF_LOG_DEBUG, "gsync session for %s and %s is" + " not running on this node. Hence not restarting.", + param->volinfo->volname, slave); + ret = 0; + goto out; + } + + ret = glusterd_get_local_brickpaths (param->volinfo, &path_list); + if (!path_list) { + gf_log ("", GF_LOG_DEBUG, "This node not being part of" + " volume should not be running gsyncd. Hence" + " no gsyncd process to restart."); + ret = 0; + goto out; + } + + ret = glusterd_check_restart_gsync_session (param->volinfo, slave, + param->rsp_dict, path_list, + conf_path, 0); + if (ret) + gf_log ("", GF_LOG_ERROR, + "Unable to restart gsync session."); + +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d.", ret); + return ret; +} + /* op-sm */ int glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, char *bricks, dict_t *dict) { - glusterd_brickinfo_t *brickinfo = NULL; - char *brick = NULL; - int32_t i = 1; - char *brick_list = NULL; - char *free_ptr1 = NULL; - char *free_ptr2 = NULL; - char *saveptr = NULL; - int32_t ret = -1; - int32_t stripe_count = 0; - int32_t replica_count = 0; - int32_t type = 0; + char *brick = NULL; + int32_t i = 1; + char *brick_list = NULL; + char *free_ptr1 = NULL; + char *free_ptr2 = NULL; + char *saveptr = NULL; + int32_t ret = -1; + int32_t stripe_count = 0; + int32_t replica_count = 0; + int32_t type = 0; + glusterd_brickinfo_t *brickinfo = NULL; + glusterd_gsync_status_temp_t param = {0, }; + gf_boolean_t restart_needed = 0; + char msg[1024] __attribute__((unused)) = {0, }; + int caps = 0; GF_ASSERT (volinfo); @@ -968,13 +1107,40 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, if (count) brick = strtok_r (brick_list+1, " \n", &saveptr); +#ifdef HAVE_BD_XLATOR + if (brickinfo->vg[0]) + caps = CAPS_BD | CAPS_THIN | + CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT; +#endif while (i <= count) { - ret = glusterd_volume_brickinfo_get_by_brick (brick, volinfo, &brickinfo); if (ret) goto out; +#ifdef HAVE_BD_XLATOR + /* Check for VG/thin pool if its BD volume */ + if (brickinfo->vg[0]) { + ret = glusterd_is_valid_vg (brickinfo, 0, msg); + if (ret) { + gf_log (THIS->name, GF_LOG_CRITICAL, "%s", msg); + goto out; + } + /* if anyone of the brick does not have thin support, + disable it for entire volume */ + caps &= brickinfo->caps; + } else + caps = 0; +#endif + + if (uuid_is_null (brickinfo->uuid)) { + ret = glusterd_resolve_brick (brickinfo); + if (ret) { + gf_log ("", GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK, + brickinfo->hostname, brickinfo->path); + goto out; + } + } ret = glusterd_brick_start (volinfo, brickinfo, _gf_true); @@ -982,8 +1148,27 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, goto out; i++; brick = strtok_r (NULL, " \n", &saveptr); + + /* Check if the brick is added in this node, and set + * the restart_needed flag. */ + if ((!uuid_compare (brickinfo->uuid, MY_UUID)) && + !restart_needed) { + restart_needed = 1; + gf_log ("", GF_LOG_DEBUG, + "Restart gsyncd session, if it's already " + "running."); + } } + /* If the restart_needed flag is set, restart gsyncd sessions for that + * particular master with all the slaves. */ + if (restart_needed) { + param.rsp_dict = dict; + param.volinfo = volinfo; + dict_foreach (volinfo->gsync_slaves, + _glusterd_restart_gsync_session, ¶m); + } + volinfo->caps = caps; out: GF_FREE (free_ptr1); GF_FREE (free_ptr2); @@ -1080,15 +1265,6 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr) goto out; } - if (volinfo->backend == GD_VOL_BK_BD) { - snprintf (msg, sizeof (msg), "Add brick is not supported for " - "Block backend volume %s.", volname); - gf_log (THIS->name, GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - ret = -1; - goto out; - } - ret = glusterd_validate_volume_id (dict, volinfo); if (ret) goto out; @@ -1166,6 +1342,18 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr) } if (!uuid_compare (brickinfo->uuid, MY_UUID)) { +#ifdef HAVE_BD_XLATOR + if (brickinfo->vg[0]) { + ret = glusterd_is_valid_vg (brickinfo, 1, msg); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "%s", + msg); + *op_errstr = gf_strdup (msg); + goto out; + } + } +#endif + ret = glusterd_validate_and_create_brickpath (brickinfo, volinfo->volume_id, op_errstr, is_force); @@ -1255,6 +1443,16 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr) case GF_OP_CMD_START: { + if ((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) && + dict_get (dict, "replica-count")) { + snprintf (msg, sizeof(msg), "Migration of data is not " + "needed when reducing replica count. Use the" + " 'force' option"); + errstr = gf_strdup (msg); + gf_log (this->name, GF_LOG_ERROR, "%s", errstr); + goto out; + } + if (GLUSTERD_STATUS_STARTED != volinfo->status) { snprintf (msg, sizeof (msg), "Volume %s needs to be " "started before remove-brick (you can use " @@ -1264,6 +1462,17 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr) gf_log (this->name, GF_LOG_ERROR, "%s", errstr); goto out; } + if (!gd_is_remove_brick_committed (volinfo)) { + snprintf (msg, sizeof (msg), "An earlier remove-brick " + "task exists for volume %s. Either commit it" + " or stop it before starting a new task.", + volinfo->volname); + errstr = gf_strdup (msg); + gf_log (this->name, GF_LOG_ERROR, "Earlier remove-brick" + " task exists for volume %s.", + volinfo->volname); + goto out; + } if (glusterd_is_defrag_on(volinfo)) { errstr = gf_strdup("Rebalance is in progress. Please " "retry after completion"); @@ -1271,7 +1480,7 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr) goto out; } - if (is_origin_glusterd ()) { + if (is_origin_glusterd (dict)) { ret = glusterd_generate_and_set_task_id (dict, GF_REMOVE_BRICK_TID_KEY); if (ret) { @@ -1458,15 +1667,6 @@ glusterd_op_add_brick (dict_t *dict, char **op_errstr) goto out; } - /* Need to reset the defrag/rebalance status accordingly */ - switch (volinfo->rebal.defrag_status) { - case GF_DEFRAG_STATUS_FAILED: - case GF_DEFRAG_STATUS_COMPLETE: - volinfo->rebal.defrag_status = 0; - default: - break; - } - ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); if (ret) goto out; @@ -1498,6 +1698,8 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) glusterd_brickinfo_t *tmp = NULL; char *task_id_str = NULL; xlator_t *this = NULL; + dict_t *bricks_dict = NULL; + char *brick_tmpstr = NULL; this = THIS; GF_ASSERT (this); @@ -1525,7 +1727,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) /* Set task-id, if available, in ctx dict for operations other than * start */ - if (is_origin_glusterd () && (cmd != GF_OP_CMD_START)) { + if (is_origin_glusterd (dict) && (cmd != GF_OP_CMD_START)) { if (!uuid_is_null (volinfo->rebal.rebalance_id)) { ret = glusterd_copy_uuid_to_dict (volinfo->rebal.rebalance_id, dict, @@ -1538,9 +1740,14 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) } } - /* Clear task-id on commmitting/stopping of remove-brick operation */ - if ((cmd != GF_OP_CMD_START) || (cmd != GF_OP_CMD_STATUS)) + /* Clear task-id, rebal.op and stored bricks on commmitting/stopping + * remove-brick */ + if ((cmd != GF_OP_CMD_START) || (cmd != GF_OP_CMD_STATUS)) { uuid_clear (volinfo->rebal.rebalance_id); + volinfo->rebal.op = GD_OP_NONE; + dict_unref (volinfo->rebal.dict); + volinfo->rebal.dict = NULL; + } ret = -1; switch (cmd) { @@ -1587,6 +1794,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) ret = 0; } else { uuid_parse (task_id_str, volinfo->rebal.rebalance_id) ; + volinfo->rebal.op = GD_OP_REMOVE_BRICK; } force = 0; break; @@ -1623,7 +1831,20 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) goto out; } - + /* Save the list of bricks for later usage. Right now this is required + * for displaying the task parameters with task status in volume status. + */ + bricks_dict = dict_new (); + if (!bricks_dict) { + ret = -1; + goto out; + } + ret = dict_set_int32 (bricks_dict, "count", count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to save remove-brick count"); + goto out; + } while ( i <= count) { snprintf (key, 256, "brick%d", i); ret = dict_get_str (dict, key, &brick); @@ -1633,6 +1854,21 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) goto out; } + brick_tmpstr = gf_strdup (brick); + if (!brick_tmpstr) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to duplicate brick name"); + goto out; + } + ret = dict_set_dynstr (bricks_dict, key, brick_tmpstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add brick to dict"); + goto out; + } + brick_tmpstr = NULL; + ret = glusterd_op_perform_remove_brick (volinfo, brick, force, &need_rebalance); if (ret) @@ -1646,6 +1882,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) volinfo->replica_count, replica_count, volinfo->volname); volinfo->replica_count = replica_count; + volinfo->sub_count = replica_count; volinfo->dist_leaf_count = glusterd_get_dist_leaf_count (volinfo); volinfo->subvol_count = (volinfo->brick_count / volinfo->dist_leaf_count); @@ -1662,6 +1899,8 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) } } } + volinfo->rebal.dict = bricks_dict; + bricks_dict = NULL; ret = glusterd_create_volfiles_and_notify_services (volinfo); if (ret) { @@ -1706,5 +1945,9 @@ out: if (ret && err_str[0] && op_errstr) *op_errstr = gf_strdup (err_str); + GF_FREE (brick_tmpstr); + if (bricks_dict) + dict_unref (bricks_dict); + return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c index 7e2ab833f..5786694bd 100644 --- a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c +++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c @@ -25,6 +25,40 @@ #include <signal.h> +static int +dict_get_param (dict_t *dict, char *key, char **param); + +static int +glusterd_get_statefile_name (glusterd_volinfo_t *volinfo, char *slave, + char *conf_path, char **statefile); + +static int +glusterd_get_slave_info (char *slave, char **slave_ip, + char **slave_vol, char **op_errstr); + +static int +glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen); + +struct gsync_config_opt_vals_ gsync_confopt_vals[] = { + {.op_name = "change_detector", + .no_of_pos_vals = 2, + .case_sensitive = _gf_true, + .values = {"xsync", "changelog"}, + }, + {.op_name = "special_sync_mode", + .no_of_pos_vals = 2, + .case_sensitive = _gf_true, + .values = {"partial", "recover"} + }, + {.op_name = "log-level", + .no_of_pos_vals = 5, + .case_sensitive = _gf_false, + .values = {"critical", "error", "warning", "info", "debug"} + }, + {.op_name = NULL, + }, +}; + static char *gsync_reserved_opts[] = { "gluster-command-dir", "pid-file", @@ -33,10 +67,156 @@ static char *gsync_reserved_opts[] = { "session-owner", "state-socket-unencoded", "socketdir", + "ignore-deletes", + "local-id", + "local-path", + "slave-id", NULL }; int +__glusterd_handle_sys_exec (rpcsvc_request_t *req) +{ + int32_t ret = 0; + dict_t *dict = NULL; + gf_cli_req cli_req = {{0},}; + glusterd_op_t cli_op = GD_OP_SYS_EXEC; + glusterd_conf_t *priv = NULL; + char *host_uuid = NULL; + char err_str[2048] = {0,}; + xlator_t *this = NULL; + + GF_ASSERT (req); + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + + ret = xdr_to_generic (req->msg[0], &cli_req, + (xdrproc_t)xdr_gf_cli_req); + if (ret < 0) { + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + if (cli_req.dict.dict_len) { + dict = dict_new (); + if (!dict) + goto out; + + + ret = dict_unserialize (cli_req.dict.dict_val, + cli_req.dict.dict_len, + &dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "unserialize req-buffer to dictionary"); + snprintf (err_str, sizeof (err_str), "Unable to decode " + "the command"); + goto out; + } else { + dict->extra_stdfree = cli_req.dict.dict_val; + } + + host_uuid = gf_strdup (uuid_utoa(MY_UUID)); + if (host_uuid == NULL) { + snprintf (err_str, sizeof (err_str), "Failed to get " + "the uuid of local glusterd"); + ret = -1; + goto out; + } + + ret = dict_set_dynstr (dict, "host-uuid", host_uuid); + if (ret) + goto out; + } + + ret = glusterd_op_begin_synctask (req, cli_op, dict); + +out: + if (ret) { + if (err_str[0] == '\0') + snprintf (err_str, sizeof (err_str), + "Operation failed"); + ret = glusterd_op_send_cli_response (cli_op, ret, 0, req, + dict, err_str); + } + return ret; +} + +int +__glusterd_handle_copy_file (rpcsvc_request_t *req) +{ + int32_t ret = 0; + dict_t *dict = NULL; + gf_cli_req cli_req = {{0},}; + glusterd_op_t cli_op = GD_OP_COPY_FILE; + glusterd_conf_t *priv = NULL; + char *host_uuid = NULL; + char err_str[2048] = {0,}; + xlator_t *this = NULL; + + GF_ASSERT (req); + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + + ret = xdr_to_generic (req->msg[0], &cli_req, + (xdrproc_t)xdr_gf_cli_req); + if (ret < 0) { + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + if (cli_req.dict.dict_len) { + dict = dict_new (); + if (!dict) + goto out; + + + ret = dict_unserialize (cli_req.dict.dict_val, + cli_req.dict.dict_len, + &dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "unserialize req-buffer to dictionary"); + snprintf (err_str, sizeof (err_str), "Unable to decode " + "the command"); + goto out; + } else { + dict->extra_stdfree = cli_req.dict.dict_val; + } + + host_uuid = gf_strdup (uuid_utoa(MY_UUID)); + if (host_uuid == NULL) { + snprintf (err_str, sizeof (err_str), "Failed to get " + "the uuid of local glusterd"); + ret = -1; + goto out; + } + + ret = dict_set_dynstr (dict, "host-uuid", host_uuid); + if (ret) + goto out; + } + + ret = glusterd_op_begin_synctask (req, cli_op, dict); + +out: + if (ret) { + if (err_str[0] == '\0') + snprintf (err_str, sizeof (err_str), + "Operation failed"); + ret = glusterd_op_send_cli_response (cli_op, ret, 0, req, + dict, err_str); + } + return ret; +} + +int __glusterd_handle_gsync_set (rpcsvc_request_t *req) { int32_t ret = 0; @@ -100,13 +280,13 @@ __glusterd_handle_gsync_set (rpcsvc_request_t *req) ret = dict_get_str (dict, "master", &master); if (ret < 0) { gf_log (this->name, GF_LOG_INFO, "master not found, while " - "handling"GEOREP" options"); + "handling "GEOREP" options"); master = "(No Master)"; } ret = dict_get_str (dict, "slave", &slave); if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, "slave not not found, while" + gf_log (this->name, GF_LOG_INFO, "slave not found, while " "handling "GEOREP" options"); slave = "(No Slave)"; } @@ -120,6 +300,10 @@ __glusterd_handle_gsync_set (rpcsvc_request_t *req) } switch (type) { + case GF_GSYNC_OPTION_TYPE_CREATE: + strncpy (operation, "create", sizeof (operation)); + cli_op = GD_OP_GSYNC_CREATE; + break; case GF_GSYNC_OPTION_TYPE_START: strncpy (operation, "start", sizeof (operation)); @@ -136,12 +320,9 @@ __glusterd_handle_gsync_set (rpcsvc_request_t *req) case GF_GSYNC_OPTION_TYPE_STATUS: strncpy (operation, "status", sizeof (operation)); break; - case GF_GSYNC_OPTION_TYPE_ROTATE: - strncpy (operation, "rotate", sizeof(operation)); - break; } - ret = glusterd_op_begin_synctask (req, GD_OP_GSYNC_SET, dict); + ret = glusterd_op_begin_synctask (req, cli_op, dict); out: if (ret) { @@ -154,6 +335,17 @@ out: return ret; } +int +glusterd_handle_sys_exec (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, __glusterd_handle_sys_exec); +} + +int +glusterd_handle_copy_file (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, __glusterd_handle_copy_file); +} int glusterd_handle_gsync_set (rpcsvc_request_t *req) @@ -444,7 +636,7 @@ _fcbk_conftodict (char *resbuf, size_t blen, FILE *fp, void *data) } static int -glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t *dict) +glusterd_gsync_get_config (char *master, char *slave, char *conf_path, dict_t *dict) { /* key + value, where value must be able to accommodate a path */ char resbuf[256 + PATH_MAX] = {0,}; @@ -452,7 +644,7 @@ glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t * runinit (&runner); runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL); - runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir); + runner_argprintf (&runner, "%s", conf_path); runner_argprintf (&runner, ":%s", master); runner_add_args (&runner, slave, "--config-get-all", NULL); @@ -462,13 +654,13 @@ glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t * static int glusterd_gsync_get_param_file (char *prmfile, const char *param, char *master, - char *slave, char *gl_workdir) + char *slave, char *conf_path) { runner_t runner = {0,}; runinit (&runner); runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL); - runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir); + runner_argprintf (&runner, "%s", conf_path); runner_argprintf (&runner, ":%s", master); runner_add_args (&runner, slave, "--config-get", NULL); runner_argprintf (&runner, "%s-file", param); @@ -477,83 +669,14 @@ glusterd_gsync_get_param_file (char *prmfile, const char *param, char *master, } static int -glusterd_gsync_get_session_owner (char *master, char *slave, char *session_owner, - char *gl_workdir) -{ - runner_t runner = {0,}; - - runinit(&runner); - runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL); - runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir); - runner_argprintf (&runner, ":%s", master); - runner_add_args (&runner, slave, "--config-get", "session-owner", - NULL); - - return glusterd_query_extutil (session_owner, &runner); -} - -/* check whether @slave is local or remote. normalized - * urls starting with ssh are considered to be remote - * @returns - * 1 if slave is remote - * 0 is slave is local - */ -static int -glusterd_gsync_slave_is_remote (char *slave) -{ - int ret = 0; - char *ssh_pos = NULL; - - ssh_pos = strstr(slave, "ssh://"); - if ( ssh_pos && ((ssh_pos - slave) == 0) ) - ret = 1; - - return ret; -} - -static int -glusterd_gsync_get_slave_log_file (char *master, char *slave, char *log_file) -{ - int ret = -1; - runner_t runner = {0,}; - char uuid_str[64] = {0,}; - glusterd_conf_t *priv = NULL; - char *gl_workdir = NULL; - - GF_ASSERT(THIS); - GF_ASSERT(THIS->private); - - priv = THIS->private; - - GF_VALIDATE_OR_GOTO("gsyncd", master, out); - GF_VALIDATE_OR_GOTO("gsyncd", slave, out); - - gl_workdir = priv->workdir; - - /* get the session owner for the master-slave session */ - ret = glusterd_gsync_get_session_owner (master, slave, uuid_str, - gl_workdir); - if (ret) - goto out; - - /* get the log file for the slave */ - runinit(&runner); - runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL); - runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir); - runner_argprintf (&runner, "--session-owner=%s", uuid_str); - runner_add_args (&runner, slave, "--config-get", "log-file", NULL); - - ret = glusterd_query_extutil (log_file, &runner); - - out: - return ret; -} - -static int -gsyncd_getpidfile (char *master, char *slave, char *pidfile) +gsyncd_getpidfile (char *master, char *slave, char *pidfile, char *conf_path) { int ret = -1; glusterd_conf_t *priv = NULL; + char *confpath = NULL; + char conf_buf[PATH_MAX] = ""; + struct stat stbuf = {0,}; + GF_ASSERT (THIS); GF_ASSERT (THIS->private); @@ -563,8 +686,22 @@ gsyncd_getpidfile (char *master, char *slave, char *pidfile) GF_VALIDATE_OR_GOTO ("gsync", master, out); GF_VALIDATE_OR_GOTO ("gsync", slave, out); + ret = lstat (conf_path, &stbuf); + if (!ret) { + gf_log ("", GF_LOG_DEBUG, "Using passed config template(%s).", + conf_path); + confpath = conf_path; + } else { + ret = snprintf (conf_buf, sizeof(conf_buf) - 1, + "%s/"GSYNC_CONF_TEMPLATE, priv->workdir); + conf_buf[ret] = '\0'; + confpath = conf_buf; + gf_log ("", GF_LOG_DEBUG, "Using default config template(%s).", + confpath); + } + ret = glusterd_gsync_get_param_file (pidfile, "pid", master, - slave, priv->workdir); + slave, confpath); if (ret == -1) { ret = -2; gf_log ("", GF_LOG_WARNING, "failed to create the pidfile string"); @@ -578,32 +715,6 @@ gsyncd_getpidfile (char *master, char *slave, char *pidfile) } static int -glusterd_gsyncd_getlogfile (char *master, char *slave, char *log_file) -{ - int ret = -1; - glusterd_conf_t *priv = NULL; - - GF_ASSERT (THIS); - GF_ASSERT (THIS->private); - - priv = THIS->private; - - GF_VALIDATE_OR_GOTO ("gsync", master, out); - GF_VALIDATE_OR_GOTO ("gsync", slave, out); - - ret = glusterd_gsync_get_param_file (log_file, "log", master, - slave, priv->workdir); - if (ret == -1) { - ret = -2; - gf_log ("", GF_LOG_WARNING, "failed to gsyncd logfile"); - goto out; - } - - out: - return ret; -} - -static int gsync_status_byfd (int fd) { GF_ASSERT (fd >= -1); @@ -620,12 +731,12 @@ gsync_status_byfd (int fd) * return -1 when not running */ int -gsync_status (char *master, char *slave, int *status) +gsync_status (char *master, char *slave, char *conf_path, int *status) { char pidfile[PATH_MAX] = {0,}; int fd = -1; - fd = gsyncd_getpidfile (master, slave, pidfile); + fd = gsyncd_getpidfile (master, slave, pidfile, conf_path); if (fd == -2) return -1; @@ -662,16 +773,48 @@ out: } static int -gsync_verify_config_options (dict_t *dict, char **op_errstr) +glusterd_verify_gsyncd_spawn (char *master, char *slave) { - char **resopt = NULL; - int i = 0; - char *subop = NULL; - char *slave = NULL; - char *op_name = NULL; - char *op_value = NULL; - char *t = NULL; - gf_boolean_t banned = _gf_true; + int ret = 0; + runner_t runner = {0,}; + + runinit (&runner); + runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", + "--verify", "spawning", NULL); + runner_argprintf (&runner, ":%s", master); + runner_add_args (&runner, slave, NULL); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + ret = runner_start (&runner); + if (ret) { + gf_log ("", GF_LOG_ERROR, "spawning child failed"); + ret = -1; + goto out; + } + + if (runner_end (&runner) != 0) + ret = -1; + +out: + gf_log ("", GF_LOG_DEBUG, "returning %d", ret); + return ret; +} + +static int +gsync_verify_config_options (dict_t *dict, char **op_errstr, char *volname) +{ + char **resopt = NULL; + int i = 0; + int ret = -1; + char *subop = NULL; + char *slave = NULL; + char *op_name = NULL; + char *op_value = NULL; + char *t = NULL; + char errmsg[PATH_MAX] = ""; + gf_boolean_t banned = _gf_true; + gf_boolean_t op_match = _gf_true; + gf_boolean_t val_match = _gf_true; + struct gsync_config_opt_vals_ *conf_vals = NULL; if (dict_get_str (dict, "subop", &subop) != 0) { gf_log ("", GF_LOG_WARNING, "missing subop"); @@ -695,6 +838,12 @@ gsync_verify_config_options (dict_t *dict, char **op_errstr) } if (runcmd (GSYNCD_PREFIX"/gsyncd", "--config-check", op_name, NULL)) { + ret = glusterd_verify_gsyncd_spawn (volname, slave); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to spawn gsyncd"); + return 0; + } + gf_log ("", GF_LOG_WARNING, "Invalid option %s", op_name); *op_errstr = gf_strdup ("Invalid option"); @@ -739,35 +888,109 @@ gsync_verify_config_options (dict_t *dict, char **op_errstr) } } + /* Check options in gsync_confopt_vals for invalid values */ + for (conf_vals = gsync_confopt_vals; conf_vals->op_name; conf_vals++) { + op_match = _gf_true; + for (i = 0; conf_vals->op_name[i] && op_name[i]; i++) { + if (conf_vals->op_name[i] == op_name[i] || + (conf_vals->op_name[i] == '_' && op_name[i] == '-')) + continue; + op_match = _gf_false; + } + + if (op_match) { + val_match = _gf_false; + for (i = 0; i < conf_vals->no_of_pos_vals; i++) { + if(conf_vals->case_sensitive){ + if (!strcmp (conf_vals->values[i], op_value)) + val_match = _gf_true; + } else { + if (!strcasecmp (conf_vals->values[i], op_value)) + val_match = _gf_true; + } + } + + if (!val_match) { + ret = snprintf (errmsg, sizeof(errmsg) - 1, + "Invalid values (%s) for" + " option %s", op_value, + op_name); + errmsg[ret] = '\0'; + + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + *op_errstr = gf_strdup (errmsg); + return -1; + } + } + } + return 0; } static int glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo, - char *slave, dict_t *rsp_dict, char *node); + char *slave, char *conf_path, + dict_t *rsp_dict, char *node); static int _get_status_mst_slv (dict_t *this, char *key, data_t *value, void *data) { glusterd_gsync_status_temp_t *param = NULL; char *slave = NULL; - int ret = 0; + char *slave_buf = NULL; + char *slave_ip = NULL; + char *slave_vol = NULL; + char *errmsg = NULL; + char conf_path[PATH_MAX] = ""; + int ret = -1; + glusterd_conf_t *priv = NULL; param = (glusterd_gsync_status_temp_t *)data; GF_ASSERT (param); GF_ASSERT (param->volinfo); + if (THIS) + priv = THIS->private; + if (priv == NULL) { + gf_log ("", GF_LOG_ERROR, "priv of glusterd not present"); + goto out; + } + slave = strchr(value->data, ':'); - if (slave) - slave ++; - else + if (!slave) return 0; + slave++; + + ret = glusterd_get_slave_info (slave, &slave_ip, &slave_vol, &errmsg); + if (ret) { + if (errmsg) + gf_log ("", GF_LOG_ERROR, "Unable to fetch " + "slave details. Error: %s", errmsg); + else + gf_log ("", GF_LOG_ERROR, + "Unable to fetch slave details."); + ret = -1; + goto out; + } + + ret = snprintf (conf_path, sizeof(conf_path) - 1, + "%s/"GEOREP"/%s_%s_%s/gsyncd.conf", + priv->workdir, param->volinfo->volname, + slave_ip, slave_vol); + conf_path[ret] = '\0'; ret = glusterd_get_gsync_status_mst_slv(param->volinfo, - slave, param->rsp_dict, + slave, conf_path, + param->rsp_dict, param->node); - return 0; +out: + + if (slave_buf) + GF_FREE(slave_buf); + + gf_log ("", GF_LOG_DEBUG, "Returning %d.", ret); + return ret; } @@ -788,19 +1011,22 @@ static int glusterd_remove_slave_in_info (glusterd_volinfo_t *volinfo, char *slave, char **op_errstr) { + int zero_slave_entries = _gf_true; int ret = 0; char *slavekey = NULL; GF_ASSERT (volinfo); GF_ASSERT (slave); - ret = glusterd_get_slave (volinfo, slave, &slavekey); - if (ret < 0) { - ret++; - goto out; - } - - dict_del (volinfo->gsync_slaves, slavekey); + do { + ret = glusterd_get_slave (volinfo, slave, &slavekey); + if (ret < 0 && zero_slave_entries) { + ret++; + goto out; + } + zero_slave_entries = _gf_false; + dict_del (volinfo->gsync_slaves, slavekey); + } while (ret >= 0); ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); @@ -853,8 +1079,9 @@ glusterd_gsync_get_uuid (char *slave, glusterd_volinfo_t *vol, return ret; } -static int +int glusterd_check_gsync_running_local (char *master, char *slave, + char *conf_path, gf_boolean_t *is_run) { int ret = -1; @@ -865,7 +1092,7 @@ glusterd_check_gsync_running_local (char *master, char *slave, GF_ASSERT (is_run); *is_run = _gf_false; - ret = gsync_status (master, slave, &ret_status); + ret = gsync_status (master, slave, conf_path, &ret_status); if (ret == 0 && ret_status == 0) { *is_run = _gf_true; } else if (ret == -1) { @@ -882,7 +1109,8 @@ glusterd_check_gsync_running_local (char *master, char *slave, static int glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave, - char *host_uuid, char **op_errstr) + char *host_uuid, char **op_errstr, + gf_boolean_t is_force) { int ret = 0; int maxslv = 0; @@ -905,7 +1133,8 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave, case -1: break; default: - GF_ASSERT (ret > 0); + if (!is_force) + GF_ASSERT (ret > 0); ret = dict_get_str (volinfo->gsync_slaves, slavekey, &slaveentry); GF_ASSERT (ret == 0); @@ -914,13 +1143,23 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave, * assert an uuid mismatch */ t = strtail (slaveentry, host_uuid); - GF_ASSERT (!t || *t != ':'); + if (!is_force) + GF_ASSERT (!t || *t != ':'); + + if (is_force) { + gf_log ("", GF_LOG_DEBUG, GEOREP" has already been " + "invoked for the %s (master) and %s (slave)." + " Allowing without saving info again due to" + " force command.", volinfo->volname, slave); + ret = 0; + goto out; + } gf_log ("", GF_LOG_ERROR, GEOREP" has already been invoked for " "the %s (master) and %s (slave) " "from a different machine", volinfo->volname, slave); - *op_errstr = gf_strdup (GEOREP" already running in an an" + *op_errstr = gf_strdup (GEOREP" already running in " "another machine"); ret = -1; goto out; @@ -953,23 +1192,26 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave, return ret; } - static int glusterd_op_verify_gsync_start_options (glusterd_volinfo_t *volinfo, - char *slave, char **op_errstr) + char *slave, char *conf_path, + char *statefile, char **op_errstr, + gf_boolean_t is_force) { int ret = -1; gf_boolean_t is_running = _gf_false; char msg[2048] = {0}; uuid_t uuid = {0}; - glusterd_conf_t *priv = NULL; - xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + struct stat stbuf = {0,}; this = THIS; GF_ASSERT (volinfo); GF_ASSERT (slave); GF_ASSERT (op_errstr); + GF_ASSERT (conf_path); GF_ASSERT (this && this->private); priv = this->private; @@ -979,26 +1221,56 @@ glusterd_op_verify_gsync_start_options (glusterd_volinfo_t *volinfo, "before "GEOREP" start", volinfo->volname); goto out; } + + ret = lstat (statefile, &stbuf); + if (ret) { + snprintf (msg, sizeof (msg), "Session between %s and %s has" + " not been created. Please create session and retry.", + volinfo->volname, slave); + gf_log ("", GF_LOG_ERROR, "%s", msg); + *op_errstr = gf_strdup (msg); + goto out; + } + + /* Check if the gsync slave info is stored. If not + * session has not been created */ + ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); + if (ret) { + snprintf (msg, sizeof (msg), "Session between %s and %s has" + " not been created. Please create session and retry.", + volinfo->volname, slave); + gf_log ("", GF_LOG_ERROR, "%s", msg); + goto out; + } + + if (is_force) { + ret = 0; + goto out; + } + /*Check if the gsync is already started in cmd. inited host * If so initiate add it into the glusterd's priv*/ - ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); - if ((ret == 0) && (uuid_compare (MY_UUID, uuid) == 0)) { - ret = glusterd_check_gsync_running_local (volinfo->volname, - slave, &is_running); - if (ret) { - snprintf (msg, sizeof (msg), GEOREP" start option " - "validation failed "); - goto out; - } - if (_gf_true == is_running) { - snprintf (msg, sizeof (msg), GEOREP " session between" - " %s & %s already started", volinfo->volname, - slave); - ret = -1; - goto out; - } + ret = glusterd_check_gsync_running_local (volinfo->volname, + slave, conf_path, + &is_running); + if (ret) { + snprintf (msg, sizeof (msg), GEOREP" start option " + "validation failed "); + goto out; + } + if (_gf_true == is_running) { + snprintf (msg, sizeof (msg), GEOREP " session between" + " %s & %s already started", volinfo->volname, + slave); + ret = -1; + goto out; + } + + ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave); + if (ret) { + snprintf (msg, sizeof (msg), "Unable to spawn gsyncd"); + gf_log ("", GF_LOG_ERROR, "%s", msg); } - ret = 0; out: if (ret && (msg[0] != '\0')) { *op_errstr = gf_strdup (msg); @@ -1024,11 +1296,13 @@ glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag) static int glusterd_op_verify_gsync_running (glusterd_volinfo_t *volinfo, - char *slave, char **op_errstr) + char *slave, char *conf_path, + char **op_errstr) { - int ret = -1; - char msg[2048] = {0}; - uuid_t uuid = {0}; + int pfd = -1; + int ret = -1; + char msg[2048] = {0}; + char pidfile[PATH_MAX] = {0,}; GF_ASSERT (THIS && THIS->private); GF_ASSERT (volinfo); @@ -1041,13 +1315,26 @@ glusterd_op_verify_gsync_running (glusterd_volinfo_t *volinfo, goto out; } - ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); - if (ret == -1) { - snprintf (msg, sizeof (msg), GEOREP" session between %s & %s" - " not active", volinfo->volname, slave); + + pfd = gsyncd_getpidfile (volinfo->volname, slave, pidfile, conf_path); + if (pfd == -2) { + gf_log ("", GF_LOG_ERROR, GEOREP" stop validation " + "failed for %s & %s", volinfo->volname, slave); + ret = -1; + goto out; + } + if (gsync_status_byfd (pfd) == -1) { + snprintf (msg, sizeof (msg), GEOREP" session b/w %s & %s is not" + " running on this node.", volinfo->volname, slave); + gf_log ("", GF_LOG_ERROR, "%s", msg); + ret = -1; + /* monitor gsyncd already dead */ goto out; } + if (pfd < 0) + goto out; + ret = 0; out: if (ret && (msg[0] != '\0')) { @@ -1066,6 +1353,18 @@ glusterd_verify_gsync_status_opts (dict_t *dict, char **op_errstr) gf_boolean_t exists = _gf_false; glusterd_volinfo_t *volinfo = NULL; int ret = 0; + char *conf_path = NULL; + char *slave_ip = NULL; + char *slave_vol = NULL; + glusterd_conf_t *priv = NULL; + + if (THIS) + priv = THIS->private; + if (priv == NULL) { + gf_log ("", GF_LOG_ERROR, "priv of glusterd not present"); + *op_errstr = gf_strdup ("glusterd defunct"); + goto out; + } ret = dict_get_str (dict, "master", &volname); if (ret < 0) { @@ -1090,16 +1389,25 @@ glusterd_verify_gsync_status_opts (dict_t *dict, char **op_errstr) goto out; } - out: + ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip, + &slave_vol, &conf_path, + op_errstr); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch slave or confpath details."); + ret = -1; + goto out; + } + +out: gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; - } int glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr, - char **master, char **slave) + char **master, char **slave, char **host_uuid) { int ret = -1; @@ -1124,6 +1432,14 @@ glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr, } } + if (host_uuid) { + ret = dict_get_str (dict, "host-uuid", host_uuid); + if (ret < 0) { + gf_log ("", GF_LOG_WARNING, "host_uuid not found"); + *op_errstr = gf_strdup ("host_uuid not found"); + goto out; + } + } ret = 0; out: @@ -1132,17 +1448,647 @@ out: } int +glusterd_op_stage_sys_exec (dict_t *dict, char **op_errstr) +{ + char errmsg[PATH_MAX] = ""; + char *command = NULL; + char command_path[PATH_MAX] = ""; + struct stat st = {0,}; + int ret = -1; + glusterd_conf_t *conf = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + conf = this->private; + GF_ASSERT (conf); + + if (conf->op_version < 2) { + gf_log ("", GF_LOG_ERROR, "Op Version not supported."); + snprintf (errmsg, sizeof(errmsg), "One or more nodes do not" + " support the required op version."); + *op_errstr = gf_strdup (errmsg); + ret = -1; + goto out; + } + + ret = dict_get_str (dict, "command", &command); + if (ret) { + strcpy (errmsg, "internal error"); + gf_log ("", GF_LOG_ERROR, + "Unable to get command from dict"); + goto out; + } + + /* enforce local occurrence of the command */ + if (strchr (command, '/')) { + strcpy (errmsg, "invalid command name"); + ret = -1; + goto out; + } + + sprintf (command_path, GSYNCD_PREFIX"/peer_%s", command); + /* check if it's executable */ + ret = access (command_path, X_OK); + if (!ret) + /* check if it's a regular file */ + ret = stat (command_path, &st); + if (!ret && !S_ISREG (st.st_mode)) + ret = -1; + +out: + if (ret) { + if (errmsg[0] == '\0') + snprintf (errmsg, sizeof (errmsg), "%s not found.", + command); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + } + + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int +glusterd_op_stage_copy_file (dict_t *dict, char **op_errstr) +{ + char abs_filename[PATH_MAX] = ""; + char errmsg[PATH_MAX] = ""; + char *filename = NULL; + char *host_uuid = NULL; + char uuid_str [64] = {0}; + int ret = -1; + glusterd_conf_t *priv = NULL; + struct stat stbuf = {0,}; + + if (THIS) + priv = THIS->private; + if (priv == NULL) { + gf_log ("", GF_LOG_ERROR, "priv of glusterd not present"); + *op_errstr = gf_strdup ("glusterd defunct"); + goto out; + } + + if (priv->op_version < 2) { + gf_log ("", GF_LOG_ERROR, "Op Version not supported."); + snprintf (errmsg, sizeof(errmsg), "One or more nodes do not" + " support the required op version."); + *op_errstr = gf_strdup (errmsg); + ret = -1; + goto out; + } + + ret = dict_get_str (dict, "host-uuid", &host_uuid); + if (ret < 0) { + gf_log ("", GF_LOG_ERROR, "Unable to fetch" + " host-uuid from dict."); + goto out; + } + + uuid_utoa_r (MY_UUID, uuid_str); + if (!strcmp (uuid_str, host_uuid)) { + ret = dict_get_str (dict, "source", &filename); + if (ret < 0) { + gf_log ("", GF_LOG_ERROR, "Unable to fetch" + " filename from dict."); + *op_errstr = gf_strdup ("command unsuccessful"); + goto out; + } + snprintf (abs_filename, sizeof(abs_filename), + "%s/%s", priv->workdir, filename); + + ret = lstat (abs_filename, &stbuf); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Source file" + " does not exist in %s", priv->workdir); + *op_errstr = gf_strdup (errmsg); + goto out; + } + + if (!S_ISREG(stbuf.st_mode)) { + snprintf (errmsg, sizeof (errmsg), "Source file" + " is not a regular file."); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } + } + + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +static int +glusterd_get_statefile_name (glusterd_volinfo_t *volinfo, char *slave, + char *conf_path, char **statefile) +{ + glusterd_conf_t *priv = NULL; + int ret = -1; + char *master = NULL; + char *buf = NULL; + dict_t *confd = NULL; + char *confpath = NULL; + char conf_buf[PATH_MAX] = ""; + struct stat stbuf = {0,}; + + GF_ASSERT (THIS); + GF_ASSERT (THIS->private); + GF_ASSERT (volinfo); + + master = volinfo->volname; + + confd = dict_new (); + if (!confd) { + gf_log ("", GF_LOG_ERROR, "Unable to create new dict"); + goto out; + } + + priv = THIS->private; + + ret = lstat (conf_path, &stbuf); + if (!ret) { + gf_log ("", GF_LOG_INFO, "Using passed config template(%s).", + conf_path); + confpath = conf_path; + } else { + ret = snprintf (conf_buf, sizeof(conf_buf) - 1, + "%s/"GSYNC_CONF_TEMPLATE, priv->workdir); + conf_buf[ret] = '\0'; + confpath = conf_buf; + gf_log ("", GF_LOG_INFO, "Using default config template(%s).", + confpath); + } + + ret = glusterd_gsync_get_config (master, slave, confpath, + confd); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get configuration data" + "for %s(master), %s(slave)", master, slave); + goto out; + + } + + ret = dict_get_param (confd, "state_file", &buf); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get state_file's name."); + goto out; + } + + *statefile = gf_strdup(buf); + if (!*statefile) { + gf_log ("", GF_LOG_ERROR, "Unable to gf_strdup."); + ret = -1; + goto out; + } + + ret = 0; + out: + if (confd) + dict_destroy (confd); + + gf_log ("", GF_LOG_DEBUG, "Returning %d ", ret); + return ret; +} + +static int +glusterd_create_status_file (char *master, char *slave, char *slave_ip, + char *slave_vol, char *status) +{ + int ret = -1; + runner_t runner = {0,}; + glusterd_conf_t *priv = NULL; + + if (THIS) + priv = THIS->private; + if (priv == NULL) { + gf_log ("", GF_LOG_ERROR, "priv of glusterd not present"); + goto out; + } + + if (!status) { + gf_log ("", GF_LOG_ERROR, "Status Empty"); + goto out; + } + gf_log ("", GF_LOG_DEBUG, "slave = %s", slave); + + runinit (&runner); + runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--create", + status, "-c", NULL); + runner_argprintf (&runner, "%s/"GEOREP"/%s_%s_%s/gsyncd.conf", + priv->workdir, master, slave_ip, slave_vol); + runner_argprintf (&runner, ":%s", master); + runner_add_args (&runner, slave, NULL); + synclock_unlock (&priv->big_lock); + ret = runner_run (&runner); + synclock_lock (&priv->big_lock); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Creating status file failed."); + ret = -1; + goto out; + } + + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "returning %d", ret); + return ret; +} + +static int +glusterd_verify_slave (char *volname, char *slave_ip, char *slave, + char **op_errstr, gf_boolean_t *is_force_blocker) +{ + int32_t ret = -1; + runner_t runner = {0,}; + char log_file_path[PATH_MAX] = ""; + char buf[PATH_MAX] = ""; + char *tmp = NULL; + char *save_ptr = NULL; + glusterd_conf_t *priv = NULL; + + GF_ASSERT (volname); + GF_ASSERT (slave_ip); + GF_ASSERT (slave); + + if (THIS) + priv = THIS->private; + if (priv == NULL) { + gf_log ("", GF_LOG_ERROR, "priv of glusterd not present"); + goto out; + } + + snprintf (log_file_path, sizeof(log_file_path), + DEFAULT_LOG_FILE_DIRECTORY"/create_verify_log"); + + runinit (&runner); + runner_add_args (&runner, GSYNCD_PREFIX"/gverify.sh", NULL); + runner_argprintf (&runner, "%s", volname); + runner_argprintf (&runner, "%s", slave_ip); + runner_argprintf (&runner, "%s", slave); + runner_argprintf (&runner, "%s", log_file_path); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + synclock_unlock (&priv->big_lock); + ret = runner_run (&runner); + synclock_lock (&priv->big_lock); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Not a valid slave"); + ret = glusterd_gsync_read_frm_status (log_file_path, + buf, sizeof(buf)); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to read from %s", + log_file_path); + goto out; + } + + /* Tokenize the error message from gverify.sh to figure out + * if the error is a force blocker or not. */ + tmp = strtok_r (buf, "|", &save_ptr); + if (!strcmp (tmp, "FORCE_BLOCKER")) + *is_force_blocker = 1; + else { + /* No FORCE_BLOCKER flag present so all that is + * present is the error message. */ + *is_force_blocker = 0; + if (tmp) + *op_errstr = gf_strdup (tmp); + ret = -1; + goto out; + } + + /* Copy rest of the error message to op_errstr */ + tmp = strtok_r (NULL, "|", &save_ptr); + if (tmp) + *op_errstr = gf_strdup (tmp); + ret = -1; + goto out; + } + ret = 0; +out: + unlink (log_file_path); + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int +glusterd_mountbroker_check (char **slave_ip, char **op_errstr) +{ + int ret = -1; + char *tmp = NULL; + char *save_ptr = NULL; + char *username = NULL; + char *host = NULL; + char errmsg[PATH_MAX] = ""; + + GF_ASSERT (slave_ip); + GF_ASSERT (*slave_ip); + + /* Checking if hostname has user specified */ + host = strstr (*slave_ip, "@"); + if (!host) { + gf_log ("", GF_LOG_DEBUG, "No username provided."); + ret = 0; + goto out; + } else { + /* Moving the host past the '@' and checking if the + * actual hostname also has '@' */ + host++; + if (strstr (host, "@")) { + gf_log ("", GF_LOG_DEBUG, "host = %s", host); + ret = snprintf (errmsg, sizeof(errmsg) - 1, + "Invalid Hostname (%s).", host); + errmsg[ret] = '\0'; + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + if (op_errstr) + *op_errstr = gf_strdup (errmsg); + goto out; + } + + /* Fetching the username and hostname + * and checking if the username is non-root */ + username = strtok_r (*slave_ip, "@", &save_ptr); + tmp = strtok_r (NULL, "@", &save_ptr); + if (strcmp (username, "root")) { + ret = snprintf (errmsg, sizeof(errmsg) - 1, + "Non-root username (%s@%s) not allowed.", + username, tmp); + errmsg[ret] = '\0'; + if (op_errstr) + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, + "Non-Root username not allowed."); + ret = -1; + goto out; + } + + *slave_ip = gf_strdup (tmp); + if (!*slave_ip) { + gf_log ("", GF_LOG_ERROR, "Out of memory"); + ret = -1; + goto out; + } + } + + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int +glusterd_op_stage_gsync_create (dict_t *dict, char **op_errstr) +{ + char *down_peerstr = NULL; + char *slave = NULL; + char *volname = NULL; + char *host_uuid = NULL; + char *statefile = NULL; + char *slave_ip = NULL; + char *slave_vol = NULL; + char *conf_path = NULL; + char errmsg[PATH_MAX] = ""; + char common_pem_file[PATH_MAX] = ""; + char hook_script[PATH_MAX] = ""; + char uuid_str [64] = ""; + int ret = -1; + int is_pem_push = -1; + gf_boolean_t is_force = -1; + gf_boolean_t is_force_blocker = -1; + gf_boolean_t exists = _gf_false; + glusterd_conf_t *conf = NULL; + glusterd_volinfo_t *volinfo = NULL; + struct stat stbuf = {0,}; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + conf = this->private; + GF_ASSERT (conf); + + ret = glusterd_op_gsync_args_get (dict, op_errstr, &volname, + &slave, &host_uuid); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to fetch arguments"); + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return -1; + } + + if (conf->op_version < 2) { + gf_log ("", GF_LOG_ERROR, "Op Version not supported."); + snprintf (errmsg, sizeof(errmsg), "One or more nodes do not" + " support the required op version."); + *op_errstr = gf_strdup (errmsg); + ret = -1; + goto out; + } + + exists = glusterd_check_volume_exists (volname); + ret = glusterd_volinfo_find (volname, &volinfo); + if ((ret) || (!exists)) { + gf_log ("", GF_LOG_WARNING, "volume name does not exist"); + snprintf (errmsg, sizeof(errmsg), "Volume name %s does not" + " exist", volname); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return -1; + } + + ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip, + &slave_vol, &conf_path, + op_errstr); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch slave or confpath details."); + ret = -1; + goto out; + } + + is_force = dict_get_str_boolean (dict, "force", _gf_false); + + uuid_utoa_r (MY_UUID, uuid_str); + if (!strcmp (uuid_str, host_uuid)) { + ret = glusterd_are_vol_all_peers_up (volinfo, + &conf->peers, + &down_peerstr); + if ((ret == _gf_false) && !is_force) { + snprintf (errmsg, sizeof (errmsg), "Peer %s," + " which is a part of %s volume, is" + " down. Please bring up the peer and" + " retry.", down_peerstr, + volinfo->volname); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + *op_errstr = gf_strdup (errmsg); + GF_FREE (down_peerstr); + down_peerstr = NULL; + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return -1; + } else if (ret == _gf_false) { + gf_log ("", GF_LOG_INFO, "Peer %s," + " which is a part of %s volume, is" + " down. Force creating geo-rep session." + " On bringing up the peer, re-run" + " \"gluster system:: execute" + " gsec_create\" and \"gluster volume" + " geo-replication %s %s create push-pem" + " force\"", down_peerstr, volinfo->volname, + volinfo->volname, slave); + } + + /* Checking if slave host is pingable, has proper passwordless + * ssh login setup, slave volume is created, slave vol is empty, + * and if it has enough memory and bypass in case of force if + * the error is not a force blocker */ + ret = glusterd_verify_slave (volname, slave_ip, slave_vol, + op_errstr, &is_force_blocker); + if (ret) { + if (is_force && !is_force_blocker) { + gf_log ("", GF_LOG_INFO, "%s is not a valid slave" + " volume. Error: %s. Force creating geo-rep" + " session.", slave, *op_errstr); + } else { + gf_log ("", GF_LOG_ERROR, + "%s is not a valid slave volume. Error: %s", + slave, *op_errstr); + ret = -1; + goto out; + } + } + + ret = dict_get_int32 (dict, "push_pem", &is_pem_push); + if (!ret && is_pem_push) { + ret = snprintf (common_pem_file, + sizeof(common_pem_file) - 1, + "%s"GLUSTERD_COMMON_PEM_PUB_FILE, + conf->workdir); + common_pem_file[ret] = '\0'; + + ret = snprintf (hook_script, sizeof(hook_script) - 1, + "%s"GLUSTERD_CREATE_HOOK_SCRIPT, + conf->workdir); + hook_script[ret] = '\0'; + + ret = lstat (common_pem_file, &stbuf); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "%s" + " required for push-pem is" + " not present. Please run" + " \"gluster system:: execute" + " gsec_create\"", common_pem_file); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + *op_errstr = gf_strdup (errmsg); + ret = -1; + goto out; + } + + ret = lstat (hook_script, &stbuf); + if (ret) { + snprintf (errmsg, sizeof (errmsg), + "The hook-script (%s) required " + "for push-pem is not present. " + "Please install the hook-script " + "and retry", hook_script); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + *op_errstr = gf_strdup (errmsg); + ret = -1; + goto out; + } + + if (!S_ISREG(stbuf.st_mode)) { + snprintf (errmsg, sizeof (errmsg), "%s" + " required for push-pem is" + " not a regular file. Please run" + " \"gluster system:: execute" + " gsec_create\"", common_pem_file); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } + } + } + + ret = glusterd_get_statefile_name (volinfo, slave, conf_path, &statefile); + if (ret) { + if (!strstr(slave, "::")) + snprintf (errmsg, sizeof (errmsg), + "%s is not a valid slave url.", slave); + else + snprintf (errmsg, sizeof (errmsg), "Please check gsync " + "config file. Unable to get statefile's name"); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } + + ret = dict_set_str (dict, "statefile", statefile); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to store statefile path"); + goto out; + } + + ret = lstat (statefile, &stbuf); + if (!ret && !is_force) { + snprintf (errmsg, sizeof (errmsg), "Session between %s" + " and %s is already created.", + volinfo->volname, slave); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } else if (!ret) + gf_log ("", GF_LOG_INFO, "Session between %s" + " and %s is already created. Force" + " creating again.", volinfo->volname, slave); + + ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to spawn gsyncd."); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } + + ret = 0; +out: + + if (ret && errmsg[0] != '\0') + *op_errstr = gf_strdup (errmsg); + + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr) { int ret = 0; int type = 0; char *volname = NULL; char *slave = NULL; + char *slave_ip = NULL; + char *slave_vol = NULL; + char *down_peerstr = NULL; + char *statefile = NULL; + char *path_list = NULL; + char *conf_path = NULL; gf_boolean_t exists = _gf_false; glusterd_volinfo_t *volinfo = NULL; char errmsg[PATH_MAX] = {0,}; dict_t *ctx = NULL; + gf_boolean_t is_force = 0; + gf_boolean_t is_force_blocker = -1; + gf_boolean_t is_running = _gf_false; + uuid_t uuid = {0}; + char uuid_str [64] = {0}; + char *host_uuid = NULL; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; + struct stat stbuf = {0,}; + this = THIS; + GF_ASSERT (this); + conf = this->private; + GF_ASSERT (conf); ret = dict_get_int32 (dict, "type", &type); if (ret < 0) { @@ -1151,25 +2097,26 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr) goto out; } - switch (type) { - case GF_GSYNC_OPTION_TYPE_STATUS: + if (type == GF_GSYNC_OPTION_TYPE_STATUS) { ret = glusterd_verify_gsync_status_opts (dict, op_errstr); - goto out; - case GF_GSYNC_OPTION_TYPE_CONFIG: - ret = gsync_verify_config_options (dict, op_errstr); + } + ret = glusterd_op_gsync_args_get (dict, op_errstr, + &volname, &slave, &host_uuid); + if (ret) goto out; - case GF_GSYNC_OPTION_TYPE_ROTATE: - /* checks same as status mode */ - ret = glusterd_verify_gsync_status_opts(dict, op_errstr); - goto out; - } + uuid_utoa_r (MY_UUID, uuid_str); - ret = glusterd_op_gsync_args_get (dict, op_errstr, &volname, &slave); - if (ret) + if (conf->op_version < 2) { + gf_log ("", GF_LOG_ERROR, "Op Version not supported."); + snprintf (errmsg, sizeof(errmsg), "One or more nodes do not" + " support the required op version."); + *op_errstr = gf_strdup (errmsg); + ret = -1; goto out; + } exists = glusterd_check_volume_exists (volname); ret = glusterd_volinfo_find (volname, &volinfo); @@ -1182,12 +2129,96 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr) goto out; } + ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip, + &slave_vol, &conf_path, + op_errstr); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch slave or confpath details."); + ret = -1; + goto out; + } + + ret = glusterd_get_statefile_name (volinfo, slave, conf_path, &statefile); + if (ret) { + /* Checking if slave host is pingable, has proper passwordless + * ssh login setup */ + ret = glusterd_verify_slave (volname, slave_ip, slave_vol, + op_errstr, &is_force_blocker); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "%s is not a valid slave volume. Error: %s", + slave, *op_errstr); + goto out; + } + + if (!strstr(slave, "::")) + snprintf (errmsg, sizeof (errmsg), + "%s is not a valid slave url.", slave); + else + snprintf (errmsg, sizeof (errmsg), + "Unable to get statefile's name"); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } + + ret = dict_set_str (dict, "statefile", statefile); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to store statefile path"); + goto out; + } + + is_force = dict_get_str_boolean (dict, "force", _gf_false); + + /* Allowing stop force to bypass the statefile check + * as this command acts as a fail safe method to stop geo-rep + * session. */ + if ((type == GF_GSYNC_OPTION_TYPE_CONFIG) || + ((type == GF_GSYNC_OPTION_TYPE_STOP) && !is_force) || + (type == GF_GSYNC_OPTION_TYPE_DELETE)) { + ret = lstat (statefile, &stbuf); + if (ret) { + snprintf (errmsg, sizeof(errmsg), "Geo-replication" + " session between %s and %s does not exist.", + volinfo->volname, slave); + gf_log ("", GF_LOG_ERROR, "%s. statefile = %s", + errmsg, statefile); + *op_errstr = gf_strdup (errmsg); + ret = -1; + goto out; + } + } + + /* Check if all peers that are a part of the volume are up or not */ + if ((type == GF_GSYNC_OPTION_TYPE_DELETE) || + ((type == GF_GSYNC_OPTION_TYPE_STOP) && !is_force)) { + if (!strcmp (uuid_str, host_uuid)) { + ret = glusterd_are_vol_all_peers_up (volinfo, + &conf->peers, + &down_peerstr); + if (ret == _gf_false) { + snprintf (errmsg, sizeof (errmsg), "Peer %s," + " which is a part of %s volume, is" + " down. Please bring up the peer and" + " retry.", down_peerstr, + volinfo->volname); + *op_errstr = gf_strdup (errmsg); + ret = -1; + GF_FREE (down_peerstr); + down_peerstr = NULL; + goto out; + } + } + } + switch (type) { case GF_GSYNC_OPTION_TYPE_START: /* don't attempt to start gsync if replace-brick is * in progress */ if (glusterd_is_rb_ongoing (volinfo)) { - snprintf (errmsg, sizeof(errmsg),"replace-brick is in" + snprintf (errmsg, sizeof(errmsg), "replace-brick is in" " progress, not starting geo-replication"); *op_errstr = gf_strdup (errmsg); ret = -1; @@ -1195,16 +2226,18 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr) } ret = glusterd_op_verify_gsync_start_options (volinfo, slave, - op_errstr); + conf_path, statefile, + op_errstr, is_force); if (ret) goto out; ctx = glusterd_op_get_ctx(); if (ctx) { - /*gsyncd does a fuse mount to start the geo-rep session*/ + /* gsyncd does a fuse mount to start + * the geo-rep session */ if (!glusterd_is_fuse_available ()) { - gf_log ("glusterd", GF_LOG_ERROR, "Unable to open" - " /dev/fuse (%s), geo-replication start" - " failed", strerror (errno)); + gf_log ("glusterd", GF_LOG_ERROR, "Unable to " + "open /dev/fuse (%s), geo-replication " + "start failed", strerror (errno)); snprintf (errmsg, sizeof(errmsg), "fuse unvailable"); *op_errstr = gf_strdup (errmsg); @@ -1215,17 +2248,72 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr) break; case GF_GSYNC_OPTION_TYPE_STOP: - ret = glusterd_op_verify_gsync_running (volinfo, slave, - op_errstr); + if (!is_force) { + ret = glusterd_op_verify_gsync_running (volinfo, slave, + conf_path, + op_errstr); + if (ret) { + ret = glusterd_get_local_brickpaths (volinfo, + &path_list); + if (path_list) + ret = -1; + } + } + break; + + case GF_GSYNC_OPTION_TYPE_CONFIG: + ret = gsync_verify_config_options (dict, op_errstr, volname); + goto out; + break; + + case GF_GSYNC_OPTION_TYPE_DELETE: + /* Check if the gsync session is still running + * If so ask the user to stop geo-replication first.*/ + ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); + if (ret) { + snprintf (errmsg, sizeof(errmsg), "Geo-replication" + " session between %s and %s does not exist.", + volinfo->volname, slave); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + *op_errstr = gf_strdup (errmsg); + ret = -1; + goto out; + } else { + ret = glusterd_check_gsync_running_local (volinfo->volname, + slave, conf_path, + &is_running); + if (_gf_true == is_running) { + snprintf (errmsg, sizeof (errmsg), GEOREP + " session between %s & %s is " + "still active. Please stop the " + "session and retry.", + volinfo->volname, slave); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + *op_errstr = gf_strdup (errmsg); + ret = -1; + goto out; + } + } + + ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave); + if (ret) { + snprintf (errmsg, sizeof (errmsg), + "Unable to spawn gsyncd"); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + } + break; } out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; } static int -stop_gsync (char *master, char *slave, char **msg) +stop_gsync (char *master, char *slave, char **msg, + char *conf_path, gf_boolean_t is_force) { int32_t ret = 0; int pfd = -1; @@ -1237,19 +2325,16 @@ stop_gsync (char *master, char *slave, char **msg) GF_ASSERT (THIS); GF_ASSERT (THIS->private); - pfd = gsyncd_getpidfile (master, slave, pidfile); - if (pfd == -2) { + pfd = gsyncd_getpidfile (master, slave, pidfile, conf_path); + if (pfd == -2 && !is_force) { gf_log ("", GF_LOG_ERROR, GEOREP" stop validation " " failed for %s & %s", master, slave); ret = -1; goto out; } - if (gsync_status_byfd (pfd) == -1) { + if (gsync_status_byfd (pfd) == -1 && !is_force) { gf_log ("", GF_LOG_ERROR, "gsyncd b/w %s & %s is not" " running", master, slave); - if (msg) - *msg = gf_strdup ("Warning: "GEOREP" session was " - "defunct at stop time"); /* monitor gsyncd already dead */ goto out; } @@ -1284,16 +2369,16 @@ stop_gsync (char *master, char *slave, char **msg) out: sys_close (pfd); + + if (is_force) + ret = 0; return ret; } static int -glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave, - dict_t *resp_dict); - -static int glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave, - dict_t *dict, dict_t *resp_dict, char **op_errstr) + char *path_list, dict_t *dict, + dict_t *resp_dict, char **op_errstr) { int32_t ret = -1; char *op_name = NULL; @@ -1302,6 +2387,10 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave, glusterd_conf_t *priv = NULL; char *subop = NULL; char *master = NULL; + char *conf_path = NULL; + char *slave_ip = NULL; + char *slave_vol = NULL; + struct stat stbuf = {0, }; GF_ASSERT (slave); GF_ASSERT (op_errstr); @@ -1336,10 +2425,17 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave, goto out; } + ret = dict_get_str (dict, "conf_path", &conf_path); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch conf file path."); + goto out; + } + master = ""; runinit (&runner); runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL); - runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir); + runner_argprintf (&runner, "%s", conf_path); if (volinfo) { master = volinfo->volname; runner_argprintf (&runner, ":%s", master); @@ -1362,13 +2458,46 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave, goto out; } + + if (!strcmp (op_name, "state_file")) { + + ret = lstat (op_value, &stbuf); + if (ret) { + ret = dict_get_str (dict, "slave_ip", &slave_ip); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch slave IP."); + goto out; + } + + ret = dict_get_str (dict, "slave_vol", &slave_vol); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch slave volume name."); + goto out; + } + + ret = glusterd_create_status_file (volinfo->volname, slave, + slave_ip, slave_vol, + "Switching Status File"); + if (ret || lstat (op_value, &stbuf)) { + gf_log ("", GF_LOG_ERROR, "Unable to create %s" + ". Error : %s", op_value, + strerror (errno)); + ret = -1; + goto out; + } + } + } + ret = 0; gf_asprintf (op_errstr, "config-%s successful", subop); out: if (!ret && volinfo) { ret = glusterd_check_restart_gsync_session (volinfo, slave, - resp_dict); + resp_dict, path_list, + conf_path, 0); if (ret) *op_errstr = gf_strdup ("internal error"); } @@ -1494,13 +2623,14 @@ dict_get_param (dict_t *dict, char *key, char **param) } static int -glusterd_read_status_file (char *master, char *slave, - dict_t *dict, char *node) +glusterd_read_status_file (glusterd_volinfo_t *volinfo, char *slave, + char *conf_path, dict_t *dict, char *node) { glusterd_conf_t *priv = NULL; int ret = 0; char *statefile = NULL; - char buf[1024] = {0, }; + char *master = NULL; + char buf[1024] = "defunct"; char nds[1024] = {0, }; char mst[1024] = {0, }; char slv[1024] = {0, }; @@ -1510,34 +2640,38 @@ glusterd_read_status_file (char *master, char *slave, int gsync_count = 0; int status = 0; char *dyn_node = NULL; + char *path_list = NULL; GF_ASSERT (THIS); GF_ASSERT (THIS->private); + GF_ASSERT (volinfo); + + master = volinfo->volname; confd = dict_new (); - if (!dict) + if (!dict) { + gf_log ("", GF_LOG_ERROR, "Not able to create dict."); return -1; + } priv = THIS->private; - ret = glusterd_gsync_get_config (master, slave, priv->workdir, + + ret = glusterd_gsync_get_config (master, slave, conf_path, confd); if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to get configuration data" "for %s(master), %s(slave)", master, slave); - goto out; + goto done; } - ret = gsync_status (master, slave, &status); - if (ret == 0 && status == -1) { - strncpy (buf, "defunct", sizeof (buf)); - goto done; - } else if (ret == -1) - goto out; - ret = dict_get_param (confd, "state_file", &statefile); - if (ret) - goto out; + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get state_file's name " + "for %s(master), %s(slave). Please check gsync " + "config file.", master, slave); + goto done; + } ret = glusterd_gsync_read_frm_status (statefile, buf, sizeof (buf)); if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to read the status" @@ -1545,12 +2679,27 @@ glusterd_read_status_file (char *master, char *slave, strncpy (buf, "defunct", sizeof (buf)); goto done; } - if (strcmp (buf, "OK") != 0) + + ret = gsync_status (master, slave, conf_path, &status); + if (ret == 0 && status == -1) { + if ((strcmp (buf, "Not Started")) && + (strcmp (buf, "Stopped"))) + strncpy (buf, "defunct", sizeof (buf)); + goto done; + } else if (ret == -1) { + gf_log ("", GF_LOG_ERROR, "Unable to get gsync status"); + goto done; + } + + if (strcmp (buf, "Stable") != 0) goto done; ret = dict_get_param (confd, "state_socket_unencoded", &statefile); - if (ret) - goto out; + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get state_socket_unencoded" + " filepath. Please check gsync config file."); + goto done; + } ret = glusterd_gsync_fetch_status_extra (statefile, buf, sizeof (buf)); if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to fetch extra status" @@ -1571,6 +2720,19 @@ glusterd_read_status_file (char *master, char *slave, } done: + if ((!strcmp (buf, "defunct")) || + (!strcmp (buf, "Not Started")) || + (!strcmp (buf, "Stopped"))) { + ret = glusterd_get_local_brickpaths (volinfo, &path_list); + if (!path_list) { + gf_log ("", GF_LOG_DEBUG, "This node not being part of" + " volume should not be running gsyncd. Hence" + " shouldn't display status for this node."); + ret = 0; + goto out; + } + } + ret = dict_get_int32 (dict, "gsync-count", &gsync_count); if (ret) @@ -1621,23 +2783,22 @@ glusterd_read_status_file (char *master, char *slave, if (ret) goto out; - ret = 0; out: dict_destroy (confd); - gf_log ("", GF_LOG_DEBUG, "Returning %d ", ret); - return ret; + return 0; } -static int +int glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave, - dict_t *resp_dict) + dict_t *resp_dict, char *path_list, + char *conf_path, gf_boolean_t is_force) { int ret = 0; - uuid_t uuid = {0, }; glusterd_conf_t *priv = NULL; char *status_msg = NULL; + gf_boolean_t is_running = _gf_false; GF_ASSERT (volinfo); GF_ASSERT (slave); @@ -1646,18 +2807,22 @@ glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave, priv = THIS->private; - if (glusterd_gsync_get_uuid (slave, volinfo, uuid)) - /* session does not exist, nothing to do */ + ret = glusterd_check_gsync_running_local (volinfo->volname, + slave, conf_path, + &is_running); + if (!ret && (_gf_true != is_running)) + /* gsynd not running, nothing to do */ goto out; - if (uuid_compare (MY_UUID, uuid) == 0) { - ret = stop_gsync (volinfo->volname, slave, &status_msg); - if (ret == 0 && status_msg) - ret = dict_set_str (resp_dict, "gsync-status", - status_msg); - if (ret == 0) - ret = glusterd_start_gsync (volinfo, slave, - uuid_utoa(MY_UUID), NULL); - } + + ret = stop_gsync (volinfo->volname, slave, &status_msg, + conf_path, is_force); + if (ret == 0 && status_msg) + ret = dict_set_str (resp_dict, "gsync-status", + status_msg); + if (ret == 0) + ret = glusterd_start_gsync (volinfo, slave, path_list, + conf_path, uuid_utoa(MY_UUID), + NULL); out: gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); @@ -1665,7 +2830,7 @@ glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave, } static int32_t -glusterd_marker_create_volfile (glusterd_volinfo_t *volinfo) +glusterd_marker_changelog_create_volfile (glusterd_volinfo_t *volinfo) { int32_t ret = 0; @@ -1689,58 +2854,82 @@ out: } static int -glusterd_set_marker_gsync (glusterd_volinfo_t *volinfo) +glusterd_set_gsync_knob (glusterd_volinfo_t *volinfo, char *key, int *vc) { - int ret = -1; - int marker_set = _gf_false; - char *gsync_status = NULL; + int ret = -1; + int conf_enabled = _gf_false; + char *knob_on = NULL; GF_ASSERT (THIS); GF_ASSERT (THIS->private); - marker_set = glusterd_volinfo_get_boolean (volinfo, VKEY_MARKER_XTIME); - if (marker_set == -1) { - gf_log ("", GF_LOG_ERROR, "failed to get the marker status"); - ret = -1; + conf_enabled = glusterd_volinfo_get_boolean (volinfo, key); + if (conf_enabled == -1) { + gf_log ("", GF_LOG_ERROR, + "failed to get key %s from volinfo", key); goto out; } - if (marker_set == _gf_false) { - gsync_status = gf_strdup ("on"); - if (gsync_status == NULL) { + ret = 0; + if (conf_enabled == _gf_false) { + *vc = 1; + knob_on = gf_strdup ("on"); + if (knob_on == NULL) { ret = -1; goto out; } ret = glusterd_gsync_volinfo_dict_set (volinfo, - VKEY_MARKER_XTIME, gsync_status); - if (ret < 0) - goto out; - - ret = glusterd_marker_create_volfile (volinfo); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Setting dict failed"); - goto out; - } + key, knob_on); } - ret = 0; -out: + out: gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; } +static int +glusterd_set_gsync_confs (glusterd_volinfo_t *volinfo) +{ + int ret = -1; + int volfile_changed = 0; + + ret = glusterd_set_gsync_knob (volinfo, + VKEY_MARKER_XTIME, &volfile_changed); + if (ret) + goto out; + /** + * enable ignore-pid-check blindly as it could be needed for + * cascading setups. + */ + ret = glusterd_set_gsync_knob (volinfo, VKEY_MARKER_XTIME_FORCE, + &volfile_changed); + if (ret) + goto out; + + ret = glusterd_set_gsync_knob (volinfo, + VKEY_CHANGELOG, &volfile_changed); + if (ret) + goto out; + if (volfile_changed) + ret = glusterd_marker_changelog_create_volfile (volinfo); + + out: + return ret; +} static int glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo, - char *slave, dict_t *rsp_dict, - char *node) + char *slave, char *conf_path, + dict_t *rsp_dict, char *node) { + char *statefile = NULL; uuid_t uuid = {0, }; glusterd_conf_t *priv = NULL; int ret = 0; + struct stat stbuf = {0, }; GF_ASSERT (volinfo); GF_ASSERT (slave); @@ -1750,19 +2939,38 @@ glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo, priv = THIS->private; ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); - if ((ret == 0) && (uuid_compare (MY_UUID, uuid) != 0)) - goto out; - if (ret) { - ret = 0; gf_log ("", GF_LOG_INFO, "geo-replication status %s %s :" "session is not active", volinfo->volname, slave); - goto out; + + ret = glusterd_get_statefile_name (volinfo, slave, + conf_path, &statefile); + if (ret) { + if (!strstr(slave, "::")) + gf_log ("", GF_LOG_INFO, + "%s is not a valid slave url.", slave); + else + gf_log ("", GF_LOG_INFO, "Unable to get" + " statefile's name"); + ret = 0; + goto out; + } + + ret = lstat (statefile, &stbuf); + if (ret) { + gf_log ("", GF_LOG_INFO, "%s statefile not present.", + statefile); + ret = 0; + goto out; + } } - ret = glusterd_read_status_file (volinfo->volname, - slave, rsp_dict, node); - out: + ret = glusterd_read_status_file (volinfo, slave, conf_path, + rsp_dict, node); +out: + if (statefile) + GF_FREE (statefile); + gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); return ret; } @@ -1813,6 +3021,7 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict) { char *slave = NULL; char *volname = NULL; + char *conf_path = NULL; char errmsg[PATH_MAX] = {0, }; gf_boolean_t exists = _gf_false; glusterd_volinfo_t *volinfo = NULL; @@ -1850,7 +3059,14 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict) goto out; } - ret = glusterd_get_gsync_status_mst_slv (volinfo, slave, + ret = dict_get_str (dict, "conf_path", &conf_path); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch conf file path."); + goto out; + } + + ret = glusterd_get_gsync_status_mst_slv (volinfo, slave, conf_path, rsp_dict, my_hostname); out: @@ -1859,300 +3075,406 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict) } static int -glusterd_send_sigstop (pid_t pid) -{ - int ret = 0; - ret = kill (pid, SIGSTOP); - if (ret) - gf_log ("", GF_LOG_ERROR, GEOREP"failed to send SIGSTOP signal"); - return ret; -} - -static int -glusterd_send_sigcont (pid_t pid) -{ - int ret = 0; - ret = kill (pid, SIGCONT); - if (ret) - gf_log ("", GF_LOG_ERROR, GEOREP"failed to send SIGCONT signal"); - return ret; -} - -/* - * Log rotations flow is something like this: - * - Send SIGSTOP to process group (this will stop monitor/worker process - * and also the slave if it's local) - * - Rotate log file for monitor/worker - * - Rotate log file for slave if it's local - * - Send SIGCONT to the process group. Monitor wakes up, kills the worker - * (this is done in the SIGCONT handler), which results in the termination - * of the slave (local/remote). After returning from signal handler, - * monitor detects absence of worker and starts it again, which in-turn - * starts the slave. - */ -static int -glusterd_send_log_rotate_signal (pid_t pid, char *logfile1, char *logfile2) +glusterd_gsync_delete (glusterd_volinfo_t *volinfo, char *slave, char *slave_ip, + char *slave_vol, char *path_list, dict_t *dict, + dict_t *resp_dict, char **op_errstr) { - int ret = 0; - char rlogfile[PATH_MAX] = {0,}; - time_t rottime = 0; - - ret = glusterd_send_sigstop (-pid); - rottime = time (NULL); + int32_t ret = -1; + runner_t runner = {0,}; + glusterd_conf_t *priv = NULL; + char *master = NULL; + char *gl_workdir = NULL; + char geo_rep_dir[PATH_MAX] = ""; + char *conf_path = NULL; - snprintf (rlogfile, sizeof (rlogfile), "%s.%"PRIu64, logfile1, - (uint64_t) rottime); - ret = rename (logfile1, rlogfile); - if (ret) - gf_log ("", GF_LOG_ERROR, "rename failed for geo-rep log file"); + GF_ASSERT (slave); + GF_ASSERT (slave_ip); + GF_ASSERT (slave_vol); + GF_ASSERT (op_errstr); + GF_ASSERT (dict); + GF_ASSERT (resp_dict); - if (!*logfile2) { - gf_log ("", GF_LOG_DEBUG, "Slave is not local," - " skipping rotation"); - ret = 0; + if (THIS) + priv = THIS->private; + if (priv == NULL) { + gf_log ("", GF_LOG_ERROR, "priv of glusterd not present"); + *op_errstr = gf_strdup ("glusterd defunct"); goto out; } - (void) snprintf (rlogfile, sizeof (rlogfile), "%s.%"PRIu64, logfile2, - (uint64_t) rottime); - ret = rename (logfile2, rlogfile); - if (ret) - gf_log ("", GF_LOG_ERROR, "rename failed for geo-rep slave" - " log file"); + ret = dict_get_str (dict, "conf_path", &conf_path); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch conf file path."); + goto out; + } - out: - ret = glusterd_send_sigcont (-pid); + gl_workdir = priv->workdir; + master = ""; + runinit (&runner); + runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", + "--delete", "-c", NULL); + runner_argprintf (&runner, "%s", conf_path); - return ret; -} + if (volinfo) { + master = volinfo->volname; + runner_argprintf (&runner, ":%s", master); + } + runner_add_arg (&runner, slave); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + synclock_unlock (&priv->big_lock); + ret = runner_run (&runner); + synclock_lock (&priv->big_lock); + if (ret) { + gf_log ("", GF_LOG_ERROR, "gsyncd failed to " + "delete session info for %s and %s peers", + master, slave); -static int -glusterd_get_pid_from_file (char *master, char *slave, pid_t *pid) -{ - int ret = -1; - int pfd = 0; - char pidfile[PATH_MAX] = {0,}; - char buff[1024] = {0,}; + gf_asprintf (op_errstr, "gsyncd failed to " + "delete session info for %s and %s peers", + master, slave); - pfd = gsyncd_getpidfile (master, slave, pidfile); - if (pfd == -2) { - gf_log ("", GF_LOG_ERROR, GEOREP" log-rotate validation " - " failed for %s & %s", master, slave); - goto out; - } - if (gsync_status_byfd (pfd) == -1) { - gf_log ("", GF_LOG_ERROR, "gsyncd b/w %s & %s is not" - " running", master, slave); goto out; } - if (pfd < 0) - goto out; + ret = snprintf (geo_rep_dir, sizeof(geo_rep_dir) - 1, + "%s/"GEOREP"/%s_%s_%s", gl_workdir, + volinfo->volname, slave_ip, slave_vol); + geo_rep_dir[ret] = '\0'; - ret = read (pfd, buff, 1024); - if (ret < 0) { - gf_log ("", GF_LOG_ERROR, GEOREP" cannot read pid from pid-file"); - goto out; + ret = rmdir (geo_rep_dir); + if (ret) { + if (errno == ENOENT) + gf_log ("", GF_LOG_DEBUG, "Geo Rep Dir(%s) Not Present.", + geo_rep_dir); + else { + gf_log ("", GF_LOG_ERROR, "Unable to delete " + "Geo Rep Dir(%s). Error: %s", geo_rep_dir, + strerror (errno)); + goto out; + } } - - *pid = strtol (buff, NULL, 10); ret = 0; + gf_asprintf (op_errstr, "delete successful"); + out: - sys_close(pfd); + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; } -static int -glusterd_do_gsync_log_rotate (char *master, char *slave, uuid_t *uuid, char **op_errstr) +int +glusterd_op_sys_exec (dict_t *dict, char **op_errstr, dict_t *rsp_dict) { - int ret = 0; - glusterd_conf_t *priv = NULL; - pid_t pid = 0; - char log_file1[PATH_MAX] = {0,}; - char log_file2[PATH_MAX] = {0,}; + char buf[PATH_MAX] = ""; + char cmd_arg_name[PATH_MAX] = ""; + char output_name[PATH_MAX] = ""; + char errmsg[PATH_MAX] = ""; + char *ptr = NULL; + char *bufp = NULL; + char *command = NULL; + char **cmd_args = NULL; + int ret = -1; + int i = -1; + int cmd_args_count = 0; + int output_count = 0; + glusterd_conf_t *priv = NULL; + runner_t runner = {0,}; - GF_ASSERT (THIS); - GF_ASSERT (THIS->private); - - priv = THIS->private; + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); - ret = glusterd_get_pid_from_file (master, slave, &pid); - if (ret) + if (THIS) + priv = THIS->private; + if (priv == NULL) { + gf_log ("", GF_LOG_ERROR, "priv of glusterd not present"); + *op_errstr = gf_strdup ("glusterd defunct"); goto out; + } - /* log file */ - ret = glusterd_gsyncd_getlogfile (master, slave, log_file1); - if (ret) + ret = dict_get_str (dict, "command", &command); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get command from dict"); goto out; + } - /* check if slave is local or remote */ - ret = glusterd_gsync_slave_is_remote (slave); - if (ret) - goto do_rotate; - - /* slave log file - slave is local and it's log can be rotated */ - ret = glusterd_gsync_get_slave_log_file (master, slave, log_file2); + ret = dict_get_int32 (dict, "cmd_args_count", &cmd_args_count); if (ret) - goto out; - - do_rotate: - ret = glusterd_send_log_rotate_signal (pid, log_file1, log_file2); - - out: - if (ret && op_errstr) - *op_errstr = gf_strdup("Error rotating log file"); - return ret; -} - -static int -glusterd_do_gsync_log_rotation_mst_slv (glusterd_volinfo_t *volinfo, char *slave, - char **op_errstr) -{ - uuid_t uuid = {0, }; - glusterd_conf_t *priv = NULL; - int ret = 0; - char errmsg[1024] = {0,}; - xlator_t *this = NULL; + gf_log ("", GF_LOG_INFO, "No cmd_args_count"); + + if (cmd_args_count) { + cmd_args = GF_CALLOC (cmd_args_count, sizeof (char*), + gf_common_mt_char); + if (!cmd_args) { + gf_log ("", GF_LOG_ERROR, "Unable to calloc. " + "Errno = %s", strerror(errno)); + goto out; + } - GF_ASSERT (volinfo); - GF_ASSERT (slave); - GF_ASSERT (THIS); - this = THIS; - GF_ASSERT (this->private); - priv = this->private; + for (i=1; i <= cmd_args_count; i++) { + memset (cmd_arg_name, '\0', sizeof(cmd_arg_name)); + snprintf (cmd_arg_name, sizeof(cmd_arg_name), + "cmd_arg_%d", i); + ret = dict_get_str (dict, cmd_arg_name, &cmd_args[i-1]); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get %s in dict", + cmd_arg_name); + goto out; + } + } + } - ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); - if ((ret == 0) && (uuid_compare (MY_UUID, uuid) != 0)) + runinit (&runner); + runner_argprintf (&runner, GSYNCD_PREFIX"/peer_%s", command); + for (i=0; i < cmd_args_count; i++) + runner_add_arg (&runner, cmd_args[i]); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + synclock_unlock (&priv->big_lock); + ret = runner_start (&runner); + if (ret == -1) { + snprintf (errmsg, sizeof (errmsg), "Unable to " + "execute command. Error : %s", + strerror (errno)); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + synclock_lock (&priv->big_lock); goto out; + } + ptr = fgets(buf, sizeof(buf), runner_chio (&runner, STDOUT_FILENO)); + if (ptr) { + ret = dict_get_int32 (rsp_dict, "output_count", &output_count); + if (ret) + output_count = 1; + else + output_count++; + memset (output_name, '\0', sizeof (output_name)); + snprintf (output_name, sizeof (output_name), + "output_%d", output_count); + if (buf[strlen(buf) - 1] == '\n') + buf[strlen(buf) - 1] = '\0'; + bufp = gf_strdup (buf); + if (!bufp) + gf_log ("", GF_LOG_ERROR, "gf_strdup failed."); + ret = dict_set_dynstr (rsp_dict, output_name, bufp); + if (ret) { + GF_FREE (bufp); + gf_log ("", GF_LOG_ERROR, "output set failed."); + } + ret = dict_set_int32 (rsp_dict, "output_count", output_count); + if (ret) + gf_log ("", GF_LOG_ERROR, "output_count set failed."); + } + + ret = runner_end (&runner); if (ret) { - snprintf(errmsg, sizeof(errmsg), "geo-replication session b/w %s %s not active", - volinfo->volname, slave); - gf_log (this->name, GF_LOG_WARNING, "%s", errmsg); - if (op_errstr) - *op_errstr = gf_strdup(errmsg); + snprintf (errmsg, sizeof (errmsg), "Unable to " + "end. Error : %s", + strerror (errno)); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + synclock_lock (&priv->big_lock); goto out; } + synclock_lock (&priv->big_lock); - ret = glusterd_do_gsync_log_rotate (volinfo->volname, slave, &uuid, op_errstr); + ret = 0; +out: + if (cmd_args) { + GF_FREE (cmd_args); + cmd_args = NULL; + } - out: - gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret); + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; } -static int -_iterate_log_rotate_mst_slv (dict_t *this, char *key, data_t *value, void *data) +int +glusterd_op_copy_file (dict_t *dict, char **op_errstr) { - glusterd_gsync_status_temp_t *param = NULL; - char *slave = NULL; + char abs_filename[PATH_MAX] = ""; + char errmsg[PATH_MAX] = ""; + char *filename = NULL; + char *host_uuid = NULL; + char uuid_str [64] = {0}; + char *contents = NULL; + char buf[1024] = ""; + int ret = -1; + int fd = -1; + int bytes_writen = 0; + int bytes_read = 0; + int contents_size = -1; + int file_mode = -1; + glusterd_conf_t *priv = NULL; + struct stat stbuf = {0,}; - param = (glusterd_gsync_status_temp_t *) data; - GF_ASSERT (param); - GF_ASSERT (param->volinfo); - - slave = strchr (value->data, ':'); - if (slave) - slave++; - else { - gf_log ("", GF_LOG_ERROR, "geo-replication log-rotate: slave (%s) " - "not conforming to format", slave); - return -1; + if (THIS) + priv = THIS->private; + if (priv == NULL) { + gf_log ("", GF_LOG_ERROR, "priv of glusterd not present"); + *op_errstr = gf_strdup ("glusterd defunct"); + goto out; } - (void) glusterd_do_gsync_log_rotation_mst_slv (param->volinfo, slave, NULL); - return 0; -} + ret = dict_get_str (dict, "host-uuid", &host_uuid); + if (ret < 0) + goto out; -static int -glusterd_do_gsync_log_rotation_mst (glusterd_volinfo_t *volinfo) -{ - glusterd_gsync_status_temp_t param = {0, }; + ret = dict_get_str (dict, "source", &filename); + if (ret < 0) { + gf_log ("", GF_LOG_ERROR, "Unable to fetch" + " filename from dict."); + *op_errstr = gf_strdup ("command unsuccessful"); + goto out; + } + snprintf (abs_filename, sizeof(abs_filename), + "%s/%s", priv->workdir, filename); - GF_ASSERT (volinfo); + uuid_utoa_r (MY_UUID, uuid_str); + if (!strcmp (uuid_str, host_uuid)) { + ret = lstat (abs_filename, &stbuf); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Source file" + " does not exist in %s", priv->workdir); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } - param.volinfo = volinfo; - dict_foreach (volinfo->gsync_slaves, _iterate_log_rotate_mst_slv, ¶m); - return 0; -} + contents = GF_CALLOC(1, stbuf.st_size+1, gf_common_mt_char); + if (!contents) { + snprintf (errmsg, sizeof (errmsg), + "Unable to allocate memory"); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } -static int -glusterd_rotate_gsync_all () -{ - int32_t ret = 0; - glusterd_conf_t *priv = NULL; - glusterd_volinfo_t *volinfo = NULL; + fd = open (abs_filename, O_RDONLY); + if (fd < 0) { + snprintf (errmsg, sizeof (errmsg), "Unable to open %s", + abs_filename); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } - GF_ASSERT (THIS); - priv = THIS->private; + do { + ret = read (fd, buf, sizeof(buf)); + if (ret > 0) { + memcpy (contents+bytes_read, buf, ret); + bytes_read += ret; + memset (buf, '\0', sizeof(buf)); + } + } while (ret > 0); - GF_ASSERT (priv); + if (bytes_read != stbuf.st_size) { + snprintf (errmsg, sizeof (errmsg), "Unable to read all " + "the data from %s", abs_filename); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } - list_for_each_entry (volinfo, &priv->volumes, vol_list) { - ret = glusterd_do_gsync_log_rotation_mst (volinfo); - if (ret) + ret = dict_set_int32 (dict, "contents_size", stbuf.st_size); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to set" + " contents size in dict."); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); goto out; - } + } - out: - gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); - return ret; -} + ret = dict_set_int32 (dict, "file_mode", + (int32_t)stbuf.st_mode); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to set" + " file mode in dict."); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } -static int -glusterd_rotate_gsync_logs (dict_t *dict, char **op_errstr, dict_t *rsp_dict) -{ - char *slave = NULL; - char *volname = NULL; - char errmsg[1024] = {0,}; - gf_boolean_t exists = _gf_false; - glusterd_volinfo_t *volinfo = NULL; - char **linearr = NULL; - int ret = 0; + ret = dict_set_bin (dict, "common_pem_contents", + contents, stbuf.st_size); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to set" + " pem contents in dict."); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } + close (fd); + } else { + ret = dict_get_bin (dict, "common_pem_contents", + (void **) &contents); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to get" + " pem contents in dict."); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } - ret = dict_get_str (dict, "master", &volname); - if (ret < 0) { - ret = glusterd_rotate_gsync_all (); - goto out; - } + ret = dict_get_int32 (dict, "contents_size", &contents_size); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to set" + " contents size in dict."); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } - exists = glusterd_check_volume_exists (volname); - ret = glusterd_volinfo_find (volname, &volinfo); - if ((ret) || (!exists)) { - snprintf (errmsg, sizeof(errmsg), "Volume %s does not" - " exist", volname); - gf_log ("", GF_LOG_WARNING, "%s", errmsg); - *op_errstr = gf_strdup (errmsg); - ret = -1; - goto out; - } + ret = dict_get_int32 (dict, "file_mode", &file_mode); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to get" + " file mode in dict."); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } - ret = dict_get_str (dict, "slave", &slave); - if (ret < 0) { - ret = glusterd_do_gsync_log_rotation_mst (volinfo); - goto out; - } + fd = open (abs_filename, O_WRONLY | O_TRUNC | O_CREAT, 0600); + if (fd < 0) { + snprintf (errmsg, sizeof (errmsg), "Unable to open %s", + abs_filename); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } - /* for the given slave use the normalized url */ - ret = glusterd_urltransform_single (slave, "normalize", &linearr); - if (ret == -1) - goto out; + bytes_writen = write (fd, contents, contents_size); - ret = glusterd_do_gsync_log_rotation_mst_slv (volinfo, linearr[0], - op_errstr); - if (ret) - gf_log ("gsyncd", GF_LOG_ERROR, "gsyncd log-rotate failed for" - " %s & %s", volname, slave); + if (bytes_writen != contents_size) { + snprintf (errmsg, sizeof (errmsg), "Failed to write" + " to %s", abs_filename); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } - glusterd_urltransform_free (linearr, 1); - out: + fchmod (fd, file_mode); + close (fd); + } + + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; } - int glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) { @@ -2162,11 +3484,16 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) dict_t *resp_dict = NULL; char *host_uuid = NULL; char *slave = NULL; + char *slave_ip = NULL; + char *slave_vol = NULL; char *volname = NULL; + char *path_list = NULL; glusterd_volinfo_t *volinfo = NULL; glusterd_conf_t *priv = NULL; + gf_boolean_t is_force = _gf_false; char *status_msg = NULL; - uuid_t uuid = {0, }; + gf_boolean_t is_running = _gf_false; + char *conf_path = NULL; GF_ASSERT (THIS); GF_ASSERT (THIS->private); @@ -2192,15 +3519,28 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) goto out; } - if (type == GF_GSYNC_OPTION_TYPE_ROTATE) { - ret = glusterd_rotate_gsync_logs (dict, op_errstr, resp_dict); + ret = dict_get_str (dict, "slave", &slave); + if (ret < 0) goto out; + ret = dict_get_str (dict, "slave_ip", &slave_ip); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to fetch slave volume name."); + goto out; } - ret = dict_get_str (dict, "slave", &slave); - if (ret < 0) + ret = dict_get_str (dict, "slave_vol", &slave_vol); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to fetch slave volume name."); goto out; + } + + ret = dict_get_str (dict, "conf_path", &conf_path); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch conf file path."); + goto out; + } if (dict_get_str (dict, "master", &volname) == 0) { ret = glusterd_volinfo_find (volname, &volinfo); @@ -2209,11 +3549,31 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) volname); goto out; } + + ret = glusterd_get_local_brickpaths (volinfo, &path_list); } if (type == GF_GSYNC_OPTION_TYPE_CONFIG) { - ret = glusterd_gsync_configure (volinfo, slave, dict, resp_dict, - op_errstr); + ret = glusterd_gsync_configure (volinfo, slave, path_list, + dict, resp_dict, op_errstr); + + ret = dict_set_str (resp_dict, "conf_path", conf_path); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to store conf_file_path."); + goto out; + } + goto out; + } + + if (type == GF_GSYNC_OPTION_TYPE_DELETE) { + ret = glusterd_remove_slave_in_info(volinfo, slave, op_errstr); + if (ret && !is_force && path_list) + goto out; + + ret = glusterd_gsync_delete (volinfo, slave, slave_ip, + slave_vol, path_list, dict, + resp_dict, op_errstr); goto out; } @@ -2222,48 +3582,652 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) goto out; } + is_force = dict_get_str_boolean (dict, "force", _gf_false); + if (type == GF_GSYNC_OPTION_TYPE_START) { - ret = glusterd_set_marker_gsync (volinfo); + ret = glusterd_set_gsync_confs (volinfo); if (ret != 0) { - gf_log ("", GF_LOG_WARNING, "marker start failed"); + gf_log ("", GF_LOG_WARNING, "marker/changelog start failed"); *op_errstr = gf_strdup ("failed to initialize indexing"); ret = -1; goto out; } - ret = glusterd_store_slave_in_info(volinfo, slave, - host_uuid, op_errstr); - if (ret) - goto out; - ret = glusterd_start_gsync (volinfo, slave, host_uuid, - op_errstr); + ret = glusterd_start_gsync (volinfo, slave, path_list, + conf_path, host_uuid, op_errstr); } if (type == GF_GSYNC_OPTION_TYPE_STOP) { - - ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); - if (ret) { + ret = glusterd_check_gsync_running_local (volinfo->volname, + slave, conf_path, + &is_running); + if (!ret && !is_force && path_list && + (_gf_true != is_running)) { gf_log ("", GF_LOG_WARNING, GEOREP" is not set up for" "%s(master) and %s(slave)", volname, slave); *op_errstr = strdup (GEOREP" is not set up"); goto out; } - ret = glusterd_remove_slave_in_info(volinfo, slave, op_errstr); - if (ret) + ret = stop_gsync (volname, slave, &status_msg, conf_path, is_force); + if (ret == 0 && status_msg) + ret = dict_set_str (resp_dict, "gsync-status", + status_msg); + if (ret != 0 && !is_force && path_list) + *op_errstr = gf_strdup ("internal error"); + + if (!ret) { + ret = glusterd_create_status_file (volinfo->volname, + slave, slave_ip, + slave_vol, "Stopped"); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to update" + "state_file. Error : %s", + strerror (errno)); + } + } + } + +out: + if (path_list) { + GF_FREE (path_list); + path_list = NULL; + } + + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int +glusterd_get_slave_details_confpath (glusterd_volinfo_t *volinfo, dict_t *dict, + char **slave_ip, char **slave_vol, + char **conf_path, char **op_errstr) +{ + int ret = -1; + char confpath[PATH_MAX] = ""; + glusterd_conf_t *priv = NULL; + char *slave = NULL; + + GF_ASSERT (THIS); + priv = THIS->private; + GF_ASSERT (priv); + + ret = dict_get_str (dict, "slave", &slave); + if (ret || !slave) { + gf_log ("", GF_LOG_ERROR, "Unable to fetch slave from dict"); + ret = -1; + goto out; + } + + ret = glusterd_get_slave_info (slave, slave_ip, slave_vol, op_errstr); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch slave details."); + ret = -1; + goto out; + } + + ret = dict_set_str (dict, "slave_ip", *slave_ip); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to store slave IP."); + goto out; + } + + ret = dict_set_str (dict, "slave_vol", *slave_vol); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to store slave volume name."); + goto out; + } + + ret = snprintf (confpath, sizeof(confpath) - 1, + "%s/"GEOREP"/%s_%s_%s/gsyncd.conf", + priv->workdir, volinfo->volname, + *slave_ip, *slave_vol); + confpath[ret] = '\0'; + *conf_path = gf_strdup (confpath); + if (!(*conf_path)) { + gf_log ("", GF_LOG_ERROR, + "Unable to gf_strdup. Error: %s", strerror (errno)); + ret = -1; + goto out; + } + + ret = dict_set_str (dict, "conf_path", *conf_path); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to store conf_path"); + goto out; + } + +out: + gf_log ("", GF_LOG_DEBUG,"Returning %d", ret); + return ret; + +} + +static int +glusterd_get_slave_info (char *slave, char **slave_ip, + char **slave_vol, char **op_errstr) +{ + char *tmp = NULL; + char *save_ptr = NULL; + char **linearr = NULL; + int32_t ret = -1; + char errmsg[PATH_MAX] = ""; + + ret = glusterd_urltransform_single (slave, "normalize", + &linearr); + if (ret == -1) { + ret = snprintf (errmsg, sizeof(errmsg) - 1, + "Invalid Url: %s", slave); + errmsg[ret] = '\0'; + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "Failed to normalize url"); + goto out; + } + + tmp = strtok_r (linearr[0], "/", &save_ptr); + tmp = strtok_r (NULL, "/", &save_ptr); + slave = strtok_r (tmp, ":", &save_ptr); + if (slave) { + ret = glusterd_mountbroker_check (&slave, op_errstr); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Invalid slave url: %s", *op_errstr); goto out; + } - if (uuid_compare (MY_UUID, uuid) != 0) { + *slave_ip = gf_strdup (slave); + if (!*slave_ip) { + gf_log ("", GF_LOG_ERROR, + "Failed to gf_strdup"); + ret = -1; goto out; } + gf_log ("", GF_LOG_DEBUG, "Slave IP : %s", *slave_ip); + ret = 0; + } else { + gf_log ("", GF_LOG_ERROR, "Invalid slave name"); + goto out; + } - ret = stop_gsync (volname, slave, &status_msg); - if (ret == 0 && status_msg) - ret = dict_set_str (resp_dict, "gsync-status", - status_msg); - if (ret != 0) - *op_errstr = gf_strdup ("internal error"); + slave = strtok_r (NULL, ":", &save_ptr); + if (slave) { + *slave_vol = gf_strdup (slave); + if (!*slave_vol) { + gf_log ("", GF_LOG_ERROR, + "Failed to gf_strdup"); + ret = -1; + goto out; + } + gf_log ("", GF_LOG_DEBUG, "Slave Vol : %s", *slave_vol); + ret = 0; + } else { + gf_log ("", GF_LOG_ERROR, "Invalid slave name"); + goto out; + } + +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +static void +runinit_gsyncd_setrx (runner_t *runner, char *conf_path) +{ + runinit (runner); + runner_add_args (runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL); + runner_argprintf (runner, "%s", conf_path); + runner_add_arg (runner, "--config-set-rx"); +} + +static int +glusterd_check_gsync_present (int *valid_state) +{ + char buff[PATH_MAX] = {0, }; + runner_t runner = {0,}; + char *ptr = NULL; + int ret = 0; + + runinit (&runner); + runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--version", NULL); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + ret = runner_start (&runner); + if (ret == -1) { + if (errno == ENOENT) { + gf_log ("glusterd", GF_LOG_INFO, GEOREP + " module not installed in the system"); + *valid_state = 0; + } + else { + gf_log ("glusterd", GF_LOG_ERROR, GEOREP + " module not working as desired"); + *valid_state = -1; + } + goto out; + } + + ptr = fgets(buff, sizeof(buff), runner_chio (&runner, STDOUT_FILENO)); + if (ptr) { + if (!strstr (buff, "gsyncd")) { + ret = -1; + gf_log ("glusterd", GF_LOG_ERROR, GEOREP" module not " + "working as desired"); + *valid_state = -1; + goto out; + } + } else { + ret = -1; + gf_log ("glusterd", GF_LOG_ERROR, GEOREP" module not " + "working as desired"); + *valid_state = -1; + goto out; + } + + ret = 0; + out: + + runner_end (&runner); + + gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret); + return ret; + +} + +static int +create_conf_file (glusterd_conf_t *conf, char *conf_path) +#define RUN_GSYNCD_CMD do { \ + ret = runner_run_reuse (&runner); \ + if (ret == -1) { \ + runner_log (&runner, "glusterd", GF_LOG_ERROR, "command failed"); \ + runner_end (&runner); \ + goto out; \ + } \ + runner_end (&runner); \ +} while (0) +{ + int ret = 0; + runner_t runner = {0,}; + char georepdir[PATH_MAX] = {0,}; + int valid_state = 0; + + valid_state = -1; + ret = glusterd_check_gsync_present (&valid_state); + if (-1 == ret) { + ret = valid_state; + goto out; + } + + ret = snprintf (georepdir, sizeof(georepdir) - 1, "%s/"GEOREP, + conf->workdir); + georepdir[ret] = '\0'; + + /************ + * master pre-configuration + ************/ + + /* remote-gsyncd */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "remote-gsyncd", GSYNCD_PREFIX"/gsyncd", ".", ".", NULL); + RUN_GSYNCD_CMD; + + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "remote-gsyncd", "/nonexistent/gsyncd", + ".", "^ssh:", NULL); + RUN_GSYNCD_CMD; + + /* gluster-command-dir */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "gluster-command-dir", SBIN_DIR"/", + ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* gluster-params */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "gluster-params", + "aux-gfid-mount xlator-option=*-dht.assert-no-child-down=true", + ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* ssh-command */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_arg (&runner, "ssh-command"); + runner_argprintf (&runner, + "ssh -oPasswordAuthentication=no " + "-oStrictHostKeyChecking=no " + "-i %s/secret.pem", georepdir); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* pid-file */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_arg (&runner, "pid-file"); + runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.pid", georepdir); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* state-file */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_arg (&runner, "state-file"); + runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.status", georepdir); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* state-detail-file */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_arg (&runner, "state-detail-file"); + runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status", georepdir); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* state-socket */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_arg (&runner, "state-socket-unencoded"); + runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.socket", georepdir); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* socketdir */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "socketdir", GLUSTERD_SOCK_DIR, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* log-file */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, + "log-file", + DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}.log", + ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* gluster-log-file */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, + "gluster-log-file", + DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}${local_id}.gluster.log", + ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* ignore-deletes */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "ignore-deletes", "true", ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* special-sync-mode */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "special-sync-mode", "partial", ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* change-detector == changelog */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args(&runner, "change-detector", "changelog", ".", ".", NULL); + RUN_GSYNCD_CMD; + + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_arg(&runner, "working-dir"); + runner_argprintf(&runner, "%s/${mastervol}/${eSlave}", + DEFAULT_VAR_RUN_DIRECTORY); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /************ + * slave pre-configuration + ************/ + + /* gluster-command-dir */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "gluster-command-dir", SBIN_DIR"/", + ".", NULL); + RUN_GSYNCD_CMD; + + /* gluster-params */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, "gluster-params", + "aux-gfid-mount xlator-option=*-dht.assert-no-child-down=true", + ".", NULL); + RUN_GSYNCD_CMD; + + /* log-file */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, + "log-file", + DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/${session_owner}:${eSlave}.log", + ".", NULL); + RUN_GSYNCD_CMD; + + /* MountBroker log-file */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, + "log-file-mbr", + DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/mbr/${session_owner}:${eSlave}.log", + ".", NULL); + RUN_GSYNCD_CMD; + + /* gluster-log-file */ + runinit_gsyncd_setrx (&runner, conf_path); + runner_add_args (&runner, + "gluster-log-file", + DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/${session_owner}:${eSlave}.gluster.log", + ".", NULL); + RUN_GSYNCD_CMD; + + out: + return ret ? -1 : 0; +} + +static int +glusterd_create_essential_dir_files (glusterd_volinfo_t *volinfo, dict_t *dict, + char *slave, char *slave_ip, + char *slave_vol, char **op_errstr) +{ + int ret = -1; + char *conf_path = NULL; + char *statefile = NULL; + char buf[PATH_MAX] = ""; + char errmsg[PATH_MAX] = ""; + glusterd_conf_t *conf = NULL; + struct stat stbuf = {0,}; + + GF_ASSERT (THIS); + conf = THIS->private; + + ret = dict_get_str (dict, "conf_path", &conf_path); + if (ret) { + snprintf (errmsg, sizeof (errmsg), + "Unable to fetch conf file path."); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } + + ret = dict_get_str (dict, "statefile", &statefile); + if (ret) { + snprintf (errmsg, sizeof (errmsg), + "Unable to fetch statefile path."); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } + + ret = snprintf (buf, sizeof(buf) - 1, "%s/"GEOREP"/%s_%s_%s", + conf->workdir, volinfo->volname, slave_ip, slave_vol); + buf[ret] = '\0'; + ret = mkdir_p (buf, 0777, _gf_true); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to create %s" + ". Error : %s", buf, strerror (errno)); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } + + ret = snprintf (buf, PATH_MAX, DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/%s", + volinfo->volname); + buf[ret] = '\0'; + ret = mkdir_p (buf, 0777, _gf_true); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to create %s" + ". Error : %s", buf, strerror (errno)); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } + + ret = lstat (conf_path, &stbuf); + if (!ret) { + gf_log ("", GF_LOG_DEBUG, "Session already running." + " Not creating config file again."); + } else { + ret = create_conf_file (conf, conf_path); + if (ret || lstat (conf_path, &stbuf)) { + snprintf (errmsg, sizeof (errmsg), "Failed to create" + " config file(%s).", conf_path); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } + } + + ret = lstat (statefile, &stbuf); + if (!ret) { + gf_log ("", GF_LOG_DEBUG, "Session already running." + " Not creating status file again."); + goto out; + } else { + ret = glusterd_create_status_file (volinfo->volname, slave, + slave_ip, slave_vol, + "Not Started"); + if (ret || lstat (statefile, &stbuf)) { + snprintf (errmsg, sizeof (errmsg), "Unable to create %s" + ". Error : %s", statefile, strerror (errno)); + *op_errstr = gf_strdup (errmsg); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } + } + +out: + gf_log ("", GF_LOG_DEBUG,"Returning %d", ret); + return ret; +} + +int +glusterd_op_gsync_create (dict_t *dict, char **op_errstr, dict_t *rsp_dict) +{ + char common_pem_file[PATH_MAX] = ""; + char errmsg[PATH_MAX] = ""; + char hooks_args[PATH_MAX] = ""; + char uuid_str [64] = ""; + char *host_uuid = NULL; + char *slave_ip = NULL; + char *slave_vol = NULL; + char *arg_buf = NULL; + char *volname = NULL; + char *slave = NULL; + int32_t ret = -1; + int32_t is_pem_push = -1; + gf_boolean_t is_force = -1; + glusterd_conf_t *conf = NULL; + glusterd_volinfo_t *volinfo = NULL; + + GF_ASSERT (THIS); + conf = THIS->private; + GF_ASSERT (conf); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + + ret = glusterd_op_gsync_args_get (dict, op_errstr, + &volname, &slave, &host_uuid); + if (ret) + goto out; + + snprintf (common_pem_file, sizeof(common_pem_file), + "%s"GLUSTERD_COMMON_PEM_PUB_FILE, conf->workdir); + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Volinfo for %s" + " (master) not found", volname); + goto out; + } + + ret = dict_get_str (dict, "slave_vol", &slave_vol); + if (ret) { + snprintf (errmsg, sizeof (errmsg), + "Unable to fetch slave volume name."); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; + } + + ret = dict_get_str (dict, "slave_ip", &slave_ip); + if (ret) { + snprintf (errmsg, sizeof (errmsg), + "Unable to fetch slave IP."); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + ret = -1; + goto out; + } + + is_force = dict_get_str_boolean (dict, "force", _gf_false); + + uuid_utoa_r (MY_UUID, uuid_str); + if (!strcmp (uuid_str, host_uuid)) { + ret = dict_get_int32 (dict, "push_pem", &is_pem_push); + if (!ret && is_pem_push) { + gf_log ("", GF_LOG_DEBUG, "Trying to setup" + " pem files in slave"); + is_pem_push = 1; + } else + is_pem_push = 0; + + snprintf(hooks_args, sizeof(hooks_args), + "is_push_pem=%d pub_file=%s slave_ip=%s", + is_pem_push, common_pem_file, slave_ip); + + } else + snprintf(hooks_args, sizeof(hooks_args), + "This argument will stop the hooks script"); + + arg_buf = gf_strdup (hooks_args); + if (!arg_buf) { + gf_log ("", GF_LOG_ERROR, "Failed to" + " gf_strdup"); + if (is_force) { + ret = 0; + goto create_essentials; + } + ret = -1; + goto out; + } + + ret = dict_set_str (dict, "hooks_args", arg_buf); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Failed to set" + " hooks_args in dict."); + if (is_force) { + ret = 0; + goto create_essentials; + } + goto out; + } + +create_essentials: + + ret = glusterd_create_essential_dir_files (volinfo, dict, slave, + slave_ip, slave_vol, + op_errstr); + if (ret) + goto out; + + ret = glusterd_store_slave_in_info (volinfo, slave, + host_uuid, op_errstr, + is_force); + if (ret) { + snprintf (errmsg, sizeof (errmsg), "Unable to store" + " slave info."); + gf_log ("", GF_LOG_ERROR, "%s", errmsg); + goto out; } out: diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index df0e1c525..71d076624 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -33,6 +33,7 @@ #include "glusterd-op-sm.h" #include "glusterd-utils.h" #include "glusterd-store.h" +#include "glusterd-locks.h" #include "glusterd1-xdr.h" #include "cli1-xdr.h" @@ -54,6 +55,8 @@ #include <lvm2app.h> #endif +extern uuid_t global_txn_id; + int glusterd_big_locked_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, void *data, rpc_clnt_notify_t notify_fn) @@ -335,7 +338,9 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, char *volume_id_str = NULL; struct args_pack pack = {0,}; xlator_t *this = NULL; - +#ifdef HAVE_BD_XLATOR + int caps = 0; +#endif GF_ASSERT (volinfo); GF_ASSERT (volumes); @@ -360,6 +365,21 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + /* As of now, the snap volumes are also displayed as part of + volume info command. So this change is to display whether + the volume is original volume or the snap_volume. If + displaying of snap volumes in volume info o/p is not needed + this should be removed. + */ + snprintf (key, 256, "volume%d.snap_volume", count); + ret = dict_set_int32 (volumes, key, volinfo->is_snap_volume); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "failed to set whether " + "the volume is a snap volume or actual volume (%s)", + volinfo->volname); + goto out; + } + snprintf (key, 256, "volume%d.brick_count", count); ret = dict_set_int32 (volumes, key, volinfo->brick_count); if (ret) @@ -400,14 +420,76 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, goto out; #ifdef HAVE_BD_XLATOR - snprintf (key, 256, "volume%d.backend", count); - ret = dict_set_int32 (volumes, key, volinfo->backend); - if (ret) - goto out; + if (volinfo->caps) { + caps = 0; + snprintf (key, 256, "volume%d.xlator0", count); + buf = GF_MALLOC (256, gf_common_mt_char); + if (!buf) { + ret = ENOMEM; + goto out; + } + if (volinfo->caps & CAPS_BD) + snprintf (buf, 256, "BD"); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) { + GF_FREE (buf); + goto out; + } + + if (volinfo->caps & CAPS_THIN) { + snprintf (key, 256, "volume%d.xlator0.caps%d", count, + caps++); + buf = GF_MALLOC (256, gf_common_mt_char); + if (!buf) { + ret = ENOMEM; + goto out; + } + snprintf (buf, 256, "thin"); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) { + GF_FREE (buf); + goto out; + } + } + + if (volinfo->caps & CAPS_OFFLOAD_COPY) { + snprintf (key, 256, "volume%d.xlator0.caps%d", count, + caps++); + buf = GF_MALLOC (256, gf_common_mt_char); + if (!buf) { + ret = ENOMEM; + goto out; + } + snprintf (buf, 256, "offload_copy"); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) { + GF_FREE (buf); + goto out; + } + } + + if (volinfo->caps & CAPS_OFFLOAD_SNAPSHOT) { + snprintf (key, 256, "volume%d.xlator0.caps%d", count, + caps++); + buf = GF_MALLOC (256, gf_common_mt_char); + if (!buf) { + ret = ENOMEM; + goto out; + } + snprintf (buf, 256, "offload_snapshot"); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) { + GF_FREE (buf); + goto out; + } + } + + } #endif list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { char brick[1024] = {0,}; + char brick_uuid[64] = {0,}; snprintf (key, 256, "volume%d.brick%d", count, i); snprintf (brick, 1024, "%s:%s", brickinfo->hostname, brickinfo->path); @@ -415,6 +497,25 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, ret = dict_set_dynstr (volumes, key, buf); if (ret) goto out; + snprintf (key, 256, "volume%d.brick%d.uuid", count, i); + snprintf (brick_uuid, 64, "%s", uuid_utoa (brickinfo->uuid)); + buf = gf_strdup (brick_uuid); + if (!buf) + goto out; + ret = dict_set_dynstr (volumes, key, buf); + if (ret) + goto out; + +#ifdef HAVE_BD_XLATOR + if (volinfo->caps & CAPS_BD) { + snprintf (key, 256, "volume%d.vg%d", count, i); + snprintf (brick, 1024, "%s", brickinfo->vg); + buf = gf_strdup (brick); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) + goto out; + } +#endif i++; } @@ -478,10 +579,17 @@ int32_t glusterd_op_txn_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx, char *err_str, size_t err_len) { - int32_t ret = -1; - xlator_t *this = NULL; - glusterd_conf_t *priv = NULL; - int32_t locked = 0; + int32_t ret = -1; + dict_t *dict = NULL; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + int32_t locked = 0; + char *tmp = NULL; + char *volname = NULL; + uuid_t *txn_id = NULL; + uuid_t *originator_uuid = NULL; + glusterd_op_info_t txn_op_info = {{0},}; + glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE; GF_ASSERT (req); GF_ASSERT ((op > GD_OP_NONE) && (op < GD_OP_MAX)); @@ -492,33 +600,140 @@ glusterd_op_txn_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx, priv = this->private; GF_ASSERT (priv); - ret = glusterd_lock (MY_UUID); + dict = ctx; + + /* Generate a transaction-id for this operation and + * save it in the dict. This transaction id distinguishes + * each transaction, and helps separate opinfos in the + * op state machine. */ + txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); + if (!txn_id) + goto out; + + uuid_generate (*txn_id); + + ret = dict_set_bin (dict, "transaction_id", + txn_id, sizeof(*txn_id)); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "Unable to acquire lock on localhost, ret: %d", ret); - snprintf (err_str, err_len, "Another transaction is in progress. " - "Please try again after sometime."); + "Failed to set transaction id."); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Transaction_id = %s", uuid_utoa (*txn_id)); + + /* Save the MY_UUID as the originator_uuid. This originator_uuid + * will be used by is_origin_glusterd() to determine if a node + * is the originator node for a command. */ + originator_uuid = GF_CALLOC (1, sizeof(uuid_t), + gf_common_mt_uuid_t); + if (!originator_uuid) { + ret = -1; goto out; } + uuid_copy (*originator_uuid, MY_UUID); + ret = dict_set_bin (dict, "originator_uuid", + originator_uuid, sizeof (uuid_t)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set originator uuid."); + goto out; + } + + /* Based on the op_version, acquire a cluster or mgmt_v3 lock */ + if (priv->op_version < 3) { + ret = glusterd_lock (MY_UUID); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to acquire lock on localhost, ret: %d", + ret); + snprintf (err_str, err_len, + "Another transaction is in progress. " + "Please try again after sometime."); + goto out; + } + } else { + /* If no volname is given as a part of the command, locks will + * not be held */ + ret = dict_get_str (dict, "volname", &tmp); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Failed to get volume " + "name"); + goto local_locking_done; + } else { + /* Use a copy of volname, as cli response will be + * sent before the unlock, and the volname in the + * dict, might be removed */ + volname = gf_strdup (tmp); + if (!volname) + goto out; + } + + ret = glusterd_mgmt_v3_lock (volname, MY_UUID, "vol"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to acquire lock for %s", volname); + snprintf (err_str, err_len, + "Another transaction is in progress for %s. " + "Please try again after sometime.", volname); + goto out; + } + } + locked = 1; gf_log (this->name, GF_LOG_DEBUG, "Acquired lock on localhost"); - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_START_LOCK, NULL); +local_locking_done: + + /* If no volname is given as a part of the command, locks will + * not be held, hence sending stage event. */ + if (volname) + event_type = GD_OP_EVENT_START_LOCK; + else { + txn_op_info.state.state = GD_OP_STATE_LOCK_SENT; + event_type = GD_OP_EVENT_ALL_ACC; + } + + /* Save opinfo for this transaction with the transaction id */ + glusterd_txn_opinfo_init (&txn_op_info, NULL, &op, ctx, req); + + ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set transaction's opinfo"); + if (ctx) + dict_unref (ctx); + goto out; + } + + ret = glusterd_op_sm_inject_event (event_type, txn_id, ctx); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Failed to acquire cluster" " lock."); goto out; } - glusterd_op_set_op (op); - glusterd_op_set_ctx (ctx); - glusterd_op_set_req (req); - - out: - if (locked && ret) - glusterd_unlock (MY_UUID); + if (locked && ret) { + /* Based on the op-version, we release the + * cluster or mgmt_v3 lock */ + if (priv->op_version < 3) + glusterd_unlock (MY_UUID); + else { + ret = glusterd_mgmt_v3_unlock (volname, MY_UUID, + "vol"); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to release lock for %s", + volname); + ret = -1; + } + } + + if (volname) + GF_FREE (volname); gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); return ret; @@ -527,11 +742,15 @@ out: int __glusterd_handle_cluster_lock (rpcsvc_request_t *req) { - gd1_mgmt_cluster_lock_req lock_req = {{0},}; - int32_t ret = -1; - glusterd_op_lock_ctx_t *ctx = NULL; - glusterd_peerinfo_t *peerinfo = NULL; - xlator_t *this = NULL; + dict_t *op_ctx = NULL; + int32_t ret = -1; + gd1_mgmt_cluster_lock_req lock_req = {{0},}; + glusterd_op_lock_ctx_t *ctx = NULL; + glusterd_op_t op = GD_OP_EVENT_LOCK; + glusterd_peerinfo_t *peerinfo = NULL; + glusterd_op_info_t txn_op_info = {{0},}; + uuid_t *txn_id = &global_txn_id; + xlator_t *this = NULL; this = THIS; GF_ASSERT (this); @@ -566,8 +785,29 @@ __glusterd_handle_cluster_lock (rpcsvc_request_t *req) uuid_copy (ctx->uuid, lock_req.uuid); ctx->req = req; + ctx->dict = NULL; + + op_ctx = dict_new (); + if (!op_ctx) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set new dict"); + goto out; + } - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK, ctx); + glusterd_txn_opinfo_init (&txn_op_info, NULL, &op, op_ctx, req); + + ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set transaction's opinfo"); + dict_unref (txn_op_info.op_ctx); + goto out; + } + + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK, txn_id, ctx); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to inject event GD_OP_EVENT_LOCK"); out: gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -643,6 +883,9 @@ __glusterd_handle_stage_op (rpcsvc_request_t *req) gd1_mgmt_stage_op_req op_req = {{0},}; glusterd_peerinfo_t *peerinfo = NULL; xlator_t *this = NULL; + uuid_t *txn_id = &global_txn_id; + glusterd_op_info_t txn_op_info = {{0},}; + glusterd_op_sm_state_info_t state; this = THIS; GF_ASSERT (this); @@ -671,7 +914,36 @@ __glusterd_handle_stage_op (rpcsvc_request_t *req) if (ret) goto out; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_OP, req_ctx); + ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id); + + gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id)); + + /* In cases where there is no volname, the receivers won't have a + * transaction opinfo created, as for those operations, the locking + * phase where the transaction opinfos are created, won't be called. */ + ret = glusterd_get_txn_opinfo (txn_id, &txn_op_info); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "No transaction's opinfo set"); + + state.state = GD_OP_STATE_LOCKED; + glusterd_txn_opinfo_init (&txn_op_info, &state, + &op_req.op, req_ctx->dict, req); + + ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set transaction's opinfo"); + dict_unref (req_ctx->dict); + goto out; + } + } + + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_OP, + txn_id, req_ctx); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to inject event GD_OP_EVENT_STAGE_OP"); out: free (op_req.buf.buf_val);//malloced by xdr @@ -695,6 +967,7 @@ __glusterd_handle_commit_op (rpcsvc_request_t *req) gd1_mgmt_commit_op_req op_req = {{0},}; glusterd_peerinfo_t *peerinfo = NULL; xlator_t *this = NULL; + uuid_t *txn_id = &global_txn_id; this = THIS; GF_ASSERT (this); @@ -725,11 +998,12 @@ __glusterd_handle_commit_op (rpcsvc_request_t *req) if (ret) goto out; - ret = glusterd_op_init_ctx (op_req.op); - if (ret) - goto out; + ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id); - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_OP, req_ctx); + gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id)); + + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_OP, + txn_id, req_ctx); out: free (op_req.buf.buf_val);//malloced by xdr @@ -748,17 +1022,19 @@ int __glusterd_handle_cli_probe (rpcsvc_request_t *req) { int32_t ret = -1; - gf1_cli_probe_req cli_req = {0,}; - glusterd_peerinfo_t *peerinfo = NULL; - gf_boolean_t run_fsm = _gf_true; - xlator_t *this = NULL; - char *bind_name = NULL; + gf_cli_req cli_req = {{0,},}; + glusterd_peerinfo_t *peerinfo = NULL; + gf_boolean_t run_fsm = _gf_true; + xlator_t *this = NULL; + char *bind_name = NULL; + dict_t *dict = NULL; + char *hostname = NULL; + int port = 0; GF_ASSERT (req); this = THIS; - ret = xdr_to_generic (req->msg[0], &cli_req, - (xdrproc_t)xdr_gf1_cli_probe_req); + ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req); if (ret < 0) { //failed to decode msg; gf_log ("", GF_LOG_ERROR, "xdr decoding error"); @@ -766,63 +1042,80 @@ __glusterd_handle_cli_probe (rpcsvc_request_t *req) goto out; } + if (cli_req.dict.dict_len) { + dict = dict_new (); + + ret = dict_unserialize (cli_req.dict.dict_val, + cli_req.dict.dict_len, &dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "unserialize req-buffer to dictionary"); + goto out; + } + } + + ret = dict_get_str (dict, "hostname", &hostname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get hostname"); + goto out; + } + + ret = dict_get_int32 (dict, "port", &port); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get port"); + goto out; + } + if (glusterd_is_any_volume_in_server_quorum (this) && !does_gd_meet_server_quorum (this)) { glusterd_xfer_cli_probe_resp (req, -1, GF_PROBE_QUORUM_NOT_MET, - NULL, - cli_req.hostname, cli_req.port); + NULL, hostname, port, dict); gf_log (this->name, GF_LOG_ERROR, "Quorum does not meet, " "rejecting operation"); ret = 0; goto out; } - gf_cmd_log ("peer probe", " on host %s:%d", cli_req.hostname, - cli_req.port); gf_log ("glusterd", GF_LOG_INFO, "Received CLI probe req %s %d", - cli_req.hostname, cli_req.port); + hostname, port); if (dict_get_str(this->options,"transport.socket.bind-address", &bind_name) == 0) { gf_log ("glusterd", GF_LOG_DEBUG, "only checking probe address vs. bind address"); - ret = glusterd_is_same_address(bind_name,cli_req.hostname); + ret = gf_is_same_address (bind_name, hostname); } else { - ret = glusterd_is_local_addr(cli_req.hostname); + ret = gf_is_local_addr (hostname); } if (ret) { - glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_LOCALHOST, NULL, - cli_req.hostname, cli_req.port); + glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_LOCALHOST, + NULL, hostname, port, dict); ret = 0; goto out; } - if (!(ret = glusterd_friend_find_by_hostname(cli_req.hostname, - &peerinfo))) { - if (strcmp (peerinfo->hostname, cli_req.hostname) == 0) { + if (!(ret = glusterd_friend_find_by_hostname (hostname, &peerinfo))) { + if (strcmp (peerinfo->hostname, hostname) == 0) { gf_log ("glusterd", GF_LOG_DEBUG, "Probe host %s port " - "%d already a peer", cli_req.hostname, - cli_req.port); + "%d already a peer", hostname, port); glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_FRIEND, - NULL, cli_req.hostname, - cli_req.port); + NULL, hostname, port, + dict); goto out; } } - ret = glusterd_probe_begin (req, cli_req.hostname, cli_req.port); - - gf_cmd_log ("peer probe","on host %s:%d %s",cli_req.hostname, - cli_req.port, (ret) ? "FAILED" : "SUCCESS"); + ret = glusterd_probe_begin (req, hostname, port, dict); if (ret == GLUSTERD_CONNECTION_AWAITED) { //fsm should be run after connection establishes run_fsm = _gf_false; ret = 0; } + out: - free (cli_req.hostname);//its malloced by xdr + free (cli_req.dict.dict_val); if (run_fsm) { glusterd_friend_sm (); @@ -842,11 +1135,15 @@ int __glusterd_handle_cli_deprobe (rpcsvc_request_t *req) { int32_t ret = -1; - gf1_cli_deprobe_req cli_req = {0,}; - uuid_t uuid = {0}; - int op_errno = 0; - xlator_t *this = NULL; - glusterd_conf_t *priv = NULL; + gf_cli_req cli_req = {{0,},}; + uuid_t uuid = {0}; + int op_errno = 0; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + dict_t *dict = NULL; + char *hostname = NULL; + int port = 0; + int flags = 0; this = THIS; GF_ASSERT (this); @@ -855,16 +1152,46 @@ __glusterd_handle_cli_deprobe (rpcsvc_request_t *req) GF_ASSERT (req); ret = xdr_to_generic (req->msg[0], &cli_req, - (xdrproc_t)xdr_gf1_cli_deprobe_req); + (xdrproc_t)xdr_gf_cli_req); if (ret < 0) { //failed to decode msg; req->rpc_err = GARBAGE_ARGS; goto out; } + if (cli_req.dict.dict_len) { + dict = dict_new (); + + ret = dict_unserialize (cli_req.dict.dict_val, + cli_req.dict.dict_len, &dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "unserialize req-buffer to dictionary"); + goto out; + } + } + gf_log ("glusterd", GF_LOG_INFO, "Received CLI deprobe req"); - ret = glusterd_hostname_to_uuid (cli_req.hostname, uuid); + ret = dict_get_str (dict, "hostname", &hostname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get hostname"); + goto out; + } + + ret = dict_get_int32 (dict, "port", &port); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get port"); + goto out; + } + + ret = dict_get_int32 (dict, "flags", &flags); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get flags"); + goto out; + } + + ret = glusterd_hostname_to_uuid (hostname, uuid); if (ret) { op_errno = GF_DEPROBE_NOT_FRIEND; goto out; @@ -876,7 +1203,7 @@ __glusterd_handle_cli_deprobe (rpcsvc_request_t *req) goto out; } - if (!(cli_req.flags & GF_CLI_FLAG_OP_FORCE)) { + if (!(flags & GF_CLI_FLAG_OP_FORCE)) { if (!uuid_is_null (uuid)) { /* Check if peers are connected, except peer being detached*/ if (!glusterd_chk_peers_connected_befriended (uuid)) { @@ -904,23 +1231,19 @@ __glusterd_handle_cli_deprobe (rpcsvc_request_t *req) } if (!uuid_is_null (uuid)) { - ret = glusterd_deprobe_begin (req, cli_req.hostname, - cli_req.port, uuid); + ret = glusterd_deprobe_begin (req, hostname, port, uuid, dict); } else { - ret = glusterd_deprobe_begin (req, cli_req.hostname, - cli_req.port, NULL); + ret = glusterd_deprobe_begin (req, hostname, port, NULL, dict); } - gf_cmd_log ("peer deprobe", "on host %s:%d %s", cli_req.hostname, - cli_req.port, (ret) ? "FAILED" : "SUCCESS"); out: + free (cli_req.dict.dict_val); + if (ret) { ret = glusterd_xfer_cli_deprobe_resp (req, ret, op_errno, NULL, - cli_req.hostname); + hostname, dict); } - free (cli_req.hostname);//malloced by xdr - glusterd_friend_sm (); glusterd_op_sm (); @@ -1049,27 +1372,32 @@ glusterd_handle_cli_get_volume (rpcsvc_request_t *req) __glusterd_handle_cli_get_volume); } -#ifdef HAVE_BD_XLATOR int -__glusterd_handle_cli_bd_op (rpcsvc_request_t *req) +__glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req) { - int32_t ret = -1; - gf_cli_req cli_req = { {0,} }; - dict_t *dict = NULL; - char *volname = NULL; - char op_errstr[2048] = {0,}; - glusterd_op_t cli_op = GD_OP_BD_OP; + int ret = -1; + dict_t *dict = NULL; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + uuid_t uuid = {0}; + gf_cli_rsp rsp = {0,}; + gf_cli_req cli_req = {{0,}}; + char msg_str[2048] = {0,}; GF_ASSERT (req); + this = THIS; + priv = this->private; + GF_ASSERT (priv); + ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req); if (ret < 0) { - /* failed to decode msg */ + //failed to decode msg; req->rpc_err = GARBAGE_ARGS; goto out; } - gf_log ("glusterd", GF_LOG_DEBUG, "Received bd op req"); + gf_log ("glusterd", GF_LOG_DEBUG, "Received uuid reset req"); if (cli_req.dict.dict_len) { /* Unserialize the dictionary */ @@ -1082,58 +1410,84 @@ __glusterd_handle_cli_bd_op (rpcsvc_request_t *req) gf_log ("glusterd", GF_LOG_ERROR, "failed to " "unserialize req-buffer to dictionary"); + snprintf (msg_str, sizeof (msg_str), "Unable to decode " + "the buffer"); goto out; } else { dict->extra_stdfree = cli_req.dict.dict_val; } } - ret = dict_get_str (dict, "volname", &volname); - if (ret) { - gf_log (THIS->name, GF_LOG_ERROR, - "failed to get volname"); + /* In the above section if dict_unserialize is successful, ret is set + * to zero. + */ + ret = -1; + // Do not allow peer reset if there are any volumes in the cluster + if (!list_empty (&priv->volumes)) { + snprintf (msg_str, sizeof (msg_str), "volumes are already " + "present in the cluster. Resetting uuid is not " + "allowed"); + gf_log (this->name, GF_LOG_WARNING, "%s", msg_str); goto out; } - ret = glusterd_op_begin (req, GD_OP_BD_OP, dict, op_errstr, - sizeof (op_errstr)); - gf_cmd_log ("bd op: %s", ((ret == 0) ? "SUCCESS": "FAILED")); -out: - if (ret && dict) - dict_unref (dict); + // Do not allow peer reset if trusted storage pool is already formed + if (!list_empty (&priv->peers)) { + snprintf (msg_str, sizeof (msg_str),"trusted storage pool " + "has been already formed. Please detach this peer " + "from the pool and reset its uuid."); + gf_log (this->name, GF_LOG_WARNING, "%s", msg_str); + goto out; + } - glusterd_friend_sm (); - glusterd_op_sm (); + uuid_copy (uuid, priv->uuid); + ret = glusterd_uuid_generate_save (); + + if (!uuid_compare (uuid, MY_UUID)) { + snprintf (msg_str, sizeof (msg_str), "old uuid and the new uuid" + " are same. Try gluster peer reset again"); + gf_log (this->name, GF_LOG_ERROR, "%s", msg_str); + ret = -1; + goto out; + } +out: if (ret) { - if (op_errstr[0] == '\0') - snprintf (op_errstr, sizeof (op_errstr), - "Operation failed"); - ret = glusterd_op_send_cli_response (cli_op, ret, 0, - req, NULL, op_errstr); + rsp.op_ret = -1; + if (msg_str[0] == '\0') + snprintf (msg_str, sizeof (msg_str), "Operation " + "failed"); + rsp.op_errstr = msg_str; + ret = 0; + } else { + rsp.op_errstr = ""; } + glusterd_to_cli (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gf_cli_rsp, dict); + return ret; } int -glusterd_handle_cli_bd_op (rpcsvc_request_t *req) +glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req) { - return glusterd_big_locked_handler (req, __glusterd_handle_cli_bd_op); + return glusterd_big_locked_handler (req, + __glusterd_handle_cli_uuid_reset); } -#endif int -__glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req) +__glusterd_handle_cli_uuid_get (rpcsvc_request_t *req) { - int ret = -1; - dict_t *dict = NULL; - xlator_t *this = NULL; - glusterd_conf_t *priv = NULL; - uuid_t uuid = {0}; - gf_cli_rsp rsp = {0,}; - gf_cli_req cli_req = {{0,}}; + int ret = -1; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + gf_cli_rsp rsp = {0,}; + gf_cli_req cli_req = {{0,}}; char msg_str[2048] = {0,}; + char uuid_str[64] = {0,}; GF_ASSERT (req); @@ -1143,16 +1497,18 @@ __glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req) ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req); if (ret < 0) { - //failed to decode msg; req->rpc_err = GARBAGE_ARGS; goto out; } - gf_log ("glusterd", GF_LOG_DEBUG, "Received uuid reset req"); + gf_log ("glusterd", GF_LOG_DEBUG, "Received uuid get req"); if (cli_req.dict.dict_len) { - /* Unserialize the dictionary */ dict = dict_new (); + if (!dict) { + ret = -1; + goto out; + } ret = dict_unserialize (cli_req.dict.dict_val, cli_req.dict.dict_len, @@ -1164,44 +1520,35 @@ __glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req) snprintf (msg_str, sizeof (msg_str), "Unable to decode " "the buffer"); goto out; + } else { dict->extra_stdfree = cli_req.dict.dict_val; + } } - /* In the above section if dict_unserialize is successful, ret is set - * to zero. - */ - ret = -1; - // Do not allow peer reset if there are any volumes in the cluster - if (!list_empty (&priv->volumes)) { - snprintf (msg_str, sizeof (msg_str), "volumes are already " - "present in the cluster. Resetting uuid is not " - "allowed"); - gf_log (this->name, GF_LOG_WARNING, "%s", msg_str); + rsp_dict = dict_new (); + if (!rsp_dict) { + ret = -1; goto out; } - // Do not allow peer reset if trusted storage pool is already formed - if (!list_empty (&priv->peers)) { - snprintf (msg_str, sizeof (msg_str),"trusted storage pool " - "has been already formed. Please detach this peer " - "from the pool and reset its uuid."); - gf_log (this->name, GF_LOG_WARNING, "%s", msg_str); + uuid_utoa_r (MY_UUID, uuid_str); + ret = dict_set_str (rsp_dict, "uuid", uuid_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set uuid in " + "dictionary."); goto out; } - uuid_copy (uuid, priv->uuid); - ret = glusterd_uuid_generate_save (); - - if (!uuid_compare (uuid, MY_UUID)) { - snprintf (msg_str, sizeof (msg_str), "old uuid and the new uuid" - " are same. Try gluster peer reset again"); - gf_log (this->name, GF_LOG_ERROR, "%s", msg_str); - ret = -1; + ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val, + &rsp.dict.dict_len); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to serialize " + "dictionary."); goto out; } - + ret = 0; out: if (ret) { rsp.op_ret = -1; @@ -1209,22 +1556,22 @@ out: snprintf (msg_str, sizeof (msg_str), "Operation " "failed"); rsp.op_errstr = msg_str; - ret = 0; + } else { rsp.op_errstr = ""; + } glusterd_to_cli (req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_cli_rsp, dict); - return ret; + return 0; } - int -glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req) +glusterd_handle_cli_uuid_get (rpcsvc_request_t *req) { return glusterd_big_locked_handler (req, - __glusterd_handle_cli_uuid_reset); + __glusterd_handle_cli_uuid_get); } int @@ -1548,7 +1895,7 @@ __glusterd_handle_sync_volume (rpcsvc_request_t *req) gf_log (this->name, GF_LOG_INFO, "Received volume sync req " "for volume %s", (flags & GF_CLI_SYNC_ALL) ? "all" : volname); - if (glusterd_is_local_addr (hostname)) { + if (gf_is_local_addr (hostname)) { ret = -1; snprintf (msg, sizeof (msg), "sync from localhost" " not allowed"); @@ -1707,6 +2054,57 @@ glusterd_op_unlock_send_resp (rpcsvc_request_t *req, int32_t status) } int +glusterd_op_mgmt_v3_lock_send_resp (rpcsvc_request_t *req, uuid_t *txn_id, + int32_t status) +{ + + gd1_mgmt_v3_lock_rsp rsp = {{0},}; + int ret = -1; + + GF_ASSERT (req); + GF_ASSERT (txn_id); + glusterd_get_uuid (&rsp.uuid); + rsp.op_ret = status; + if (rsp.op_ret) + rsp.op_errno = errno; + uuid_copy (rsp.txn_id, *txn_id); + + ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp); + + gf_log (THIS->name, GF_LOG_DEBUG, "Responded to mgmt_v3 lock, ret: %d", + ret); + + return ret; +} + +int +glusterd_op_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req, uuid_t *txn_id, + int32_t status) +{ + + gd1_mgmt_v3_unlock_rsp rsp = {{0},}; + int ret = -1; + + GF_ASSERT (req); + GF_ASSERT (txn_id); + rsp.op_ret = status; + if (rsp.op_ret) + rsp.op_errno = errno; + glusterd_get_uuid (&rsp.uuid); + uuid_copy (rsp.txn_id, *txn_id); + + ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp); + + gf_log (THIS->name, GF_LOG_DEBUG, + "Responded to mgmt_v3 unlock, ret: %d", + ret); + + return ret; +} + +int __glusterd_handle_cluster_unlock (rpcsvc_request_t *req) { gd1_mgmt_cluster_unlock_req unlock_req = {{0}, }; @@ -1714,6 +2112,7 @@ __glusterd_handle_cluster_unlock (rpcsvc_request_t *req) glusterd_op_lock_ctx_t *ctx = NULL; glusterd_peerinfo_t *peerinfo = NULL; xlator_t *this = NULL; + uuid_t *txn_id = &global_txn_id; this = THIS; GF_ASSERT (this); @@ -1748,8 +2147,9 @@ __glusterd_handle_cluster_unlock (rpcsvc_request_t *req) } uuid_copy (ctx->uuid, unlock_req.uuid); ctx->req = req; + ctx->dict = NULL; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK, ctx); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK, txn_id, ctx); out: glusterd_friend_sm (); @@ -2340,8 +2740,10 @@ __glusterd_handle_mount (rpcsvc_request_t *req) gf1_cli_mount_rsp rsp = {0,}; dict_t *dict = NULL; int ret = 0; + glusterd_conf_t *priv = NULL; GF_ASSERT (req); + priv = THIS->private; ret = xdr_to_generic (req->msg[0], &mnt_req, (xdrproc_t)xdr_gf1_cli_mount_req); @@ -2374,8 +2776,10 @@ __glusterd_handle_mount (rpcsvc_request_t *req) } } + synclock_unlock (&priv->big_lock); rsp.op_ret = glusterd_do_mount (mnt_req.label, dict, &rsp.path, &rsp.op_errno); + synclock_lock (&priv->big_lock); out: if (!rsp.path) @@ -2725,7 +3129,8 @@ out: } int -glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port) +glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port, + dict_t *dict) { int ret = -1; glusterd_peerinfo_t *peerinfo = NULL; @@ -2741,6 +3146,7 @@ glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port) " for host: %s (%d)", hoststr, port); args.mode = GD_MODE_ON; args.req = req; + args.dict = dict; ret = glusterd_friend_add ((char *)hoststr, port, GD_FRIEND_STATE_DEFAULT, NULL, &peerinfo, 0, &args); @@ -2762,11 +3168,11 @@ glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port) ret = glusterd_friend_sm_inject_event (event); glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_SUCCESS, NULL, (char*)hoststr, - port); + port, dict); } } else { glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_FRIEND, NULL, - (char*)hoststr, port); + (char*)hoststr, port, dict); } out: @@ -2776,7 +3182,7 @@ out: int glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port, - uuid_t uuid) + uuid_t uuid, dict_t *dict) { int ret = -1; glusterd_peerinfo_t *peerinfo = NULL; @@ -2817,6 +3223,7 @@ glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port, ctx->hostname = gf_strdup (hoststr); ctx->port = port; ctx->req = req; + ctx->dict = dict; event->ctx = ctx; @@ -2895,49 +3302,205 @@ glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *myhostname, return ret; } +static void +set_probe_error_str (int op_ret, int op_errno, char *op_errstr, char *errstr, + size_t len, char *hostname, int port) +{ + if ((op_errstr) && (strcmp (op_errstr, ""))) { + snprintf (errstr, len, "%s", op_errstr); + return; + } + + if (!op_ret) { + switch (op_errno) { + case GF_PROBE_LOCALHOST: + snprintf (errstr, len, "Probe on localhost not " + "needed"); + break; + + case GF_PROBE_FRIEND: + snprintf (errstr, len, "Host %s port %d already" + " in peer list", hostname, port); + break; + + default: + if (op_errno != 0) + snprintf (errstr, len, "Probe returned " + "with unknown errno %d", + op_errno); + break; + } + } else { + switch (op_errno) { + case GF_PROBE_ANOTHER_CLUSTER: + snprintf (errstr, len, "%s is already part of " + "another cluster", hostname); + break; + + case GF_PROBE_VOLUME_CONFLICT: + snprintf (errstr, len, "Atleast one volume on " + "%s conflicts with existing volumes " + "in the cluster", hostname); + break; + + case GF_PROBE_UNKNOWN_PEER: + snprintf (errstr, len, "%s responded with " + "'unknown peer' error, this could " + "happen if %s doesn't have localhost " + "in its peer database", hostname, + hostname); + break; + + case GF_PROBE_ADD_FAILED: + snprintf (errstr, len, "Failed to add peer " + "information on %s", hostname); + break; + + case GF_PROBE_SAME_UUID: + snprintf (errstr, len, "Peer uuid (host %s) is " + "same as local uuid", hostname); + break; + + case GF_PROBE_QUORUM_NOT_MET: + snprintf (errstr, len, "Cluster quorum is not " + "met. Changing peers is not allowed " + "in this state"); + break; + + default: + snprintf (errstr, len, "Probe returned with " + "unknown errno %d", op_errno); + break; + } + } +} + int glusterd_xfer_cli_probe_resp (rpcsvc_request_t *req, int32_t op_ret, int32_t op_errno, char *op_errstr, char *hostname, - int port) + int port, dict_t *dict) { - gf1_cli_probe_rsp rsp = {0, }; + gf_cli_rsp rsp = {0,}; int32_t ret = -1; + char errstr[2048] = {0,}; + char *cmd_str = NULL; + xlator_t *this = THIS; GF_ASSERT (req); + GF_ASSERT (this); + + (void) set_probe_error_str (op_ret, op_errno, op_errstr, errstr, + sizeof (errstr), hostname, port); + + if (dict) { + ret = dict_get_str (dict, "cmd-str", &cmd_str); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "command string"); + } rsp.op_ret = op_ret; rsp.op_errno = op_errno; - rsp.op_errstr = op_errstr ? op_errstr : ""; - rsp.hostname = hostname; - rsp.port = port; + rsp.op_errstr = (errstr[0] != '\0') ? errstr : ""; + + gf_cmd_log ("", "%s : %s %s %s", cmd_str, + (op_ret) ? "FAILED" : "SUCCESS", + (errstr[0] != '\0') ? ":" : " ", + (errstr[0] != '\0') ? errstr : " "); ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, - (xdrproc_t)xdr_gf1_cli_probe_rsp); + (xdrproc_t)xdr_gf_cli_rsp); - gf_log ("glusterd", GF_LOG_INFO, "Responded to CLI, ret: %d",ret); + if (dict) + dict_unref (dict); + gf_log (this->name, GF_LOG_DEBUG, "Responded to CLI, ret: %d",ret); return ret; } +static void +set_deprobe_error_str (int op_ret, int op_errno, char *op_errstr, char *errstr, + size_t len, char *hostname) +{ + if ((op_errstr) && (strcmp (op_errstr, ""))) { + snprintf (errstr, len, "%s", op_errstr); + return; + } + + if (op_ret) { + switch (op_errno) { + case GF_DEPROBE_LOCALHOST: + snprintf (errstr, len, "%s is localhost", + hostname); + break; + + case GF_DEPROBE_NOT_FRIEND: + snprintf (errstr, len, "%s is not part of " + "cluster", hostname); + break; + + case GF_DEPROBE_BRICK_EXIST: + snprintf (errstr, len, "Brick(s) with the peer " + "%s exist in cluster", hostname); + break; + + case GF_DEPROBE_FRIEND_DOWN: + snprintf (errstr, len, "One of the peers is " + "probably down. Check with " + "'peer status'"); + break; + + case GF_DEPROBE_QUORUM_NOT_MET: + snprintf (errstr, len, "Cluster quorum is not " + "met. Changing peers is not allowed " + "in this state"); + break; + + default: + snprintf (errstr, len, "Detach returned with " + "unknown errno %d", op_errno); + break; + + } + } +} + + int glusterd_xfer_cli_deprobe_resp (rpcsvc_request_t *req, int32_t op_ret, int32_t op_errno, char *op_errstr, - char *hostname) + char *hostname, dict_t *dict) { - gf1_cli_deprobe_rsp rsp = {0, }; + gf_cli_rsp rsp = {0,}; int32_t ret = -1; + char *cmd_str = NULL; + char errstr[2048] = {0,}; GF_ASSERT (req); + (void) set_deprobe_error_str (op_ret, op_errno, op_errstr, errstr, + sizeof (errstr), hostname); + + if (dict) { + ret = dict_get_str (dict, "cmd-str", &cmd_str); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, "Failed to get " + "command string"); + } + rsp.op_ret = op_ret; rsp.op_errno = op_errno; - rsp.op_errstr = op_errstr ? op_errstr : ""; - rsp.hostname = hostname; + rsp.op_errstr = (errstr[0] != '\0') ? errstr : ""; + + gf_cmd_log ("", "%s : %s %s %s", cmd_str, + (op_ret) ? "FAILED" : "SUCCESS", + (errstr[0] != '\0') ? ":" : " ", + (errstr[0] != '\0') ? errstr : " "); ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, - (xdrproc_t)xdr_gf1_cli_deprobe_rsp); + (xdrproc_t)xdr_gf_cli_rsp); - gf_log ("glusterd", GF_LOG_INFO, "Responded to CLI, ret: %d",ret); + gf_log (THIS->name, GF_LOG_DEBUG, "Responded to CLI, ret: %d",ret); return ret; } @@ -3279,6 +3842,47 @@ glusterd_handle_cli_clearlocks_volume (rpcsvc_request_t *req) __glusterd_handle_cli_clearlocks_volume); } +static int +get_brickinfo_from_brickid (char *brickid, glusterd_brickinfo_t **brickinfo) +{ + glusterd_volinfo_t *volinfo = NULL; + char *volid_str = NULL; + char *brick = NULL; + char *brickid_dup = NULL; + uuid_t volid = {0}; + int ret = -1; + + brickid_dup = gf_strdup (brickid); + if (!brickid_dup) + goto out; + + volid_str = brickid_dup; + brick = strchr (brickid_dup, ':'); + *brick = '\0'; + brick++; + if (!volid_str || !brick) + goto out; + + uuid_parse (volid_str, volid); + ret = glusterd_volinfo_find_by_volume_id (volid, &volinfo); + if (ret) { + /* Check if it a snapshot volume */ + ret = glusterd_snap_volinfo_find_by_volume_id (volid, &volinfo); + if (ret) + goto out; + } + + ret = glusterd_volume_brickinfo_get_by_brick (brick, volinfo, + brickinfo); + if (ret) + goto out; + + ret = 0; +out: + GF_FREE (brickid_dup); + return ret; +} + int __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, void *data) @@ -3286,10 +3890,15 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, xlator_t *this = NULL; glusterd_conf_t *conf = NULL; int ret = 0; + char *brickid = NULL; glusterd_brickinfo_t *brickinfo = NULL; - brickinfo = mydata; - if (!brickinfo) + brickid = mydata; + if (!brickid) + return 0; + + ret = get_brickinfo_from_brickid (brickid, &brickinfo); + if (ret) return 0; this = THIS; @@ -3299,15 +3908,21 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata, switch (event) { case RPC_CLNT_CONNECT: - gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_CONNECT"); + gf_log (this->name, GF_LOG_DEBUG, "Connected to %s:%s", + brickinfo->hostname, brickinfo->path); glusterd_set_brick_status (brickinfo, GF_BRICK_STARTED); ret = default_notify (this, GF_EVENT_CHILD_UP, NULL); break; case RPC_CLNT_DISCONNECT: - gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_DISCONNECT"); + if (GF_BRICK_STARTED == brickinfo->status) + gf_log (this->name, GF_LOG_INFO, "Disconnected from " + "%s:%s", brickinfo->hostname, brickinfo->path); + glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED); + if (rpc_clnt_is_disabled (rpc)) + GF_FREE (brickid); break; default: @@ -3383,11 +3998,13 @@ glusterd_friend_remove_notify (glusterd_peerctx_t *peerctx) glusterd_peerinfo_t *peerinfo = peerctx->peerinfo; rpcsvc_request_t *req = peerctx->args.req; char *errstr = peerctx->errstr; + dict_t *dict = NULL; GF_ASSERT (peerctx); peerinfo = peerctx->peerinfo; req = peerctx->args.req; + dict = peerctx->args.dict; errstr = peerctx->errstr; ret = glusterd_friend_sm_new_event (GD_FRIEND_EVENT_REMOVE_FRIEND, @@ -3401,7 +4018,8 @@ glusterd_friend_remove_notify (glusterd_peerctx_t *peerctx) } glusterd_xfer_cli_probe_resp (req, -1, ENOTCONN, errstr, - peerinfo->hostname, peerinfo->port); + peerinfo->hostname, + peerinfo->port, dict); new_event->peerinfo = peerinfo; ret = glusterd_friend_sm_inject_event (new_event); @@ -3426,6 +4044,7 @@ __glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata, glusterd_peerinfo_t *peerinfo = NULL; glusterd_peerctx_t *peerctx = NULL; gf_boolean_t quorum_action = _gf_false; + glusterd_volinfo_t *volinfo = NULL; peerctx = mydata; if (!peerctx) @@ -3453,6 +4072,20 @@ __glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata, gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_DISCONNECT %d", peerinfo->state.state); + if (peerinfo->connected) { + list_for_each_entry (volinfo, &conf->volumes, vol_list) { + ret = glusterd_mgmt_v3_unlock (volinfo->volname, + peerinfo->uuid, + "vol"); + if (ret) + gf_log (this->name, GF_LOG_TRACE, + "Lock not released for %s", + volinfo->volname); + } + + ret = 0; + } + if ((peerinfo->quorum_contrib != QUORUM_DOWN) && (peerinfo->state.state == GD_FRIEND_STATE_BEFRIENDED)) { peerinfo->quorum_contrib = QUORUM_DOWN; @@ -3502,11 +4135,11 @@ glusterd_null (rpcsvc_request_t *req) } rpcsvc_actor_t gd_svc_mgmt_actors[] = { - [GLUSTERD_MGMT_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0}, - [GLUSTERD_MGMT_CLUSTER_LOCK] = { "CLUSTER_LOCK", GLUSTERD_MGMT_CLUSTER_LOCK, glusterd_handle_cluster_lock, NULL, 0}, - [GLUSTERD_MGMT_CLUSTER_UNLOCK] = { "CLUSTER_UNLOCK", GLUSTERD_MGMT_CLUSTER_UNLOCK, glusterd_handle_cluster_unlock, NULL, 0}, - [GLUSTERD_MGMT_STAGE_OP] = { "STAGE_OP", GLUSTERD_MGMT_STAGE_OP, glusterd_handle_stage_op, NULL, 0}, - [GLUSTERD_MGMT_COMMIT_OP] = { "COMMIT_OP", GLUSTERD_MGMT_COMMIT_OP, glusterd_handle_commit_op, NULL, 0}, + [GLUSTERD_MGMT_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_CLUSTER_LOCK] = { "CLUSTER_LOCK", GLUSTERD_MGMT_CLUSTER_LOCK, glusterd_handle_cluster_lock, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_CLUSTER_UNLOCK] = { "CLUSTER_UNLOCK", GLUSTERD_MGMT_CLUSTER_UNLOCK, glusterd_handle_cluster_unlock, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_STAGE_OP] = { "STAGE_OP", GLUSTERD_MGMT_STAGE_OP, glusterd_handle_stage_op, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_COMMIT_OP] = { "COMMIT_OP", GLUSTERD_MGMT_COMMIT_OP, glusterd_handle_commit_op, NULL, 0, DRC_NA}, }; struct rpcsvc_program gd_svc_mgmt_prog = { @@ -3519,11 +4152,11 @@ struct rpcsvc_program gd_svc_mgmt_prog = { }; rpcsvc_actor_t gd_svc_peer_actors[] = { - [GLUSTERD_FRIEND_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0}, - [GLUSTERD_PROBE_QUERY] = { "PROBE_QUERY", GLUSTERD_PROBE_QUERY, glusterd_handle_probe_query, NULL, 0}, - [GLUSTERD_FRIEND_ADD] = { "FRIEND_ADD", GLUSTERD_FRIEND_ADD, glusterd_handle_incoming_friend_req, NULL, 0}, - [GLUSTERD_FRIEND_REMOVE] = { "FRIEND_REMOVE", GLUSTERD_FRIEND_REMOVE, glusterd_handle_incoming_unfriend_req, NULL, 0}, - [GLUSTERD_FRIEND_UPDATE] = { "FRIEND_UPDATE", GLUSTERD_FRIEND_UPDATE, glusterd_handle_friend_update, NULL, 0}, + [GLUSTERD_FRIEND_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0, DRC_NA}, + [GLUSTERD_PROBE_QUERY] = { "PROBE_QUERY", GLUSTERD_PROBE_QUERY, glusterd_handle_probe_query, NULL, 0, DRC_NA}, + [GLUSTERD_FRIEND_ADD] = { "FRIEND_ADD", GLUSTERD_FRIEND_ADD, glusterd_handle_incoming_friend_req, NULL, 0, DRC_NA}, + [GLUSTERD_FRIEND_REMOVE] = { "FRIEND_REMOVE", GLUSTERD_FRIEND_REMOVE, glusterd_handle_incoming_unfriend_req, NULL, 0, DRC_NA}, + [GLUSTERD_FRIEND_UPDATE] = { "FRIEND_UPDATE", GLUSTERD_FRIEND_UPDATE, glusterd_handle_friend_update, NULL, 0, DRC_NA}, }; struct rpcsvc_program gd_svc_peer_prog = { @@ -3538,38 +4171,39 @@ struct rpcsvc_program gd_svc_peer_prog = { rpcsvc_actor_t gd_svc_cli_actors[] = { - [GLUSTER_CLI_PROBE] = { "CLI_PROBE", GLUSTER_CLI_PROBE, glusterd_handle_cli_probe, NULL, 0}, - [GLUSTER_CLI_CREATE_VOLUME] = { "CLI_CREATE_VOLUME", GLUSTER_CLI_CREATE_VOLUME, glusterd_handle_create_volume, NULL, 0}, - [GLUSTER_CLI_DEFRAG_VOLUME] = { "CLI_DEFRAG_VOLUME", GLUSTER_CLI_DEFRAG_VOLUME, glusterd_handle_defrag_volume, NULL, 0}, - [GLUSTER_CLI_DEPROBE] = { "FRIEND_REMOVE", GLUSTER_CLI_DEPROBE, glusterd_handle_cli_deprobe, NULL, 0}, - [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0}, - [GLUSTER_CLI_UUID_RESET] = { "UUID_RESET", GLUSTER_CLI_UUID_RESET, glusterd_handle_cli_uuid_reset, NULL, 0}, - [GLUSTER_CLI_START_VOLUME] = { "START_VOLUME", GLUSTER_CLI_START_VOLUME, glusterd_handle_cli_start_volume, NULL, 0}, - [GLUSTER_CLI_STOP_VOLUME] = { "STOP_VOLUME", GLUSTER_CLI_STOP_VOLUME, glusterd_handle_cli_stop_volume, NULL, 0}, - [GLUSTER_CLI_DELETE_VOLUME] = { "DELETE_VOLUME", GLUSTER_CLI_DELETE_VOLUME, glusterd_handle_cli_delete_volume, NULL, 0}, - [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0}, - [GLUSTER_CLI_ADD_BRICK] = { "ADD_BRICK", GLUSTER_CLI_ADD_BRICK, glusterd_handle_add_brick, NULL, 0}, - [GLUSTER_CLI_REPLACE_BRICK] = { "REPLACE_BRICK", GLUSTER_CLI_REPLACE_BRICK, glusterd_handle_replace_brick, NULL, 0}, - [GLUSTER_CLI_REMOVE_BRICK] = { "REMOVE_BRICK", GLUSTER_CLI_REMOVE_BRICK, glusterd_handle_remove_brick, NULL, 0}, - [GLUSTER_CLI_LOG_ROTATE] = { "LOG FILENAME", GLUSTER_CLI_LOG_ROTATE, glusterd_handle_log_rotate, NULL, 0}, - [GLUSTER_CLI_SET_VOLUME] = { "SET_VOLUME", GLUSTER_CLI_SET_VOLUME, glusterd_handle_set_volume, NULL, 0}, - [GLUSTER_CLI_SYNC_VOLUME] = { "SYNC_VOLUME", GLUSTER_CLI_SYNC_VOLUME, glusterd_handle_sync_volume, NULL, 0}, - [GLUSTER_CLI_RESET_VOLUME] = { "RESET_VOLUME", GLUSTER_CLI_RESET_VOLUME, glusterd_handle_reset_volume, NULL, 0}, - [GLUSTER_CLI_FSM_LOG] = { "FSM_LOG", GLUSTER_CLI_FSM_LOG, glusterd_handle_fsm_log, NULL, 0}, - [GLUSTER_CLI_GSYNC_SET] = { "GSYNC_SET", GLUSTER_CLI_GSYNC_SET, glusterd_handle_gsync_set, NULL, 0}, - [GLUSTER_CLI_PROFILE_VOLUME] = { "STATS_VOLUME", GLUSTER_CLI_PROFILE_VOLUME, glusterd_handle_cli_profile_volume, NULL, 0}, - [GLUSTER_CLI_QUOTA] = { "QUOTA", GLUSTER_CLI_QUOTA, glusterd_handle_quota, NULL, 0}, - [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1}, - [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0}, - [GLUSTER_CLI_MOUNT] = { "MOUNT", GLUSTER_CLI_MOUNT, glusterd_handle_mount, NULL, 1}, - [GLUSTER_CLI_UMOUNT] = { "UMOUNT", GLUSTER_CLI_UMOUNT, glusterd_handle_umount, NULL, 1}, - [GLUSTER_CLI_HEAL_VOLUME] = { "HEAL_VOLUME", GLUSTER_CLI_HEAL_VOLUME, glusterd_handle_cli_heal_volume, NULL, 0}, - [GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", GLUSTER_CLI_STATEDUMP_VOLUME, glusterd_handle_cli_statedump_volume, NULL, 0}, - [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0}, - [GLUSTER_CLI_CLRLOCKS_VOLUME] = {"CLEARLOCKS_VOLUME", GLUSTER_CLI_CLRLOCKS_VOLUME, glusterd_handle_cli_clearlocks_volume, NULL, 0}, -#ifdef HAVE_BD_XLATOR - [GLUSTER_CLI_BD_OP] = {"BD_OP", GLUSTER_CLI_BD_OP, glusterd_handle_cli_bd_op, NULL, 0}, -#endif + [GLUSTER_CLI_PROBE] = { "CLI_PROBE", GLUSTER_CLI_PROBE, glusterd_handle_cli_probe, NULL, 0, DRC_NA}, + [GLUSTER_CLI_CREATE_VOLUME] = { "CLI_CREATE_VOLUME", GLUSTER_CLI_CREATE_VOLUME, glusterd_handle_create_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_DEFRAG_VOLUME] = { "CLI_DEFRAG_VOLUME", GLUSTER_CLI_DEFRAG_VOLUME, glusterd_handle_defrag_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_DEPROBE] = { "FRIEND_REMOVE", GLUSTER_CLI_DEPROBE, glusterd_handle_cli_deprobe, NULL, 0, DRC_NA}, + [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0, DRC_NA}, + [GLUSTER_CLI_UUID_RESET] = { "UUID_RESET", GLUSTER_CLI_UUID_RESET, glusterd_handle_cli_uuid_reset, NULL, 0, DRC_NA}, + [GLUSTER_CLI_UUID_GET] = { "UUID_GET", GLUSTER_CLI_UUID_GET, glusterd_handle_cli_uuid_get, NULL, 0, DRC_NA}, + [GLUSTER_CLI_START_VOLUME] = { "START_VOLUME", GLUSTER_CLI_START_VOLUME, glusterd_handle_cli_start_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_STOP_VOLUME] = { "STOP_VOLUME", GLUSTER_CLI_STOP_VOLUME, glusterd_handle_cli_stop_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_DELETE_VOLUME] = { "DELETE_VOLUME", GLUSTER_CLI_DELETE_VOLUME, glusterd_handle_cli_delete_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_ADD_BRICK] = { "ADD_BRICK", GLUSTER_CLI_ADD_BRICK, glusterd_handle_add_brick, NULL, 0, DRC_NA}, + [GLUSTER_CLI_REPLACE_BRICK] = { "REPLACE_BRICK", GLUSTER_CLI_REPLACE_BRICK, glusterd_handle_replace_brick, NULL, 0, DRC_NA}, + [GLUSTER_CLI_REMOVE_BRICK] = { "REMOVE_BRICK", GLUSTER_CLI_REMOVE_BRICK, glusterd_handle_remove_brick, NULL, 0, DRC_NA}, + [GLUSTER_CLI_LOG_ROTATE] = { "LOG FILENAME", GLUSTER_CLI_LOG_ROTATE, glusterd_handle_log_rotate, NULL, 0, DRC_NA}, + [GLUSTER_CLI_SET_VOLUME] = { "SET_VOLUME", GLUSTER_CLI_SET_VOLUME, glusterd_handle_set_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_SYNC_VOLUME] = { "SYNC_VOLUME", GLUSTER_CLI_SYNC_VOLUME, glusterd_handle_sync_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_RESET_VOLUME] = { "RESET_VOLUME", GLUSTER_CLI_RESET_VOLUME, glusterd_handle_reset_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_FSM_LOG] = { "FSM_LOG", GLUSTER_CLI_FSM_LOG, glusterd_handle_fsm_log, NULL, 0, DRC_NA}, + [GLUSTER_CLI_GSYNC_SET] = { "GSYNC_SET", GLUSTER_CLI_GSYNC_SET, glusterd_handle_gsync_set, NULL, 0, DRC_NA}, + [GLUSTER_CLI_PROFILE_VOLUME] = { "STATS_VOLUME", GLUSTER_CLI_PROFILE_VOLUME, glusterd_handle_cli_profile_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_QUOTA] = { "QUOTA", GLUSTER_CLI_QUOTA, glusterd_handle_quota, NULL, 0, DRC_NA}, + [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1, DRC_NA}, + [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_MOUNT] = { "MOUNT", GLUSTER_CLI_MOUNT, glusterd_handle_mount, NULL, 1, DRC_NA}, + [GLUSTER_CLI_UMOUNT] = { "UMOUNT", GLUSTER_CLI_UMOUNT, glusterd_handle_umount, NULL, 1, DRC_NA}, + [GLUSTER_CLI_HEAL_VOLUME] = { "HEAL_VOLUME", GLUSTER_CLI_HEAL_VOLUME, glusterd_handle_cli_heal_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", GLUSTER_CLI_STATEDUMP_VOLUME, glusterd_handle_cli_statedump_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_CLRLOCKS_VOLUME] = {"CLEARLOCKS_VOLUME", GLUSTER_CLI_CLRLOCKS_VOLUME, glusterd_handle_cli_clearlocks_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_COPY_FILE] = {"COPY_FILE", GLUSTER_CLI_COPY_FILE, glusterd_handle_copy_file, NULL, 0, DRC_NA}, + [GLUSTER_CLI_SYS_EXEC] = {"SYS_EXEC", GLUSTER_CLI_SYS_EXEC, glusterd_handle_sys_exec, NULL, 0, DRC_NA}, + [GLUSTER_CLI_SNAP] = {"SNAP", GLUSTER_CLI_SNAP, glusterd_handle_snapshot, NULL, 0, DRC_NA}, }; struct rpcsvc_program gd_svc_cli_prog = { @@ -3580,3 +4214,24 @@ struct rpcsvc_program gd_svc_cli_prog = { .actors = gd_svc_cli_actors, .synctask = _gf_true, }; + +/* This is a minimal RPC prog, which contains only the readonly RPC procs from + * the cli rpcsvc + */ +rpcsvc_actor_t gd_svc_cli_actors_ro[] = { + [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0, DRC_NA}, + [GLUSTER_CLI_UUID_GET] = { "UUID_GET", GLUSTER_CLI_UUID_GET, glusterd_handle_cli_uuid_get, NULL, 0, DRC_NA}, + [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1, DRC_NA}, + [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0, DRC_NA}, + [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0, DRC_NA}, +}; + +struct rpcsvc_program gd_svc_cli_prog_ro = { + .progname = "GlusterD svc cli read-only", + .prognum = GLUSTER_CLI_PROGRAM, + .progver = GLUSTER_CLI_VERSION, + .numactors = GLUSTER_CLI_MAXVALUE, + .actors = gd_svc_cli_actors_ro, + .synctask = _gf_true, +}; diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c index 9124c46ee..0f0357c4c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handshake.c +++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c @@ -30,26 +30,147 @@ extern struct rpc_clnt_program gd_peer_prog; extern struct rpc_clnt_program gd_mgmt_prog; +extern struct rpc_clnt_program gd_mgmt_v3_prog; +extern struct rpc_clnt_program gd_mgmt_v3_prog; + #define TRUSTED_PREFIX "trusted-" typedef ssize_t (*gfs_serialize_t) (struct iovec outmsg, void *data); +static int +get_snap_volname_and_volinfo (const char *volpath, char **volname, + glusterd_volinfo_t **volinfo) +{ + int ret = -1; + char *save_ptr = NULL; + char *str_token = NULL; + char *snapname = NULL; + char *volname_token = NULL; + char *vol = NULL; + glusterd_snap_t *snap = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (volpath); + GF_ASSERT (volinfo); + + str_token = gf_strdup (volpath); + if (NULL == str_token) { + goto out; + } + + /* Input volname will have below formats: + * /snaps/<snapname>/<volname>.<hostname> + * or + * /snaps/<snapname>/<parent-volname> + * We need to extract snapname and parent_volname */ + + /*split string by "/" */ + strtok_r (str_token, "/", &save_ptr); + snapname = strtok_r(NULL, "/", &save_ptr); + if (!snapname) { + gf_log(this->name, GF_LOG_ERROR, "Invalid path: %s", volpath); + goto out; + } + + volname_token = strtok_r(NULL, "/", &save_ptr); + if (!volname_token) { + gf_log(this->name, GF_LOG_ERROR, "Invalid path: %s", volpath); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (!snap) { + gf_log(this->name, GF_LOG_ERROR, "Failed to " + "fetch snap %s", snapname); + goto out; + } + + /* Find if its a parent volume name or snap volume + * name. This function will succeed if volname_token + * is a parent volname + */ + ret = glusterd_volinfo_find (volname_token, volinfo); + if (ret) { + *volname = gf_strdup (volname_token); + if (NULL == *volname) { + ret = -1; + goto out; + } + + ret = glusterd_snap_volinfo_find (volname_token, snap, + volinfo); + if (ret) { + /* Split the volume name */ + vol = strtok_r (volname_token, ".", &save_ptr); + if (!vol) { + gf_log(this->name, GF_LOG_ERROR, "Invalid " + "volname (%s)", volname_token); + goto out; + } + + ret = glusterd_snap_volinfo_find (vol, snap, volinfo); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Failed to " + "fetch snap volume from volname (%s)", + vol); + goto out; + } + } + } else { + /*volname_token is parent volname*/ + ret = glusterd_snap_volinfo_find_from_parent_volname ( + volname_token, snap, volinfo); + if (ret) { + gf_log(this->name, GF_LOG_ERROR, "Failed to " + "fetch snap volume from parent " + "volname (%s)", volname_token); + goto out; + } + + /* Since volname_token is a parent volname we should + * get the snap volname here*/ + *volname = gf_strdup ((*volinfo)->volname); + if (NULL == *volname) { + ret = -1; + goto out; + } + } + +out: + if (ret && NULL != *volname) { + GF_FREE (*volname); + *volname = NULL; + } + return ret; +} + static size_t build_volfile_path (const char *volname, char *path, size_t path_len, char *trusted_str) { - struct stat stbuf = {0,}; - int32_t ret = -1; - glusterd_conf_t *priv = NULL; - char *vol = NULL; - char *dup_volname = NULL; - char *free_ptr = NULL; - char *tmp = NULL; - glusterd_volinfo_t *volinfo = NULL; - char *server = NULL; - - priv = THIS->private; + struct stat stbuf = {0,}; + int32_t ret = -1; + glusterd_conf_t *priv = NULL; + char *vol = NULL; + char *dup_volname = NULL; + char *free_ptr = NULL; + char *save_ptr = NULL; + char *str_token = NULL; + glusterd_volinfo_t *volinfo = NULL; + char *server = NULL; + const char *volname_ptr = NULL; + char path_prefix [PATH_MAX] = {0,}; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (volname); + GF_ASSERT (path); if (strstr (volname, "gluster/")) { server = strchr (volname, '/') + 1; @@ -57,6 +178,22 @@ build_volfile_path (const char *volname, char *path, path, path_len); ret = 1; goto out; + } else if ((str_token = strstr (volname, "/snaps/"))) { + ret = get_snap_volname_and_volinfo (str_token, &dup_volname, + &volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get snap" + " volinfo from path (%s)", volname); + ret = -1; + goto out; + } + + snprintf (path_prefix, sizeof (path_prefix), "%s/snaps/%s", + priv->workdir, volinfo->snapshot->snapname); + + free_ptr = dup_volname; + volname_ptr = dup_volname; + goto gotvolinfo; } else if (volname[0] != '/') { /* Normal behavior */ dup_volname = gf_strdup (volname); @@ -67,39 +204,53 @@ build_volfile_path (const char *volname, char *path, dup_volname = gf_strdup (&volname[1]); } + if (!dup_volname) { + gf_log(THIS->name, GF_LOG_ERROR, "strdup failed"); + ret = -1; + goto out; + } free_ptr = dup_volname; + volname_ptr = volname; + + snprintf (path_prefix, sizeof (path_prefix), "%s/vols", + priv->workdir); ret = glusterd_volinfo_find (dup_volname, &volinfo); + if (ret) { /* Split the volume name */ - vol = strtok_r (dup_volname, ".", &tmp); + vol = strtok_r (dup_volname, ".", &save_ptr); if (!vol) goto out; + ret = glusterd_volinfo_find (vol, &volinfo); if (ret) goto out; } +gotvolinfo: if (!glusterd_auth_get_username (volinfo)) trusted_str = NULL; - ret = snprintf (path, path_len, "%s/vols/%s/%s.vol", - priv->workdir, volinfo->volname, volname); + ret = snprintf (path, path_len, "%s/%s/%s.vol", path_prefix, + volinfo->volname, volname_ptr); if (ret == -1) goto out; ret = stat (path, &stbuf); if ((ret == -1) && (errno == ENOENT)) { - snprintf (path, path_len, "%s/vols/%s/%s%s-fuse.vol", - priv->workdir, volinfo->volname, - (trusted_str ? trusted_str : ""), dup_volname); + snprintf (path, path_len, "%s/%s/%s%s-fuse.vol", + path_prefix, volinfo->volname, + (trusted_str ? trusted_str : ""), + dup_volname); + ret = stat (path, &stbuf); } if ((ret == -1) && (errno == ENOENT)) { - snprintf (path, path_len, "%s/vols/%s/%s-tcp.vol", - priv->workdir, volinfo->volname, volname); + snprintf (path, path_len, "%s/%s/%s-tcp.vol", + path_prefix, volinfo->volname, volname_ptr); } ret = 1; @@ -244,16 +395,22 @@ __server_getspec (rpcsvc_request_t *req) } trans = req->trans; + /* addrstr will be empty for cli socket connections */ ret = rpcsvc_transport_peername (trans, (char *)&addrstr, sizeof (addrstr)); if (ret) goto fail; - tmp = strrchr (addrstr, ':'); - *tmp = '\0'; + tmp = strrchr (addrstr, ':'); + if (tmp) + *tmp = '\0'; - /* we trust the local admin */ - if (glusterd_is_local_addr (addrstr)) { + /* The trusted volfiles are given to the glusterd owned process like NFS + * server, self-heal daemon etc., so that they are not inadvertently + * blocked by a auth.{allow,reject} setting. The trusted volfile is not + * meant for external users. + */ + if (strlen (addrstr) && gf_is_local_addr (addrstr)) { ret = build_volfile_path (volume, filename, sizeof (filename), @@ -408,12 +565,16 @@ gd_validate_cluster_op_version (xlator_t *this, int cluster_op_version, goto out; } - if (cluster_op_version < conf->op_version) { + /* The peer can only reduce its op-version when it doesn't have any + * volumes. Reducing op-version when it already contains volumes can + * lead to inconsistencies in the cluster + */ + if ((cluster_op_version < conf->op_version) && + !list_empty (&conf->volumes)) { gf_log (this->name, GF_LOG_ERROR, - "operating version %d is less than the currently " - "running version (%d) on the machine (as per peer " - "request from %s)", cluster_op_version, - conf->op_version, peerid); + "cannot reduce operating version to %d from current " + "version %d as volumes exist (as per peer request from " + "%s)", cluster_op_version, conf->op_version, peerid); goto out; } @@ -582,11 +743,9 @@ glusterd_mgmt_hndsk_versions_ack (rpcsvc_request_t *req) } rpcsvc_actor_t gluster_handshake_actors[] = { - [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, NULL, NULL, 0}, - [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, - server_getspec, NULL, 0}, - [GF_HNDSK_EVENT_NOTIFY] = {"EVENTNOTIFY", GF_HNDSK_EVENT_NOTIFY, - server_event_notify, NULL, 0}, + [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, NULL, NULL, 0, DRC_NA}, + [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA}, + [GF_HNDSK_EVENT_NOTIFY] = {"EVENTNOTIFY", GF_HNDSK_EVENT_NOTIFY, server_event_notify, NULL, 0, DRC_NA}, }; @@ -598,6 +757,19 @@ struct rpcsvc_program gluster_handshake_prog = { .numactors = GF_HNDSK_MAXVALUE, }; +/* A minimal RPC program just for the cli getspec command */ +rpcsvc_actor_t gluster_cli_getspec_actors[] = { + [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA}, +}; + +struct rpcsvc_program gluster_cli_getspec_prog = { + .progname = "Gluster Handshake (CLI Getspec)", + .prognum = GLUSTER_HNDSK_PROGRAM, + .progver = GLUSTER_HNDSK_VERSION, + .actors = gluster_cli_getspec_actors, + .numactors = GF_HNDSK_MAXVALUE, +}; + char *glusterd_dump_proc[GF_DUMP_MAXVALUE] = { [GF_DUMP_NULL] = "NULL", @@ -677,6 +849,7 @@ glusterd_event_connected_inject (glusterd_peerctx_t *peerctx) ctx->hostname = gf_strdup (peerinfo->hostname); ctx->port = peerinfo->port; ctx->req = peerctx->args.req; + ctx->dict = peerctx->args.dict; event->peerinfo = peerinfo; event->ctx = ctx; @@ -734,16 +907,6 @@ gd_validate_peer_op_version (xlator_t *this, glusterd_peerinfo_t *peerinfo, goto out; } - /* If peer is already operating at a higher op_version reject it. - * Cluster cannot be moved to higher op_version to accomodate a peer. - */ - if (peer_op_version > conf->op_version) { - ret = gf_asprintf (errstr, "Peer %s is already at a higher " - "op-version", peerinfo->hostname); - ret = -1; - goto out; - } - ret = 0; out: gf_log (this->name , GF_LOG_DEBUG, "Peer %s %s", peerinfo->hostname, @@ -800,6 +963,7 @@ __glusterd_mgmt_hndsk_version_ack_cbk (struct rpc_req *req, struct iovec *iov, */ peerinfo->mgmt = &gd_mgmt_prog; peerinfo->peer = &gd_peer_prog; + peerinfo->mgmt_v3 = &gd_mgmt_v3_prog; ret = default_notify (this, GF_EVENT_CHILD_UP, NULL); @@ -1026,6 +1190,14 @@ glusterd_set_clnt_mgmt_program (glusterd_peerinfo_t *peerinfo, peerinfo->peer->progname, peerinfo->peer->prognum, peerinfo->peer->progver); } + + if (peerinfo->mgmt_v3) { + gf_log ("", GF_LOG_INFO, + "Using Program %s, Num (%d), Version (%d)", + peerinfo->mgmt_v3->progname, peerinfo->mgmt_v3->prognum, + peerinfo->mgmt_v3->progver); + } + ret = 0; out: return ret; diff --git a/xlators/mgmt/glusterd/src/glusterd-hooks.c b/xlators/mgmt/glusterd/src/glusterd-hooks.c index a61e1e85f..2b43a452e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-hooks.c +++ b/xlators/mgmt/glusterd/src/glusterd-hooks.c @@ -49,6 +49,7 @@ char glusterd_hook_dirnames[GD_OP_MAX][256] = [GD_OP_RESET_VOLUME] = EMPTY, [GD_OP_SYNC_VOLUME] = EMPTY, [GD_OP_LOG_ROTATE] = EMPTY, + [GD_OP_GSYNC_CREATE] = "gsync-create", [GD_OP_GSYNC_SET] = EMPTY, [GD_OP_PROFILE_VOLUME] = EMPTY, [GD_OP_QUOTA] = EMPTY, @@ -185,6 +186,7 @@ static int glusterd_hooks_add_op_args (runner_t *runner, glusterd_op_t op, dict_t *op_ctx, glusterd_commit_hook_type_t type) { + char *hooks_args = NULL; int vol_count = 0; gf_boolean_t truth = _gf_false; glusterd_volinfo_t *voliter = NULL; @@ -236,6 +238,18 @@ glusterd_hooks_add_op_args (runner_t *runner, glusterd_op_t op, ret = glusterd_hooks_set_volume_args (op_ctx, runner); break; + case GD_OP_GSYNC_CREATE: + ret = dict_get_str (op_ctx, "hooks_args", &hooks_args); + if (ret) + gf_log ("", GF_LOG_DEBUG, + "No Hooks Arguments."); + else + gf_log ("", GF_LOG_DEBUG, + "Hooks Args = %s", hooks_args); + if (hooks_args) + runner_argprintf (runner, "%s", hooks_args); + break; + default: break; diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.c b/xlators/mgmt/glusterd/src/glusterd-locks.c new file mode 100644 index 000000000..0af2a186f --- /dev/null +++ b/xlators/mgmt/glusterd/src/glusterd-locks.c @@ -0,0 +1,637 @@ +/* + Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "common-utils.h" +#include "cli1-xdr.h" +#include "xdr-generic.h" +#include "glusterd.h" +#include "glusterd-op-sm.h" +#include "glusterd-store.h" +#include "glusterd-utils.h" +#include "glusterd-volgen.h" +#include "glusterd-locks.h" +#include "run.h" +#include "syscall.h" + +#include <signal.h> + +#define GF_MAX_LOCKING_ENTITIES 2 + +/* Valid entities that the mgmt_v3 lock can hold locks upon * + * To add newer entities to be locked, we can just add more * + * entries to this table along with the type and default value */ +valid_entities valid_types[] = { + { "vol", _gf_true }, + { "snap", _gf_false }, + { NULL }, +}; + +static dict_t *mgmt_v3_lock; + +/* Checks if the lock request is for a valid entity */ +gf_boolean_t +glusterd_mgmt_v3_is_type_valid (char *type) +{ + int32_t i = 0; + gf_boolean_t ret = _gf_false; + + GF_ASSERT (type); + + for (i = 0; valid_types[i].type; i++) { + if (!strcmp (type, valid_types[i].type)) { + ret = _gf_true; + break; + } + } + + return ret; +} + +/* Initialize the global mgmt_v3 lock list(dict) when + * glusterd is spawned */ +int32_t +glusterd_mgmt_v3_lock_init () +{ + int32_t ret = -1; + + mgmt_v3_lock = dict_new (); + if (!mgmt_v3_lock) { + ret = -1; + goto out; + } + + ret = 0; +out: + return ret; +} + +/* Destroy the global mgmt_v3 lock list(dict) when + * glusterd cleanup is performed */ +void +glusterd_mgmt_v3_lock_fini () +{ + if (mgmt_v3_lock) + dict_destroy (mgmt_v3_lock); +} + +int32_t +glusterd_get_mgmt_v3_lock_owner (char *key, uuid_t *uuid) +{ + int32_t ret = -1; + mgmt_v3_lock_obj *lock_obj = NULL; + uuid_t no_owner = {"\0"}; + xlator_t *this = NULL; + + GF_ASSERT(THIS); + this = THIS; + + if (!key || !uuid) { + gf_log (this->name, GF_LOG_ERROR, "key or uuid is null."); + ret = -1; + goto out; + } + + ret = dict_get_bin(mgmt_v3_lock, key, (void **) &lock_obj); + if (!ret) + uuid_copy (*uuid, lock_obj->lock_owner); + else + uuid_copy (*uuid, no_owner); + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* This function is called with the locked_count and type, to * + * release all the acquired locks. */ +static int32_t +glusterd_release_multiple_locks_per_entity (dict_t *dict, uuid_t uuid, + int32_t locked_count, + char *type) +{ + char name_buf[PATH_MAX] = ""; + char *name = NULL; + int32_t i = -1; + int32_t op_ret = 0; + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT (dict); + GF_ASSERT (type); + + if (locked_count == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "No %s locked as part of this transaction", + type); + goto out; + } + + /* Release all the locks held */ + for (i = 0; i < locked_count; i++) { + snprintf (name_buf, sizeof(name_buf), + "%sname%d", type, i+1); + + /* Looking for volname1, volname2 or snapname1, * + * as key in the dict snapname2 */ + ret = dict_get_str (dict, name_buf, &name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to get %s locked_count = %d", + name_buf, locked_count); + op_ret = ret; + continue; + } + + ret = glusterd_mgmt_v3_unlock (name, uuid, type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to release lock for %s.", + name); + op_ret = ret; + } + } + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", op_ret); + return op_ret; +} + +/* Given the count and type of the entity this function acquires * + * locks on multiple elements of the same entity. For example: * + * If type is "vol" this function tries to acquire locks on multiple * + * volumes */ +static int32_t +glusterd_acquire_multiple_locks_per_entity (dict_t *dict, uuid_t uuid, + int32_t count, char *type) +{ + char name_buf[PATH_MAX] = ""; + char *name = NULL; + int32_t i = -1; + int32_t ret = -1; + int32_t locked_count = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT (dict); + GF_ASSERT (type); + + /* Locking one element after other */ + for (i = 0; i < count; i++) { + snprintf (name_buf, sizeof(name_buf), + "%sname%d", type, i+1); + + /* Looking for volname1, volname2 or snapname1, * + * as key in the dict snapname2 */ + ret = dict_get_str (dict, name_buf, &name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to get %s count = %d", + name_buf, count); + break; + } + + ret = glusterd_mgmt_v3_lock (name, uuid, type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to acquire lock for %s %s " + "on behalf of %s. Reversing " + "this transaction", type, name, + uuid_utoa(uuid)); + break; + } + locked_count++; + } + + if (count == locked_count) { + /* If all locking ops went successfuly, return as success */ + ret = 0; + goto out; + } + + /* If we failed to lock one element, unlock others and return failure */ + ret = glusterd_release_multiple_locks_per_entity (dict, uuid, + locked_count, + type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to release multiple %s locks", + type); + } + ret = -1; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* Given the type of entity, this function figures out if it should unlock a * + * single element of multiple elements of the said entity. For example: * + * if the type is "vol", this function will accordingly unlock a single volume * + * or multiple volumes */ +static int32_t +glusterd_mgmt_v3_unlock_entity (dict_t *dict, uuid_t uuid, char *type, + gf_boolean_t default_value) +{ + char name_buf[PATH_MAX] = ""; + char *name = NULL; + int32_t count = -1; + int32_t ret = -1; + gf_boolean_t hold_locks = _gf_false; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT (dict); + GF_ASSERT (type); + + snprintf (name_buf, sizeof(name_buf), "hold_%s_locks", type); + hold_locks = dict_get_str_boolean (dict, name_buf, default_value); + + if (hold_locks == _gf_false) { + /* Locks were not held for this particular entity * + * Hence nothing to release */ + ret = 0; + goto out; + } + + /* Looking for volcount or snapcount in the dict */ + snprintf (name_buf, sizeof(name_buf), "%scount", type); + ret = dict_get_int32 (dict, name_buf, &count); + if (ret) { + /* count is not present. Only one * + * element name needs to be unlocked */ + snprintf (name_buf, sizeof(name_buf), "%sname", + type); + ret = dict_get_str (dict, name_buf, &name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to fetch %sname", type); + goto out; + } + + ret = glusterd_mgmt_v3_unlock (name, uuid, type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to release lock for %s %s " + "on behalf of %s.", type, name, + uuid_utoa(uuid)); + goto out; + } + } else { + /* Unlocking one element name after another */ + ret = glusterd_release_multiple_locks_per_entity (dict, + uuid, + count, + type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to release all %s locks", type); + goto out; + } + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* Given the type of entity, this function figures out if it should lock a * + * single element or multiple elements of the said entity. For example: * + * if the type is "vol", this function will accordingly lock a single volume * + * or multiple volumes */ +static int32_t +glusterd_mgmt_v3_lock_entity (dict_t *dict, uuid_t uuid, char *type, + gf_boolean_t default_value) +{ + char name_buf[PATH_MAX] = ""; + char *name = NULL; + int32_t count = -1; + int32_t ret = -1; + gf_boolean_t hold_locks = _gf_false; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT (dict); + GF_ASSERT (type); + + snprintf (name_buf, sizeof(name_buf), "hold_%s_locks", type); + hold_locks = dict_get_str_boolean (dict, name_buf, default_value); + + if (hold_locks == _gf_false) { + /* Not holding locks for this particular entity */ + ret = 0; + goto out; + } + + /* Looking for volcount or snapcount in the dict */ + snprintf (name_buf, sizeof(name_buf), "%scount", type); + ret = dict_get_int32 (dict, name_buf, &count); + if (ret) { + /* count is not present. Only one * + * element name needs to be locked */ + snprintf (name_buf, sizeof(name_buf), "%sname", + type); + ret = dict_get_str (dict, name_buf, &name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to fetch %sname", type); + goto out; + } + + ret = glusterd_mgmt_v3_lock (name, uuid, type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to acquire lock for %s %s " + "on behalf of %s.", type, name, + uuid_utoa(uuid)); + goto out; + } + } else { + /* Locking one element name after another */ + ret = glusterd_acquire_multiple_locks_per_entity (dict, + uuid, + count, + type); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to acquire all %s locks", type); + goto out; + } + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* Try to release locks of multiple entities like * + * volume, snaps etc. */ +int32_t +glusterd_multiple_mgmt_v3_unlock (dict_t *dict, uuid_t uuid) +{ + int32_t i = -1; + int32_t ret = -1; + int32_t op_ret = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, "dict is null."); + ret = -1; + goto out; + } + + for (i = 0; valid_types[i].type; i++) { + ret = glusterd_mgmt_v3_unlock_entity + (dict, uuid, + valid_types[i].type, + valid_types[i].default_value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to unlock all %s", + valid_types[i].type); + op_ret = ret; + } + } + + ret = op_ret; +out: + gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +/* Try to acquire locks on multiple entities like * + * volume, snaps etc. */ +int32_t +glusterd_multiple_mgmt_v3_lock (dict_t *dict, uuid_t uuid) +{ + int32_t i = -1; + int32_t ret = -1; + int32_t locked_count = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, "dict is null."); + ret = -1; + goto out; + } + + /* Locking one entity after other */ + for (i = 0; valid_types[i].type; i++) { + ret = glusterd_mgmt_v3_lock_entity + (dict, uuid, + valid_types[i].type, + valid_types[i].default_value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to lock all %s", + valid_types[i].type); + break; + } + locked_count++; + } + + if (locked_count == GF_MAX_LOCKING_ENTITIES) { + /* If all locking ops went successfuly, return as success */ + ret = 0; + goto out; + } + + /* If we failed to lock one entity, unlock others and return failure */ + for (i = 0; i < locked_count; i++) { + ret = glusterd_mgmt_v3_unlock_entity + (dict, uuid, + valid_types[i].type, + valid_types[i].default_value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to unlock all %s", + valid_types[i].type); + } + } + ret = -1; +out: + gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int32_t +glusterd_mgmt_v3_lock (const char *name, uuid_t uuid, char *type) +{ + char key[PATH_MAX] = ""; + int32_t ret = -1; + mgmt_v3_lock_obj *lock_obj = NULL; + gf_boolean_t is_valid = _gf_true; + uuid_t owner = {0}; + xlator_t *this = NULL; + + GF_ASSERT(THIS); + this = THIS; + + if (!name || !type) { + gf_log (this->name, GF_LOG_ERROR, "name or type is null."); + ret = -1; + goto out; + } + + is_valid = glusterd_mgmt_v3_is_type_valid (type); + if (is_valid != _gf_true) { + gf_log (this->name, GF_LOG_ERROR, + "Invalid entity. Cannot perform locking " + "operation on %s types", type); + ret = -1; + goto out; + } + + ret = snprintf (key, sizeof(key), "%s_%s", name, type); + if (ret != strlen(name) + 1 + strlen(type)) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "Unable to create key"); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Trying to acquire lock of %s %s for %s as %s", + type, name, uuid_utoa (uuid), key); + + ret = glusterd_get_mgmt_v3_lock_owner (key, &owner); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Unable to get mgmt_v3 lock owner"); + goto out; + } + + /* If the lock has already been held for the given volume + * we fail */ + if (!uuid_is_null (owner)) { + gf_log (this->name, GF_LOG_ERROR, "Lock for %s held by %s", + name, uuid_utoa (owner)); + ret = -1; + goto out; + } + + lock_obj = GF_CALLOC (1, sizeof(mgmt_v3_lock_obj), + gf_common_mt_mgmt_v3_lock_obj_t); + if (!lock_obj) { + ret = -1; + goto out; + } + + uuid_copy (lock_obj->lock_owner, uuid); + + ret = dict_set_bin (mgmt_v3_lock, key, lock_obj, + sizeof(*lock_obj)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set lock owner in mgmt_v3 lock"); + if (lock_obj) + GF_FREE (lock_obj); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Lock for %s %s successfully held by %s", + type, name, uuid_utoa (uuid)); + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int32_t +glusterd_mgmt_v3_unlock (const char *name, uuid_t uuid, char *type) +{ + char key[PATH_MAX] = ""; + int32_t ret = -1; + gf_boolean_t is_valid = _gf_true; + uuid_t owner = {0}; + xlator_t *this = NULL; + + GF_ASSERT(THIS); + this = THIS; + + if (!name || !type) { + gf_log (this->name, GF_LOG_ERROR, "name is null."); + ret = -1; + goto out; + } + + is_valid = glusterd_mgmt_v3_is_type_valid (type); + if (is_valid != _gf_true) { + gf_log (this->name, GF_LOG_ERROR, + "Invalid entity. Cannot perform unlocking " + "operation on %s types", type); + ret = -1; + goto out; + } + + ret = snprintf (key, sizeof(key), "%s_%s", + name, type); + if (ret != strlen(name) + 1 + strlen(type)) { + gf_log (this->name, GF_LOG_ERROR, "Unable to create key"); + ret = -1; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "Trying to release lock of %s %s for %s as %s", + type, name, uuid_utoa (uuid), key); + + ret = glusterd_get_mgmt_v3_lock_owner (key, &owner); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Unable to get mgmt_v3 lock owner"); + goto out; + } + + if (uuid_is_null (owner)) { + gf_log (this->name, GF_LOG_ERROR, + "Lock for %s %s not held", type, name); + ret = -1; + goto out; + } + + ret = uuid_compare (uuid, owner); + if (ret) { + + gf_log (this->name, GF_LOG_ERROR, "Lock owner mismatch. " + "Lock for %s %s held by %s", + type, name, uuid_utoa (owner)); + goto out; + } + + /* Removing the mgmt_v3 lock from the global list */ + dict_del (mgmt_v3_lock, key); + + gf_log (this->name, GF_LOG_DEBUG, + "Lock for %s %s successfully released", + type, name); + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.h b/xlators/mgmt/glusterd/src/glusterd-locks.h new file mode 100644 index 000000000..83eb8c997 --- /dev/null +++ b/xlators/mgmt/glusterd/src/glusterd-locks.h @@ -0,0 +1,51 @@ +/* + Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _GLUSTERD_LOCKS_H_ +#define _GLUSTERD_LOCKS_H_ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +typedef struct mgmt_v3_lock_object_ { + uuid_t lock_owner; +} mgmt_v3_lock_obj; + +typedef struct mgmt_v3_lock_valid_entities { + char *type; /* Entity type like vol, snap */ + gf_boolean_t default_value; /* The default value that * + * determines if the locks * + * should be held for that * + * entity */ +} valid_entities; + +int32_t +glusterd_mgmt_v3_lock_init (); + +void +glusterd_mgmt_v3_lock_fini (); + +int32_t +glusterd_get_mgmt_v3_lock_owner (char *volname, uuid_t *uuid); + +int32_t +glusterd_mgmt_v3_lock (const char *key, uuid_t uuid, char *type); + +int32_t +glusterd_mgmt_v3_unlock (const char *key, uuid_t uuid, char *type); + +int32_t +glusterd_multiple_mgmt_v3_lock (dict_t *dict, uuid_t uuid); + +int32_t +glusterd_multiple_mgmt_v3_unlock (dict_t *dict, uuid_t uuid); + +#endif diff --git a/xlators/mgmt/glusterd/src/glusterd-log-ops.c b/xlators/mgmt/glusterd/src/glusterd-log-ops.c index 0136cddc9..33bd95c03 100644 --- a/xlators/mgmt/glusterd/src/glusterd-log-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-log-ops.c @@ -24,7 +24,7 @@ #include <signal.h> int -glusterd_handle_log_rotate (rpcsvc_request_t *req) +__glusterd_handle_log_rotate (rpcsvc_request_t *req) { int32_t ret = -1; gf_cli_req cli_req = {{0,}}; @@ -90,6 +90,13 @@ out: return ret; } +int +glusterd_handle_log_rotate (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, + __glusterd_handle_log_rotate); +} + /* op-sm */ int glusterd_op_stage_log_rotate (dict_t *dict, char **op_errstr) diff --git a/xlators/mgmt/glusterd/src/glusterd-mem-types.h b/xlators/mgmt/glusterd/src/glusterd-mem-types.h index 98216e28a..e6f6a0333 100644 --- a/xlators/mgmt/glusterd/src/glusterd-mem-types.h +++ b/xlators/mgmt/glusterd/src/glusterd-mem-types.h @@ -66,7 +66,10 @@ typedef enum gf_gld_mem_types_ { gf_gld_mt_hooks_stub_t = gf_common_mt_end + 50, gf_gld_mt_hooks_priv_t = gf_common_mt_end + 51, gf_gld_mt_mop_commit_req_t = gf_common_mt_end + 52, - gf_gld_mt_end = gf_common_mt_end + 53, + gf_gld_mt_int = gf_common_mt_end + 53, + gf_gld_mt_snap_t = gf_common_mt_end + 54, + gf_gld_mt_missed_snapinfo_t = gf_common_mt_end + 55, + gf_gld_mt_end = gf_common_mt_end + 56, } gf_gld_mem_types_t; #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c new file mode 100644 index 000000000..a5f38ce9c --- /dev/null +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c @@ -0,0 +1,924 @@ +/* + Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +/* rpc related syncops */ +#include "rpc-clnt.h" +#include "protocol-common.h" +#include "xdr-generic.h" +#include "glusterd1-xdr.h" +#include "glusterd-syncop.h" + +#include "glusterd.h" +#include "glusterd-utils.h" +#include "glusterd-locks.h" +#include "glusterd-mgmt.h" +#include "glusterd-op-sm.h" + +static int +glusterd_mgmt_v3_null (rpcsvc_request_t *req) +{ + return 0; +} + +static int +glusterd_mgmt_v3_lock_send_resp (rpcsvc_request_t *req, int32_t status) +{ + + gd1_mgmt_v3_lock_rsp rsp = {{0},}; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + rsp.op_ret = status; + if (rsp.op_ret) + rsp.op_errno = errno; + + glusterd_get_uuid (&rsp.uuid); + + ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp); + + gf_log (this->name, GF_LOG_DEBUG, + "Responded to mgmt_v3 lock, ret: %d", ret); + + return ret; +} + +static int +glusterd_synctasked_mgmt_v3_lock (rpcsvc_request_t *req, + gd1_mgmt_v3_lock_req *lock_req, + glusterd_op_lock_ctx_t *ctx) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (ctx); + GF_ASSERT (ctx->dict); + + /* Trying to acquire multiple mgmt_v3 locks */ + ret = glusterd_multiple_mgmt_v3_lock (ctx->dict, ctx->uuid); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to acquire mgmt_v3 locks for %s", + uuid_utoa (ctx->uuid)); + + ret = glusterd_mgmt_v3_lock_send_resp (req, ret); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int +glusterd_op_state_machine_mgmt_v3_lock (rpcsvc_request_t *req, + gd1_mgmt_v3_lock_req *lock_req, + glusterd_op_lock_ctx_t *ctx) +{ + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_op_info_t txn_op_info = {{0},}; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + glusterd_txn_opinfo_init (&txn_op_info, NULL, &lock_req->op, + ctx->dict, req); + + ret = glusterd_set_txn_opinfo (&lock_req->txn_id, &txn_op_info); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set transaction's opinfo"); + goto out; + } + + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK, + &lock_req->txn_id, ctx); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to inject event GD_OP_EVENT_LOCK"); + +out: + glusterd_friend_sm (); + glusterd_op_sm (); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int +glusterd_handle_mgmt_v3_lock_fn (rpcsvc_request_t *req) +{ + gd1_mgmt_v3_lock_req lock_req = {{0},}; + int32_t ret = -1; + glusterd_peerinfo_t *peerinfo = NULL; + glusterd_op_lock_ctx_t *ctx = NULL; + xlator_t *this = NULL; + gf_boolean_t is_synctasked = _gf_false; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + ret = xdr_to_generic (req->msg[0], &lock_req, + (xdrproc_t)xdr_gd1_mgmt_v3_lock_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to decode lock " + "request received from peer"); + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, "Received mgmt_v3 lock req " + "from uuid: %s", uuid_utoa (lock_req.uuid)); + + if (glusterd_friend_find_by_uuid (lock_req.uuid, &peerinfo)) { + gf_log (this->name, GF_LOG_WARNING, "%s doesn't " + "belong to the cluster. Ignoring request.", + uuid_utoa (lock_req.uuid)); + ret = -1; + goto out; + } + + ctx = GF_CALLOC (1, sizeof (*ctx), gf_gld_mt_op_lock_ctx_t); + if (!ctx) { + ret = -1; + goto out; + } + + uuid_copy (ctx->uuid, lock_req.uuid); + ctx->req = req; + + ctx->dict = dict_new (); + if (!ctx->dict) { + ret = -1; + goto out; + } + + ret = dict_unserialize (lock_req.dict.dict_val, + lock_req.dict.dict_len, &ctx->dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to unserialize the dictionary"); + goto out; + } + + is_synctasked = dict_get_str_boolean (ctx->dict, + "is_synctasked", _gf_false); + if (is_synctasked) + ret = glusterd_synctasked_mgmt_v3_lock (req, &lock_req, ctx); + else + ret = glusterd_op_state_machine_mgmt_v3_lock (req, &lock_req, + ctx); + +out: + + if (ret) { + if (ctx->dict) + dict_unref (ctx->dict); + if (ctx) + GF_FREE (ctx); + } + + free (lock_req.dict.dict_val); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int +glusterd_mgmt_v3_pre_validate_send_resp (rpcsvc_request_t *req, + int32_t op, int32_t status, + char *op_errstr, dict_t *rsp_dict) +{ + gd1_mgmt_v3_pre_val_rsp rsp = {{0},}; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + rsp.op_ret = status; + glusterd_get_uuid (&rsp.uuid); + rsp.op = op; + if (op_errstr) + rsp.op_errstr = op_errstr; + else + rsp.op_errstr = ""; + + ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val, + &rsp.dict.dict_len); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get serialized length of dict"); + goto out; + } + + ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp); + + GF_FREE (rsp.dict.dict_val); +out: + gf_log (this->name, GF_LOG_DEBUG, + "Responded to pre validation, ret: %d", ret); + return ret; +} + +static int +glusterd_handle_pre_validate_fn (rpcsvc_request_t *req) +{ + int32_t ret = -1; + gd1_mgmt_v3_pre_val_req op_req = {{0},}; + glusterd_peerinfo_t *peerinfo = NULL; + xlator_t *this = NULL; + char *op_errstr = NULL; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + ret = xdr_to_generic (req->msg[0], &op_req, + (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to decode pre validation " + "request received from peer"); + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) { + gf_log (this->name, GF_LOG_WARNING, "%s doesn't " + "belong to the cluster. Ignoring request.", + uuid_utoa (op_req.uuid)); + ret = -1; + goto out; + } + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_unserialize (op_req.dict.dict_val, + op_req.dict.dict_len, &dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to unserialize the dictionary"); + goto out; + } + + rsp_dict = dict_new (); + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get new dictionary"); + return -1; + } + + ret = gd_mgmt_v3_pre_validate_fn (op_req.op, dict, &op_errstr, + rsp_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Pre Validation failed on operation %s", + gd_op_list[op_req.op]); + } + + ret = glusterd_mgmt_v3_pre_validate_send_resp (req, op_req.op, + ret, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to send Pre Validation " + "response for operation %s", + gd_op_list[op_req.op]); + goto out; + } + +out: + if (op_errstr && (strcmp (op_errstr, ""))) + GF_FREE (op_errstr); + + free (op_req.dict.dict_val); + + if (dict) + dict_unref (dict); + + if (rsp_dict) + dict_unref (rsp_dict); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int +glusterd_mgmt_v3_brick_op_send_resp (rpcsvc_request_t *req, + int32_t op, int32_t status, + char *op_errstr, dict_t *rsp_dict) +{ + gd1_mgmt_v3_brick_op_rsp rsp = {{0},}; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + rsp.op_ret = status; + glusterd_get_uuid (&rsp.uuid); + rsp.op = op; + if (op_errstr) + rsp.op_errstr = op_errstr; + else + rsp.op_errstr = ""; + + ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val, + &rsp.dict.dict_len); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get serialized length of dict"); + goto out; + } + + ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp); + + GF_FREE (rsp.dict.dict_val); +out: + gf_log (this->name, GF_LOG_DEBUG, + "Responded to brick op, ret: %d", ret); + return ret; +} + +static int +glusterd_handle_brick_op_fn (rpcsvc_request_t *req) +{ + int32_t ret = -1; + gd1_mgmt_v3_brick_op_req op_req = {{0},}; + glusterd_peerinfo_t *peerinfo = NULL; + xlator_t *this = NULL; + char *op_errstr = NULL; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + ret = xdr_to_generic (req->msg[0], &op_req, + (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to decode brick op " + "request received from peer"); + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) { + gf_log (this->name, GF_LOG_WARNING, "%s doesn't " + "belong to the cluster. Ignoring request.", + uuid_utoa (op_req.uuid)); + ret = -1; + goto out; + } + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_unserialize (op_req.dict.dict_val, + op_req.dict.dict_len, &dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to unserialize the dictionary"); + goto out; + } + + rsp_dict = dict_new (); + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get new dictionary"); + return -1; + } + + ret = gd_mgmt_v3_brick_op_fn (op_req.op, dict, &op_errstr, + rsp_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Brick Op failed on operation %s", + gd_op_list[op_req.op]); + } + + ret = glusterd_mgmt_v3_brick_op_send_resp (req, op_req.op, + ret, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to send brick op " + "response for operation %s", + gd_op_list[op_req.op]); + goto out; + } + +out: + if (op_errstr && (strcmp (op_errstr, ""))) + GF_FREE (op_errstr); + + free (op_req.dict.dict_val); + + if (dict) + dict_unref (dict); + + if (rsp_dict) + dict_unref (rsp_dict); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int +glusterd_mgmt_v3_commit_send_resp (rpcsvc_request_t *req, + int32_t op, int32_t status, + char *op_errstr, dict_t *rsp_dict) +{ + gd1_mgmt_v3_commit_rsp rsp = {{0},}; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + rsp.op_ret = status; + glusterd_get_uuid (&rsp.uuid); + rsp.op = op; + if (op_errstr) + rsp.op_errstr = op_errstr; + else + rsp.op_errstr = ""; + + ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val, + &rsp.dict.dict_len); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get serialized length of dict"); + goto out; + } + + ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp); + + GF_FREE (rsp.dict.dict_val); +out: + gf_log (this->name, GF_LOG_DEBUG, "Responded to commit, ret: %d", ret); + return ret; +} + +static int +glusterd_handle_commit_fn (rpcsvc_request_t *req) +{ + int32_t ret = -1; + gd1_mgmt_v3_commit_req op_req = {{0},}; + glusterd_peerinfo_t *peerinfo = NULL; + xlator_t *this = NULL; + char *op_errstr = NULL; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + ret = xdr_to_generic (req->msg[0], &op_req, + (xdrproc_t)xdr_gd1_mgmt_v3_commit_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to decode commit " + "request received from peer"); + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) { + gf_log (this->name, GF_LOG_WARNING, "%s doesn't " + "belong to the cluster. Ignoring request.", + uuid_utoa (op_req.uuid)); + ret = -1; + goto out; + } + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_unserialize (op_req.dict.dict_val, + op_req.dict.dict_len, &dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to unserialize the dictionary"); + goto out; + } + + rsp_dict = dict_new (); + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get new dictionary"); + return -1; + } + + ret = gd_mgmt_v3_commit_fn (op_req.op, dict, &op_errstr, + rsp_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "commit failed on operation %s", + gd_op_list[op_req.op]); + } + + ret = glusterd_mgmt_v3_commit_send_resp (req, op_req.op, + ret, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to send commit " + "response for operation %s", + gd_op_list[op_req.op]); + goto out; + } + +out: + if (op_errstr && (strcmp (op_errstr, ""))) + GF_FREE (op_errstr); + + free (op_req.dict.dict_val); + + if (dict) + dict_unref (dict); + + if (rsp_dict) + dict_unref (rsp_dict); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int +glusterd_mgmt_v3_post_validate_send_resp (rpcsvc_request_t *req, + int32_t op, int32_t status, + char *op_errstr, dict_t *rsp_dict) +{ + gd1_mgmt_v3_post_val_rsp rsp = {{0},}; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + rsp.op_ret = status; + glusterd_get_uuid (&rsp.uuid); + rsp.op = op; + if (op_errstr) + rsp.op_errstr = op_errstr; + else + rsp.op_errstr = ""; + + ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val, + &rsp.dict.dict_len); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get serialized length of dict"); + goto out; + } + + ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp); + + GF_FREE (rsp.dict.dict_val); +out: + gf_log (this->name, GF_LOG_DEBUG, + "Responded to post validation, ret: %d", ret); + return ret; +} + +static int +glusterd_handle_post_validate_fn (rpcsvc_request_t *req) +{ + int32_t ret = -1; + gd1_mgmt_v3_post_val_req op_req = {{0},}; + glusterd_peerinfo_t *peerinfo = NULL; + xlator_t *this = NULL; + char *op_errstr = NULL; + dict_t *dict = NULL; + dict_t *rsp_dict = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + ret = xdr_to_generic (req->msg[0], &op_req, + (xdrproc_t)xdr_gd1_mgmt_v3_post_val_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to decode post validation " + "request received from peer"); + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) { + gf_log (this->name, GF_LOG_WARNING, "%s doesn't " + "belong to the cluster. Ignoring request.", + uuid_utoa (op_req.uuid)); + ret = -1; + goto out; + } + + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_unserialize (op_req.dict.dict_val, + op_req.dict.dict_len, &dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to unserialize the dictionary"); + goto out; + } + + rsp_dict = dict_new (); + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get new dictionary"); + return -1; + } + + ret = gd_mgmt_v3_post_validate_fn (op_req.op, op_req.op_ret, dict, + &op_errstr, rsp_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Post Validation failed on operation %s", + gd_op_list[op_req.op]); + } + + ret = glusterd_mgmt_v3_post_validate_send_resp (req, op_req.op, + ret, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to send Post Validation " + "response for operation %s", + gd_op_list[op_req.op]); + goto out; + } + +out: + if (op_errstr && (strcmp (op_errstr, ""))) + GF_FREE (op_errstr); + + free (op_req.dict.dict_val); + + if (dict) + dict_unref (dict); + + if (rsp_dict) + dict_unref (rsp_dict); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int +glusterd_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req, int32_t status) +{ + + gd1_mgmt_v3_unlock_rsp rsp = {{0},}; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + rsp.op_ret = status; + if (rsp.op_ret) + rsp.op_errno = errno; + + glusterd_get_uuid (&rsp.uuid); + + ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp); + + gf_log (this->name, GF_LOG_DEBUG, + "Responded to mgmt_v3 unlock, ret: %d", ret); + + return ret; +} + +static int +glusterd_synctasked_mgmt_v3_unlock (rpcsvc_request_t *req, + gd1_mgmt_v3_unlock_req *unlock_req, + glusterd_op_lock_ctx_t *ctx) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (ctx); + + /* Trying to release multiple mgmt_v3 locks */ + ret = glusterd_multiple_mgmt_v3_unlock (ctx->dict, ctx->uuid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to release mgmt_v3 locks for %s", + uuid_utoa(ctx->uuid)); + } + + ret = glusterd_mgmt_v3_unlock_send_resp (req, ret); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + + +static int +glusterd_op_state_machine_mgmt_v3_unlock (rpcsvc_request_t *req, + gd1_mgmt_v3_unlock_req *lock_req, + glusterd_op_lock_ctx_t *ctx) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK, + &lock_req->txn_id, ctx); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to inject event GD_OP_EVENT_UNLOCK"); + + glusterd_friend_sm (); + glusterd_op_sm (); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int +glusterd_handle_mgmt_v3_unlock_fn (rpcsvc_request_t *req) +{ + gd1_mgmt_v3_unlock_req lock_req = {{0},}; + int32_t ret = -1; + glusterd_op_lock_ctx_t *ctx = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + xlator_t *this = NULL; + gf_boolean_t is_synctasked = _gf_false; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + ret = xdr_to_generic (req->msg[0], &lock_req, + (xdrproc_t)xdr_gd1_mgmt_v3_unlock_req); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Failed to decode unlock " + "request received from peer"); + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, "Received mgmt_v3 unlock req " + "from uuid: %s", uuid_utoa (lock_req.uuid)); + + if (glusterd_friend_find_by_uuid (lock_req.uuid, &peerinfo)) { + gf_log (this->name, GF_LOG_WARNING, "%s doesn't " + "belong to the cluster. Ignoring request.", + uuid_utoa (lock_req.uuid)); + ret = -1; + goto out; + } + + ctx = GF_CALLOC (1, sizeof (*ctx), gf_gld_mt_op_lock_ctx_t); + if (!ctx) { + ret = -1; + goto out; + } + + uuid_copy (ctx->uuid, lock_req.uuid); + ctx->req = req; + + ctx->dict = dict_new (); + if (!ctx->dict) { + ret = -1; + goto out; + } + + ret = dict_unserialize (lock_req.dict.dict_val, + lock_req.dict.dict_len, &ctx->dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to unserialize the dictionary"); + goto out; + } + + is_synctasked = dict_get_str_boolean (ctx->dict, + "is_synctasked", _gf_false); + if (is_synctasked) + ret = glusterd_synctasked_mgmt_v3_unlock (req, &lock_req, ctx); + else + ret = glusterd_op_state_machine_mgmt_v3_unlock (req, &lock_req, + ctx); + +out: + + if (ret) { + if (ctx->dict) + dict_unref (ctx->dict); + if (ctx) + GF_FREE (ctx); + } + + free (lock_req.dict.dict_val); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_handle_mgmt_v3_lock (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, + glusterd_handle_mgmt_v3_lock_fn); +} + +static int +glusterd_handle_pre_validate (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, + glusterd_handle_pre_validate_fn); +} + +static int +glusterd_handle_brick_op (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, + glusterd_handle_brick_op_fn); +} + +static int +glusterd_handle_commit (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, + glusterd_handle_commit_fn); +} + +static int +glusterd_handle_post_validate (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, + glusterd_handle_post_validate_fn); +} + +int +glusterd_handle_mgmt_v3_unlock (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, + glusterd_handle_mgmt_v3_unlock_fn); +} + +rpcsvc_actor_t gd_svc_mgmt_v3_actors[] = { + [GLUSTERD_MGMT_V3_NULL] = { "NULL", GLUSTERD_MGMT_V3_NULL, glusterd_mgmt_v3_null, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_V3_LOCK] = { "MGMT_V3_LOCK", GLUSTERD_MGMT_V3_LOCK, glusterd_handle_mgmt_v3_lock, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_V3_PRE_VALIDATE] = { "PRE_VAL", GLUSTERD_MGMT_V3_PRE_VALIDATE, glusterd_handle_pre_validate, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_V3_BRICK_OP] = { "BRCK_OP", GLUSTERD_MGMT_V3_BRICK_OP, glusterd_handle_brick_op, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_V3_COMMIT] = { "COMMIT", GLUSTERD_MGMT_V3_COMMIT, glusterd_handle_commit, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_V3_POST_VALIDATE] = { "POST_VAL", GLUSTERD_MGMT_V3_POST_VALIDATE, glusterd_handle_post_validate, NULL, 0, DRC_NA}, + [GLUSTERD_MGMT_V3_UNLOCK] = { "MGMT_V3_UNLOCK", GLUSTERD_MGMT_V3_UNLOCK, glusterd_handle_mgmt_v3_unlock, NULL, 0, DRC_NA}, +}; + +struct rpcsvc_program gd_svc_mgmt_v3_prog = { + .progname = "GlusterD svc mgmt v3", + .prognum = GD_MGMT_V3_PROGRAM, + .progver = GD_MGMT_V3_VERSION, + .numactors = GLUSTERD_MGMT_V3_MAXVALUE, + .actors = gd_svc_mgmt_v3_actors, + .synctask = _gf_true, +}; diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-mgmt.c new file mode 100644 index 000000000..d52532e54 --- /dev/null +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.c @@ -0,0 +1,1893 @@ +/* + Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +/* rpc related syncops */ +#include "rpc-clnt.h" +#include "protocol-common.h" +#include "xdr-generic.h" +#include "glusterd1-xdr.h" +#include "glusterd-syncop.h" + +#include "glusterd.h" +#include "glusterd-utils.h" +#include "glusterd-locks.h" +#include "glusterd-mgmt.h" +#include "glusterd-op-sm.h" + +extern struct rpc_clnt_program gd_mgmt_v3_prog; + + +static void +gd_mgmt_v3_collate_errors (struct syncargs *args, int op_ret, int op_errno, + char *op_errstr, int op_code, + glusterd_peerinfo_t *peerinfo, u_char *uuid) +{ + char *peer_str = NULL; + char err_str[PATH_MAX] = "Please check log file for details."; + char op_err[PATH_MAX] = ""; + int32_t len = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (args); + GF_ASSERT (uuid); + + if (op_ret) { + args->op_ret = op_ret; + args->op_errno = op_errno; + + if (peerinfo) + peer_str = peerinfo->hostname; + else + peer_str = uuid_utoa (uuid); + + if (op_errstr && strcmp (op_errstr, "")) { + len = snprintf (err_str, sizeof(err_str) - 1, + "Error: %s", op_errstr); + err_str[len] = '\0'; + } + + switch (op_code){ + case GLUSTERD_MGMT_V3_LOCK: + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Locking failed " + "on %s. %s", peer_str, err_str); + break; + } + case GLUSTERD_MGMT_V3_PRE_VALIDATE: + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Pre Validation failed " + "on %s. %s", peer_str, err_str); + break; + } + case GLUSTERD_MGMT_V3_BRICK_OP: + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Brick ops failed " + "on %s. %s", peer_str, err_str); + break; + } + case GLUSTERD_MGMT_V3_COMMIT: + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Commit failed on %s. %s", + peer_str, err_str); + break; + } + case GLUSTERD_MGMT_V3_POST_VALIDATE: + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Post Validation failed " + "on %s. %s", peer_str, err_str); + break; + } + case GLUSTERD_MGMT_V3_UNLOCK: + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Unlocking failed " + "on %s. %s", peer_str, err_str); + break; + } + } + op_err[len] = '\0'; + + if (args->errstr) { + len = snprintf (err_str, sizeof(err_str) - 1, + "%s\n%s", args->errstr, + op_err); + GF_FREE (args->errstr); + args->errstr = NULL; + } else + len = snprintf (err_str, sizeof(err_str) - 1, + "%s", op_err); + err_str[len] = '\0'; + + gf_log (this->name, GF_LOG_ERROR, "%s", op_err); + args->errstr = gf_strdup (err_str); + } + + return; +} + +int32_t +gd_mgmt_v3_pre_validate_fn (glusterd_op_t op, dict_t *dict, + char **op_errstr, dict_t *rsp_dict) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + + switch (op) { + case GD_OP_SNAP: + { + ret = glusterd_snapshot_prevalidate (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Snapshot Prevalidate Failed"); + goto out; + } + + break; + } + default: + break; + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret); + return ret; +} + +int32_t +gd_mgmt_v3_brick_op_fn (glusterd_op_t op, dict_t *dict, + char **op_errstr, dict_t *rsp_dict) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + + switch (op) { + case GD_OP_SNAP: + { + ret = glusterd_snapshot_brickop (dict, op_errstr, rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "snapshot brickop " + "failed"); + goto out; + } + break; + } + default: + break; + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "OP = %d. Returning %d", op, ret); + return ret; +} + +int32_t +gd_mgmt_v3_commit_fn (glusterd_op_t op, dict_t *dict, + char **op_errstr, dict_t *rsp_dict) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + + switch (op) { + case GD_OP_SNAP: + { + ret = glusterd_snapshot (dict, op_errstr, rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Snapshot Commit Failed"); + goto out; + } + break; + } + default: + break; + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret); + return ret; +} + +int32_t +gd_mgmt_v3_post_validate_fn (glusterd_op_t op, int32_t op_ret, dict_t *dict, + char **op_errstr, dict_t *rsp_dict) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + + switch (op) { + case GD_OP_SNAP: + { + ret = glusterd_snapshot_postvalidate (dict, op_ret, + op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "postvalidate operation failed"); + goto out; + } + break; + } + default: + break; + } + + ret = 0; + +out: + gf_log (this->name, GF_LOG_TRACE, "OP = %d. Returning %d", op, ret); + return ret; +} + +int32_t +gd_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int32_t ret = -1; + struct syncargs *args = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + gd1_mgmt_v3_lock_rsp rsp = {{0},}; + call_frame_t *frame = NULL; + int32_t op_ret = -1; + int32_t op_errno = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (myframe); + + /* Even though the lock command has failed, while collating the errors + (gd_mgmt_v3_collate_errors), args->op_ret and args->op_errno will be + used. @args is obtained from frame->local. So before checking the + status of the request and going out if its a failure, args should be + set to frame->local. Otherwise, while collating args will be NULL. + This applies to other phases such as prevalidate, brickop, commit and + postvalidate also. + */ + frame = myframe; + args = frame->local; + peerinfo = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + if (!iov) { + gf_log (this->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp); + if (ret < 0) + goto out; + + uuid_copy (args->uuid, rsp.uuid); + + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + +out: + gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_V3_LOCK, + peerinfo, rsp.uuid); + free (rsp.dict.dict_val); + STACK_DESTROY (frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_mgmt_v3_lock_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + gd_mgmt_v3_lock_cbk_fn); +} + +int +gd_mgmt_v3_lock (glusterd_op_t op, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid) +{ + gd1_mgmt_v3_lock_req req = {{0},}; + glusterd_conf_t *conf = THIS->private; + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (op_ctx); + GF_ASSERT (peerinfo); + GF_ASSERT (args); + + ret = dict_allocate_and_serialize (op_ctx, + &req.dict.dict_val, + &req.dict.dict_len); + if (ret) + goto out; + + uuid_copy (req.uuid, my_uuid); + req.op = op; + synclock_unlock (&conf->big_lock); + ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, + &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_LOCK, + gd_mgmt_v3_lock_cbk, + (xdrproc_t) xdr_gd1_mgmt_v3_lock_req); + synclock_lock (&conf->big_lock); +out: + GF_FREE (req.dict.dict_val); + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_mgmt_v3_initiate_lockdown (glusterd_conf_t *conf, glusterd_op_t op, + dict_t *dict, char **op_errstr, int npeers, + gf_boolean_t *is_acquired) +{ + char *volname = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + int32_t ret = -1; + int32_t peer_cnt = 0; + struct syncargs args = {0}; + struct list_head *peers = NULL; + uuid_t peer_uuid = {0}; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (conf); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + GF_ASSERT (is_acquired); + + peers = &conf->xaction_peers; + + /* Trying to acquire multiple mgmt_v3 locks on local node */ + ret = glusterd_multiple_mgmt_v3_lock (dict, MY_UUID); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to acquire mgmt_v3 locks on localhost"); + goto out; + } + + *is_acquired = _gf_true; + + if (!npeers) { + ret = 0; + goto out; + } + + /* Sending mgmt_v3 lock req to other nodes in the cluster */ + gd_syncargs_init (&args, NULL); + synctask_barrier_init((&args)); + peer_cnt = 0; + list_for_each_entry (peerinfo, peers, op_peers_list) { + gd_mgmt_v3_lock (op, dict, peerinfo, &args, + MY_UUID, peer_uuid); + peer_cnt++; + } + gd_synctask_barrier_wait((&args), peer_cnt); + + if (args.errstr) + *op_errstr = gf_strdup (args.errstr); + + ret = args.op_ret; + + gf_log (this->name, GF_LOG_DEBUG, "Sent lock op req for %s " + "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret); +out: + if (ret) { + if (*op_errstr) + gf_log (this->name, GF_LOG_ERROR, "%s", + *op_errstr); + + if (volname) + ret = gf_asprintf (op_errstr, + "Another transaction is in progress " + "for %s. Please try again after " + "sometime.", volname); + else + ret = gf_asprintf (op_errstr, + "Another transaction is in progress " + "Please try again after sometime."); + + if (ret == -1) + *op_errstr = NULL; + + ret = -1; + } + + return ret; +} + +int +glusterd_pre_validate_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp) +{ + int32_t ret = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (aggr); + GF_ASSERT (rsp); + + switch (op) { + case GD_OP_SNAP: + ret = glusterd_snap_pre_validate_use_rsp_dict (aggr, rsp); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to aggregate prevalidate " + "response dictionaries."); + goto out; + } + break; + default: + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "Invalid op (%s)", gd_op_list[op]); + + break; + } +out: + return ret; +} + +int32_t +gd_mgmt_v3_pre_validate_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int32_t ret = -1; + struct syncargs *args = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + gd1_mgmt_v3_pre_val_rsp rsp = {{0},}; + call_frame_t *frame = NULL; + int32_t op_ret = -1; + int32_t op_errno = -1; + dict_t *rsp_dict = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (myframe); + + frame = myframe; + args = frame->local; + peerinfo = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + if (!iov) { + gf_log (this->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp); + if (ret < 0) + goto out; + + if (rsp.dict.dict_len) { + /* Unserialize the dictionary */ + rsp_dict = dict_new (); + + ret = dict_unserialize (rsp.dict.dict_val, + rsp.dict.dict_len, + &rsp_dict); + if (ret < 0) { + GF_FREE (rsp.dict.dict_val); + goto out; + } else { + rsp_dict->extra_stdfree = rsp.dict.dict_val; + } + } + + uuid_copy (args->uuid, rsp.uuid); + pthread_mutex_lock (&args->lock_dict); + { + ret = glusterd_pre_validate_aggr_rsp_dict (rsp.op, args->dict, + rsp_dict); + } + pthread_mutex_unlock (&args->lock_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s", + "Failed to aggregate response from " + " node/brick"); + if (!rsp.op_ret) + op_ret = ret; + else { + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + } + } else { + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + } + +out: + if (rsp_dict) + dict_unref (rsp_dict); + + gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_V3_PRE_VALIDATE, + peerinfo, rsp.uuid); + + if (rsp.op_errstr) + free (rsp.op_errstr); + + STACK_DESTROY (frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_mgmt_v3_pre_validate_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + gd_mgmt_v3_pre_validate_cbk_fn); +} + +int +gd_mgmt_v3_pre_validate_req (glusterd_op_t op, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid) +{ + int32_t ret = -1; + gd1_mgmt_v3_pre_val_req req = {{0},}; + glusterd_conf_t *conf = THIS->private; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (op_ctx); + GF_ASSERT (peerinfo); + GF_ASSERT (args); + + ret = dict_allocate_and_serialize (op_ctx, + &req.dict.dict_val, + &req.dict.dict_len); + if (ret) + goto out; + + uuid_copy (req.uuid, my_uuid); + req.op = op; + synclock_unlock (&conf->big_lock); + ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, + &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_PRE_VALIDATE, + gd_mgmt_v3_pre_validate_cbk, + (xdrproc_t) xdr_gd1_mgmt_v3_pre_val_req); + synclock_lock (&conf->big_lock); +out: + GF_FREE (req.dict.dict_val); + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_mgmt_v3_pre_validate (glusterd_conf_t *conf, glusterd_op_t op, + dict_t *req_dict, char **op_errstr, int npeers) +{ + int32_t ret = -1; + int32_t peer_cnt = 0; + dict_t *rsp_dict = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + struct syncargs args = {0}; + struct list_head *peers = NULL; + uuid_t peer_uuid = {0}; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (conf); + GF_ASSERT (req_dict); + GF_ASSERT (op_errstr); + + peers = &conf->xaction_peers; + + rsp_dict = dict_new (); + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create response dictionary"); + goto out; + } + + /* Pre Validation on local node */ + ret = gd_mgmt_v3_pre_validate_fn (op, req_dict, op_errstr, + rsp_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Pre Validation failed for " + "operation %s on local node", + gd_op_list[op]); + + if (*op_errstr == NULL) { + ret = gf_asprintf (op_errstr, + "Pre-validation failed " + "on localhost. Please " + "check log file for details"); + if (ret == -1) + *op_errstr = NULL; + + ret = -1; + } + goto out; + } + + ret = glusterd_pre_validate_aggr_rsp_dict (op, req_dict, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s", + "Failed to aggregate response from " + " node/brick"); + goto out; + } + + dict_unref (rsp_dict); + rsp_dict = NULL; + + if (!npeers) { + ret = 0; + goto out; + } + + /* Sending Pre Validation req to other nodes in the cluster */ + gd_syncargs_init (&args, req_dict); + synctask_barrier_init((&args)); + peer_cnt = 0; + list_for_each_entry (peerinfo, peers, op_peers_list) { + gd_mgmt_v3_pre_validate_req (op, req_dict, peerinfo, &args, + MY_UUID, peer_uuid); + peer_cnt++; + } + gd_synctask_barrier_wait((&args), peer_cnt); + + if (args.op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "Pre Validation failed on peers"); + + if (args.errstr) + *op_errstr = gf_strdup (args.errstr); + } + + ret = args.op_ret; + + gf_log (this->name, GF_LOG_DEBUG, "Sent pre valaidation req for %s " + "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret); +out: + return ret; +} + +int +glusterd_mgmt_v3_build_payload (dict_t **req, char **op_errstr, dict_t *dict, + glusterd_op_t op) +{ + int32_t ret = -1; + dict_t *req_dict = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (op_errstr); + GF_ASSERT (dict); + + req_dict = dict_new (); + if (!req_dict) + goto out; + + switch (op) { + case GD_OP_SNAP: + dict_copy (dict, req_dict); + break; + default: + break; + } + + *req = req_dict; + ret = 0; +out: + return ret; +} + +int32_t +gd_mgmt_v3_brick_op_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int32_t ret = -1; + struct syncargs *args = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + gd1_mgmt_v3_brick_op_rsp rsp = {{0},}; + call_frame_t *frame = NULL; + int32_t op_ret = -1; + int32_t op_errno = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (myframe); + + frame = myframe; + args = frame->local; + peerinfo = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + /* If the operation failed, then iov can be NULL. So better check the + status of the operation and then worry about iov (if the status of + the command is success) + */ + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + if (!iov) { + gf_log (this->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp); + if (ret < 0) + goto out; + + uuid_copy (args->uuid, rsp.uuid); + + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + +out: + gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_V3_BRICK_OP, + peerinfo, rsp.uuid); + + if (rsp.op_errstr) + free (rsp.op_errstr); + + free (rsp.dict.dict_val); + + STACK_DESTROY (frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_mgmt_v3_brick_op_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + gd_mgmt_v3_brick_op_cbk_fn); +} + +int +gd_mgmt_v3_brick_op_req (glusterd_op_t op, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid) +{ + int32_t ret = -1; + gd1_mgmt_v3_brick_op_req req = {{0},}; + glusterd_conf_t *conf = THIS->private; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (op_ctx); + GF_ASSERT (peerinfo); + GF_ASSERT (args); + + ret = dict_allocate_and_serialize (op_ctx, + &req.dict.dict_val, + &req.dict.dict_len); + if (ret) + goto out; + + uuid_copy (req.uuid, my_uuid); + req.op = op; + synclock_unlock (&conf->big_lock); + ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, + &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_BRICK_OP, + gd_mgmt_v3_brick_op_cbk, + (xdrproc_t) xdr_gd1_mgmt_v3_brick_op_req); + synclock_lock (&conf->big_lock); +out: + GF_FREE (req.dict.dict_val); + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_mgmt_v3_brick_op (glusterd_conf_t *conf, glusterd_op_t op, + dict_t *req_dict, char **op_errstr, int npeers) +{ + int32_t ret = -1; + int32_t peer_cnt = 0; + dict_t *rsp_dict = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + struct syncargs args = {0}; + struct list_head *peers = NULL; + uuid_t peer_uuid = {0}; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (conf); + GF_ASSERT (req_dict); + GF_ASSERT (op_errstr); + + peers = &conf->xaction_peers; + + rsp_dict = dict_new (); + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create response dictionary"); + goto out; + } + + /* Perform brick op on local node */ + ret = gd_mgmt_v3_brick_op_fn (op, req_dict, op_errstr, + rsp_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Brick ops failed for " + "operation %s on local node", + gd_op_list[op]); + + if (*op_errstr == NULL) { + ret = gf_asprintf (op_errstr, + "Brick ops failed " + "on localhost. Please " + "check log file for details"); + if (ret == -1) + *op_errstr = NULL; + + ret = -1; + } + goto out; + } + + dict_unref (rsp_dict); + rsp_dict = NULL; + + if (!npeers) { + ret = 0; + goto out; + } + + /* Sending brick op req to other nodes in the cluster */ + gd_syncargs_init (&args, NULL); + synctask_barrier_init((&args)); + peer_cnt = 0; + list_for_each_entry (peerinfo, peers, op_peers_list) { + gd_mgmt_v3_brick_op_req (op, req_dict, peerinfo, &args, + MY_UUID, peer_uuid); + peer_cnt++; + } + gd_synctask_barrier_wait((&args), peer_cnt); + + if (args.op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "Brick ops failed on peers"); + + if (args.errstr) + *op_errstr = gf_strdup (args.errstr); + } + + ret = args.op_ret; + + gf_log (this->name, GF_LOG_DEBUG, "Sent brick op req for %s " + "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret); +out: + return ret; +} + +int32_t +gd_mgmt_v3_commit_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int32_t ret = -1; + struct syncargs *args = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + gd1_mgmt_v3_commit_rsp rsp = {{0},}; + call_frame_t *frame = NULL; + int32_t op_ret = -1; + int32_t op_errno = -1; + dict_t *rsp_dict = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (myframe); + + frame = myframe; + args = frame->local; + peerinfo = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + if (!iov) { + gf_log (this->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp); + if (ret < 0) + goto out; + + if (rsp.dict.dict_len) { + /* Unserialize the dictionary */ + rsp_dict = dict_new (); + + ret = dict_unserialize (rsp.dict.dict_val, + rsp.dict.dict_len, + &rsp_dict); + if (ret < 0) { + GF_FREE (rsp.dict.dict_val); + goto out; + } else { + rsp_dict->extra_stdfree = rsp.dict.dict_val; + } + } + + uuid_copy (args->uuid, rsp.uuid); + pthread_mutex_lock (&args->lock_dict); + { + ret = glusterd_syncop_aggr_rsp_dict (rsp.op, args->dict, + rsp_dict); + } + pthread_mutex_unlock (&args->lock_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s", + "Failed to aggregate response from " + " node/brick"); + if (!rsp.op_ret) + op_ret = ret; + else { + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + } + } else { + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + } + +out: + if (rsp_dict) + dict_unref (rsp_dict); + + gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_V3_COMMIT, + peerinfo, rsp.uuid); + + STACK_DESTROY (frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_mgmt_v3_commit_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + gd_mgmt_v3_commit_cbk_fn); +} + +int +gd_mgmt_v3_commit_req (glusterd_op_t op, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid) +{ + int32_t ret = -1; + gd1_mgmt_v3_commit_req req = {{0},}; + glusterd_conf_t *conf = THIS->private; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (op_ctx); + GF_ASSERT (peerinfo); + GF_ASSERT (args); + + ret = dict_allocate_and_serialize (op_ctx, + &req.dict.dict_val, + &req.dict.dict_len); + if (ret) + goto out; + + uuid_copy (req.uuid, my_uuid); + req.op = op; + synclock_unlock (&conf->big_lock); + ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, + &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_COMMIT, + gd_mgmt_v3_commit_cbk, + (xdrproc_t) xdr_gd1_mgmt_v3_commit_req); + synclock_lock (&conf->big_lock); +out: + GF_FREE (req.dict.dict_val); + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_mgmt_v3_commit (glusterd_conf_t *conf, glusterd_op_t op, + dict_t *op_ctx, dict_t *req_dict, + char **op_errstr, int npeers) +{ + int32_t ret = -1; + int32_t peer_cnt = 0; + dict_t *rsp_dict = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + struct syncargs args = {0}; + struct list_head *peers = NULL; + uuid_t peer_uuid = {0}; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (conf); + GF_ASSERT (op_ctx); + GF_ASSERT (req_dict); + GF_ASSERT (op_errstr); + + peers = &conf->xaction_peers; + + rsp_dict = dict_new (); + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create response dictionary"); + goto out; + } + + /* Commit on local node */ + ret = gd_mgmt_v3_commit_fn (op, req_dict, op_errstr, + rsp_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Commit failed for " + "operation %s on local node", + gd_op_list[op]); + + if (*op_errstr == NULL) { + ret = gf_asprintf (op_errstr, + "Commit failed " + "on localhost. Please " + "check log file for details."); + if (ret == -1) + *op_errstr = NULL; + + ret = -1; + } + goto out; + } + + ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s", + "Failed to aggregate response from " + " node/brick"); + goto out; + } + + dict_unref (rsp_dict); + rsp_dict = NULL; + + if (!npeers) { + ret = 0; + goto out; + } + + /* Sending commit req to other nodes in the cluster */ + gd_syncargs_init (&args, op_ctx); + synctask_barrier_init((&args)); + peer_cnt = 0; + list_for_each_entry (peerinfo, peers, op_peers_list) { + gd_mgmt_v3_commit_req (op, req_dict, peerinfo, &args, + MY_UUID, peer_uuid); + peer_cnt++; + } + gd_synctask_barrier_wait((&args), peer_cnt); + + if (args.op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "Commit failed on peers"); + + if (args.errstr) + *op_errstr = gf_strdup (args.errstr); + } + + ret = args.op_ret; + + gf_log (this->name, GF_LOG_DEBUG, "Sent commit req for %s to %d " + "peers. Returning %d", gd_op_list[op], peer_cnt, ret); +out: + return ret; +} + +int32_t +gd_mgmt_v3_post_validate_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int32_t ret = -1; + struct syncargs *args = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + gd1_mgmt_v3_post_val_rsp rsp = {{0},}; + call_frame_t *frame = NULL; + int32_t op_ret = -1; + int32_t op_errno = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (myframe); + + frame = myframe; + args = frame->local; + peerinfo = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + if (!iov) { + gf_log (this->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp); + if (ret < 0) + goto out; + + uuid_copy (args->uuid, rsp.uuid); + + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + +out: + gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_V3_POST_VALIDATE, + peerinfo, rsp.uuid); + if (rsp.op_errstr) + free (rsp.op_errstr); + + free (rsp.dict.dict_val); + STACK_DESTROY (frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_mgmt_v3_post_validate_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + gd_mgmt_v3_post_validate_cbk_fn); +} + +int +gd_mgmt_v3_post_validate_req (glusterd_op_t op, int32_t op_ret, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid) +{ + int32_t ret = -1; + gd1_mgmt_v3_post_val_req req = {{0},}; + glusterd_conf_t *conf = THIS->private; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (op_ctx); + GF_ASSERT (peerinfo); + GF_ASSERT (args); + + ret = dict_allocate_and_serialize (op_ctx, + &req.dict.dict_val, + &req.dict.dict_len); + if (ret) + goto out; + + uuid_copy (req.uuid, my_uuid); + req.op = op; + req.op_ret = op_ret; + synclock_unlock (&conf->big_lock); + ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, + &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_POST_VALIDATE, + gd_mgmt_v3_post_validate_cbk, + (xdrproc_t) xdr_gd1_mgmt_v3_post_val_req); + synclock_lock (&conf->big_lock); +out: + GF_FREE (req.dict.dict_val); + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_mgmt_v3_post_validate (glusterd_conf_t *conf, glusterd_op_t op, + int32_t op_ret, dict_t *dict, dict_t *req_dict, + char **op_errstr, int npeers) +{ + int32_t ret = -1; + int32_t peer_cnt = 0; + dict_t *rsp_dict = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + struct syncargs args = {0}; + struct list_head *peers = NULL; + uuid_t peer_uuid = {0}; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (conf); + GF_ASSERT (dict); + GF_ASSERT (req_dict); + GF_ASSERT (op_errstr); + + peers = &conf->xaction_peers; + GF_ASSERT (peers); + + rsp_dict = dict_new (); + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create response dictionary"); + goto out; + } + + /* Copy the contents of dict like missed snaps info to req_dict */ + dict_copy (dict, req_dict); + + /* Post Validation on local node */ + ret = gd_mgmt_v3_post_validate_fn (op, op_ret, req_dict, op_errstr, + rsp_dict); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Post Validation failed for " + "operation %s on local node", + gd_op_list[op]); + + if (*op_errstr == NULL) { + ret = gf_asprintf (op_errstr, + "Post-validation failed " + "on localhost. Please check " + "log file for details"); + if (ret == -1) + *op_errstr = NULL; + + ret = -1; + } + goto out; + } + + dict_unref (rsp_dict); + rsp_dict = NULL; + + if (!npeers) { + ret = 0; + goto out; + } + + /* Sending Post Validation req to other nodes in the cluster */ + gd_syncargs_init (&args, req_dict); + synctask_barrier_init((&args)); + peer_cnt = 0; + list_for_each_entry (peerinfo, peers, op_peers_list) { + gd_mgmt_v3_post_validate_req (op, op_ret, req_dict, peerinfo, + &args, MY_UUID, peer_uuid); + peer_cnt++; + } + gd_synctask_barrier_wait((&args), peer_cnt); + + if (args.op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "Post Validation failed on peers"); + + if (args.errstr) + *op_errstr = gf_strdup (args.errstr); + } + + ret = args.op_ret; + + gf_log (this->name, GF_LOG_DEBUG, "Sent post valaidation req for %s " + "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret); +out: + return ret; +} + +int32_t +gd_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int32_t ret = -1; + struct syncargs *args = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + gd1_mgmt_v3_unlock_rsp rsp = {{0},}; + call_frame_t *frame = NULL; + int32_t op_ret = -1; + int32_t op_errno = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (myframe); + + frame = myframe; + args = frame->local; + peerinfo = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + if (!iov) { + gf_log (this->name, GF_LOG_ERROR, "iov is NULL"); + op_errno = EINVAL; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp); + if (ret < 0) + goto out; + + uuid_copy (args->uuid, rsp.uuid); + + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; + +out: + gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_V3_UNLOCK, + peerinfo, rsp.uuid); + free (rsp.dict.dict_val); + STACK_DESTROY (frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_mgmt_v3_unlock_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + gd_mgmt_v3_unlock_cbk_fn); +} + +int +gd_mgmt_v3_unlock (glusterd_op_t op, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid) +{ + int32_t ret = -1; + gd1_mgmt_v3_unlock_req req = {{0},}; + glusterd_conf_t *conf = THIS->private; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (op_ctx); + GF_ASSERT (peerinfo); + GF_ASSERT (args); + + ret = dict_allocate_and_serialize (op_ctx, + &req.dict.dict_val, + &req.dict.dict_len); + if (ret) + goto out; + + uuid_copy (req.uuid, my_uuid); + req.op = op; + synclock_unlock (&conf->big_lock); + ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, + &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_UNLOCK, + gd_mgmt_v3_unlock_cbk, + (xdrproc_t) xdr_gd1_mgmt_v3_unlock_req); + synclock_lock (&conf->big_lock); +out: + GF_FREE (req.dict.dict_val); + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_mgmt_v3_release_peer_locks (glusterd_conf_t *conf, glusterd_op_t op, + dict_t *dict, int32_t op_ret, + char **op_errstr, int npeers, + gf_boolean_t is_acquired) +{ + int32_t ret = -1; + int32_t peer_cnt = 0; + uuid_t peer_uuid = {0}; + xlator_t *this = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + struct syncargs args = {0}; + struct list_head *peers = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (conf); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + + peers = &conf->xaction_peers; + + /* If the lock has not been held during this + * transaction, do not send unlock requests */ + if (!is_acquired) + goto out; + + if (!npeers) { + ret = 0; + goto out; + } + + /* Sending mgmt_v3 unlock req to other nodes in the cluster */ + gd_syncargs_init (&args, NULL); + synctask_barrier_init((&args)); + peer_cnt = 0; + list_for_each_entry (peerinfo, peers, op_peers_list) { + gd_mgmt_v3_unlock (op, dict, peerinfo, &args, + MY_UUID, peer_uuid); + peer_cnt++; + } + gd_synctask_barrier_wait((&args), peer_cnt); + + if (args.op_ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unlock failed on peers"); + + if (!op_ret && args.errstr) + *op_errstr = gf_strdup (args.errstr); + } + + ret = args.op_ret; + + gf_log (this->name, GF_LOG_DEBUG, "Sent unlock op req for %s " + "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret); + +out: + return ret; +} + +int32_t +glusterd_mgmt_v3_initiate_all_phases (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict) +{ + int32_t ret = -1; + int32_t op_ret = -1; + int32_t npeers = 0; + dict_t *req_dict = NULL; + dict_t *tmp_dict = NULL; + glusterd_conf_t *conf = NULL; + char *op_errstr = NULL; + xlator_t *this = NULL; + gf_boolean_t is_acquired = _gf_false; + uuid_t *originator_uuid = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (dict); + conf = this->private; + GF_ASSERT (conf); + + /* Save the MY_UUID as the originator_uuid. This originator_uuid + * will be used by is_origin_glusterd() to determine if a node + * is the originator node for a command. */ + originator_uuid = GF_CALLOC (1, sizeof(uuid_t), + gf_common_mt_uuid_t); + if (!originator_uuid) { + ret = -1; + goto out; + } + + uuid_copy (*originator_uuid, MY_UUID); + ret = dict_set_bin (dict, "originator_uuid", + originator_uuid, sizeof (uuid_t)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set originator_uuid."); + goto out; + } + + /* Marking the operation as complete synctasked */ + ret = dict_set_int32 (dict, "is_synctasked", _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set synctasked flag."); + goto out; + } + + /* Use a copy at local unlock as cli response will be sent before + * the unlock and the volname in the dict might be removed */ + tmp_dict = dict_new(); + if (!tmp_dict) { + gf_log (this->name, GF_LOG_ERROR, "Unable to create dict"); + goto out; + } + dict_copy (dict, tmp_dict); + + /* BUILD PEERS LIST */ + INIT_LIST_HEAD (&conf->xaction_peers); + npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op); + + /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */ + ret = glusterd_mgmt_v3_initiate_lockdown (conf, op, dict, &op_errstr, + npeers, &is_acquired); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "mgmt_v3 lockdown failed."); + goto out; + } + + /* BUILD PAYLOAD */ + ret = glusterd_mgmt_v3_build_payload (&req_dict, &op_errstr, dict, op); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD, + gd_op_list[op]); + if (op_errstr == NULL) + gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD); + goto out; + } + + /* PRE-COMMIT VALIDATE PHASE */ + ret = glusterd_mgmt_v3_pre_validate (conf, op, req_dict, + &op_errstr, npeers); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Pre Validation Failed"); + goto out; + } + + /* COMMIT OP PHASE */ + ret = glusterd_mgmt_v3_commit (conf, op, dict, req_dict, + &op_errstr, npeers); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Commit Op Failed"); + goto out; + } + + /* POST-COMMIT VALIDATE PHASE */ + /* As of now, post_validate is not handling any other + commands other than snapshot. So as of now, I am + sending 0 (op_ret as 0). + */ + ret = glusterd_mgmt_v3_post_validate (conf, op, 0, dict, req_dict, + &op_errstr, npeers); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Post Validation Failed"); + goto out; + } + + ret = 0; +out: + op_ret = ret; + /* UNLOCK PHASE FOR PEERS*/ + (void) glusterd_mgmt_v3_release_peer_locks (conf, op, dict, + op_ret, &op_errstr, + npeers, is_acquired); + + /* LOCAL VOLUME(S) UNLOCK */ + if (is_acquired) { + /* Trying to release multiple mgmt_v3 locks */ + ret = glusterd_multiple_mgmt_v3_unlock (tmp_dict, MY_UUID); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to release mgmt_v3 locks on localhost"); + op_ret = ret; + } + } + + /* SEND CLI RESPONSE */ + glusterd_op_send_cli_response (op, op_ret, 0, req, dict, op_errstr); + + if (req_dict) + dict_unref (req_dict); + + if (tmp_dict) + dict_unref (tmp_dict); + + if (op_errstr) { + GF_FREE (op_errstr); + op_errstr = NULL; + } + + return 0; +} + +int32_t +glusterd_mgmt_v3_initiate_snap_phases (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict) +{ + int32_t ret = -1; + int32_t op_ret = -1; + int32_t npeers = 0; + dict_t *req_dict = NULL; + dict_t *tmp_dict = NULL; + glusterd_conf_t *conf = NULL; + char *op_errstr = NULL; + xlator_t *this = NULL; + gf_boolean_t is_acquired = _gf_false; + uuid_t *originator_uuid = NULL; + gf_boolean_t success = _gf_false; + char *tmp_errstr = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (dict); + conf = this->private; + GF_ASSERT (conf); + + /* Save the MY_UUID as the originator_uuid. This originator_uuid + * will be used by is_origin_glusterd() to determine if a node + * is the originator node for a command. */ + originator_uuid = GF_CALLOC (1, sizeof(uuid_t), + gf_common_mt_uuid_t); + if (!originator_uuid) { + ret = -1; + goto out; + } + + uuid_copy (*originator_uuid, MY_UUID); + ret = dict_set_bin (dict, "originator_uuid", + originator_uuid, sizeof (uuid_t)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set originator_uuid."); + goto out; + } + + /* Marking the operation as complete synctasked */ + ret = dict_set_int32 (dict, "is_synctasked", _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set synctasked flag."); + goto out; + } + + /* Use a copy at local unlock as cli response will be sent before + * the unlock and the volname in the dict might be removed */ + tmp_dict = dict_new(); + if (!tmp_dict) { + gf_log (this->name, GF_LOG_ERROR, "Unable to create dict"); + goto out; + } + dict_copy (dict, tmp_dict); + + /* BUILD PEERS LIST */ + INIT_LIST_HEAD (&conf->xaction_peers); + npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op); + + /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */ + ret = glusterd_mgmt_v3_initiate_lockdown (conf, op, dict, &op_errstr, + npeers, &is_acquired); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "mgmt_v3 lockdown failed."); + goto out; + } + + /* BUILD PAYLOAD */ + ret = glusterd_mgmt_v3_build_payload (&req_dict, &op_errstr, dict, op); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD, + gd_op_list[op]); + if (op_errstr == NULL) + gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD); + goto out; + } + + /* PRE-COMMIT VALIDATE PHASE */ + ret = glusterd_mgmt_v3_pre_validate (conf, op, req_dict, + &op_errstr, npeers); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Pre Validation Failed"); + goto out; + } + + /* BRICK OP PHASE for initiating barrier*/ + ret = dict_set_int32 (req_dict, "barrier", 1); + if (ret) + goto out; + ret = glusterd_mgmt_v3_brick_op (conf, op, req_dict, + &op_errstr, npeers); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Brick Ops Failed"); + goto unbarrier; + } + + /* COMMIT OP PHASE */ + /* TODO: As of now, the plan is to do quorum check before sending the + commit fop and if the quorum succeeds, then commit is sent to all + the other glusterds. + snap create functionality now creates the in memory and on disk + objects for the snapshot (marking them as incomplete), takes the lvm + snapshot and then updates the status of the in memory and on disk + snap objects as complete. Suppose one of the glusterds goes down + after taking the lvm snapshot, but before updating the snap object, + then treat it as a snapshot create failure and trigger cleanup. + i.e the number of commit responses received by the originator + glusterd shold be the same as the number of peers it has sent the + request to (i.e npeers variable). If not, then originator glusterd + will initiate cleanup in post-validate fop. + Question: What if one of the other glusterds goes down as explained + above and along with it the originator glusterd also goes down? + Who will initiate the cleanup? + */ + ret = glusterd_mgmt_v3_commit (conf, op, dict, req_dict, + &op_errstr, npeers); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Commit Op Failed"); + /* If the main op fails, we should save the error string. + Because, op_errstr will be used for unbarrier and + unlock ops also. We might lose the actual error that + caused the failure. + */ + tmp_errstr = op_errstr; + op_errstr = NULL; + goto unbarrier; + } + + success = _gf_true; +unbarrier: + /* BRICK OP PHASE for removing the barrier*/ + ret = dict_set_int32 (req_dict, "barrier", 0); + if (ret) + goto out; + ret = glusterd_mgmt_v3_brick_op (conf, op, req_dict, + &op_errstr, npeers); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Brick Ops Failed"); + goto out; + } + + ret = 0; + +out: + op_ret = ret; + + if (success == _gf_false) + op_ret = -1; + + /* POST-COMMIT VALIDATE PHASE */ + ret = glusterd_mgmt_v3_post_validate (conf, op, op_ret, dict, req_dict, + &op_errstr, npeers); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Post Validation Failed"); + op_ret = -1; + } + + /* UNLOCK PHASE FOR PEERS*/ + (void) glusterd_mgmt_v3_release_peer_locks (conf, op, dict, + op_ret, &op_errstr, + npeers, is_acquired); + + /* If the commit op (snapshot taking) failed, then the error is stored + in tmp_errstr and unbarrier is called. Suppose, if unbarrier also + fails, then the error happened in unbarrier is logged and freed. + The error happened in commit op, which is stored in tmp_errstr + is sent to cli. + */ + if (tmp_errstr) { + if (op_errstr) { + gf_log (this->name, GF_LOG_ERROR, "unbarrier brick op" + "failed with the error %s", op_errstr); + GF_FREE (op_errstr); + op_errstr = NULL; + } + op_errstr = tmp_errstr; + } + + /* LOCAL VOLUME(S) UNLOCK */ + if (is_acquired) { + /* Trying to release multiple mgmt_v3 locks */ + ret = glusterd_multiple_mgmt_v3_unlock (tmp_dict, MY_UUID); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to release mgmt_v3 locks on localhost"); + op_ret = ret; + } + } + + /* SEND CLI RESPONSE */ + glusterd_op_send_cli_response (op, op_ret, 0, req, dict, op_errstr); + + if (req_dict) + dict_unref (req_dict); + + if (tmp_dict) + dict_unref (tmp_dict); + + if (op_errstr) { + GF_FREE (op_errstr); + op_errstr = NULL; + } + + return 0; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-mgmt.h new file mode 100644 index 000000000..b185a9bec --- /dev/null +++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.h @@ -0,0 +1,45 @@ +/* + Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _GLUSTERD_MGMT_H_ +#define _GLUSTERD_MGMT_H_ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +int32_t +gd_mgmt_v3_pre_validate_fn (glusterd_op_t op, dict_t *dict, + char **op_errstr, dict_t *rsp_dict); + +int32_t +gd_mgmt_v3_brick_op_fn (glusterd_op_t op, dict_t *dict, + char **op_errstr, dict_t *rsp_dict); + +int32_t +gd_mgmt_v3_commit_fn (glusterd_op_t op, dict_t *dict, + char **op_errstr, dict_t *rsp_dict); + +int32_t +gd_mgmt_v3_post_validate_fn (glusterd_op_t op, int32_t op_ret, dict_t *dict, + char **op_errstr, dict_t *rsp_dict); + +int32_t +glusterd_mgmt_v3_initiate_all_phases (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict); + +int32_t +glusterd_mgmt_v3_initiate_snap_phases (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict); + +int +glusterd_snap_pre_validate_use_rsp_dict (dict_t *dst, dict_t *src); + +#endif /* _GLUSTERD_MGMT_H_ */ diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index 7f19be072..1666f5e4d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -37,6 +37,7 @@ #include "glusterd-store.h" #include "glusterd-hooks.h" #include "glusterd-volgen.h" +#include "glusterd-locks.h" #include "syscall.h" #include "cli1-xdr.h" #include "common-utils.h" @@ -67,6 +68,165 @@ static struct list_head gd_op_sm_queue; pthread_mutex_t gd_op_sm_lock; glusterd_op_info_t opinfo = {{0},}; +uuid_t global_txn_id = {"\0"}; /* To be used in + * heterogeneous + * cluster with no + * transaction ids */ + +static dict_t *txn_opinfo; + +struct txn_opinfo_object_ { + glusterd_op_info_t opinfo; +}; +typedef struct txn_opinfo_object_ txn_opinfo_obj; + +int32_t +glusterd_txn_opinfo_dict_init () +{ + int32_t ret = -1; + + txn_opinfo = dict_new (); + if (!txn_opinfo) { + ret = -1; + goto out; + } + + ret = 0; +out: + return ret; +} + +void +glusterd_txn_opinfo_dict_fini () +{ + if (txn_opinfo) + dict_destroy (txn_opinfo); +} + +void +glusterd_txn_opinfo_init (glusterd_op_info_t *opinfo, + glusterd_op_sm_state_info_t *state, + glusterd_op_t *op, + dict_t *op_ctx, + rpcsvc_request_t *req) +{ + GF_ASSERT (opinfo); + + if (state) + opinfo->state = *state; + + if (op) + opinfo->op = *op; + + opinfo->op_ctx = dict_ref(op_ctx); + + if (req) + opinfo->req = req; + + return; +} + +int32_t +glusterd_get_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo) +{ + int32_t ret = -1; + txn_opinfo_obj *opinfo_obj = NULL; + + if (!txn_id || !opinfo) { + gf_log ("", GF_LOG_ERROR, + "Empty transaction id or opinfo received."); + ret = -1; + goto out; + } + + ret = dict_get_bin(txn_opinfo, uuid_utoa (*txn_id), + (void **) &opinfo_obj); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get transaction opinfo"); + goto out; + } + + (*opinfo) = opinfo_obj->opinfo; + + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int32_t +glusterd_set_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo) +{ + int32_t ret = -1; + txn_opinfo_obj *opinfo_obj = NULL; + + if (!txn_id) { + gf_log ("", GF_LOG_ERROR, "Empty transaction id received."); + ret = -1; + goto out; + } + + ret = dict_get_bin(txn_opinfo, uuid_utoa (*txn_id), + (void **) &opinfo_obj); + if (ret) { + opinfo_obj = GF_CALLOC (1, sizeof(txn_opinfo_obj), + gf_common_mt_txn_opinfo_obj_t); + if (!opinfo_obj) { + ret = -1; + goto out; + } + + ret = dict_set_bin(txn_opinfo, uuid_utoa (*txn_id), opinfo_obj, + sizeof(txn_opinfo_obj)); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to set opinfo for transaction ID : %s", + uuid_utoa (*txn_id)); + goto out; + } + } + + opinfo_obj->opinfo = (*opinfo); + + ret = 0; +out: + if (ret) + if (opinfo_obj) + GF_FREE (opinfo_obj); + + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int32_t +glusterd_clear_txn_opinfo (uuid_t *txn_id) +{ + int32_t ret = -1; + glusterd_op_info_t txn_op_info = {{0},}; + + if (!txn_id) { + gf_log ("", GF_LOG_ERROR, "Empty transaction id received."); + ret = -1; + goto out; + } + + ret = glusterd_get_txn_opinfo (txn_id, &txn_op_info); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Transaction opinfo not found"); + goto out; + } + + dict_unref (txn_op_info.op_ctx); + + dict_del(txn_opinfo, uuid_utoa (*txn_id)); + + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + static int glusterfs_port = GLUSTERD_DEFAULT_PORT; static char *glusterd_op_sm_state_names[] = { "Default", @@ -147,10 +307,10 @@ glusterd_is_volume_started (glusterd_volinfo_t *volinfo) } static int -glusterd_op_sm_inject_all_acc () +glusterd_op_sm_inject_all_acc (uuid_t *txn_id) { int32_t ret = -1; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, txn_id, NULL); gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; } @@ -235,20 +395,20 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin brick_req->name = gf_strdup (name); break; - -#ifdef HAVE_BD_XLATOR - case GD_OP_BD_OP: - { + case GD_OP_SNAP: brick_req = GF_CALLOC (1, sizeof (*brick_req), gf_gld_mt_mop_brick_req_t); if (!brick_req) goto out; - brick_req->op = GLUSTERD_BRICK_BD_OP; - brick_req->name = ""; - } + brick_req->op = GLUSTERD_VOLUME_BARRIER_OP; + ret = dict_get_str (dict, "volname", &volname); + if (ret) + goto out; + snprintf (name, 1024, "%s-server",volname); + brick_req->name = gf_strdup (name); + break; -#endif default: goto out; break; @@ -416,11 +576,17 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr) if (!val_dict) goto out; + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name"); + goto out; + } + /* Check if we can support the required op-version * This check is not done on the originator glusterd. The originator * glusterd sets this value. */ - origin_glusterd = is_origin_glusterd (); + origin_glusterd = is_origin_glusterd (dict); if (!origin_glusterd) { /* Check for v3.3.x origin glusterd */ @@ -828,7 +994,7 @@ glusterd_op_stage_sync_volume (dict_t *dict, char **op_errstr) goto out; } - if (glusterd_is_local_addr (hostname)) { + if (gf_is_local_addr (hostname)) { //volname is not present in case of sync all ret = dict_get_str (dict, "volname", &volname); if (!ret) { @@ -1101,14 +1267,17 @@ _delete_reconfig_opt (dict_t *this, char *key, data_t *value, void *data) GF_ASSERT (data); is_force = (int32_t*)data; - if (*is_force != 1 && - (_gf_true == glusterd_check_voloption_flags (key, - OPT_FLAG_FORCE))) { + if (*is_force != 1) { + if (_gf_true == glusterd_check_voloption_flags (key, + OPT_FLAG_FORCE)) { /* indicate to caller that we don't set the option * due to being protected */ - *is_force = -1; - goto out; + *is_force = *is_force | GD_OP_PROTECTED; + goto out; + } else { + *is_force = *is_force | GD_OP_UNPROTECTED; + } } gf_log ("", GF_LOG_DEBUG, "deleting dict with key=%s,value=%s", @@ -1160,8 +1329,9 @@ glusterd_options_reset (glusterd_volinfo_t *volinfo, char *key, _delete_reconfig_opt (volinfo->dict, key, value, is_force); } - ret = glusterd_create_volfiles_and_notify_services (volinfo); + gd_update_volume_op_versions (volinfo); + ret = glusterd_create_volfiles_and_notify_services (volinfo); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Unable to create volfile for" " 'volume reset'"); @@ -1276,7 +1446,7 @@ out: } static int -glusterd_op_reset_volume (dict_t *dict, char **op_errstr) +glusterd_op_reset_volume (dict_t *dict, char **op_rspstr) { glusterd_volinfo_t *volinfo = NULL; int ret = -1; @@ -1331,14 +1501,20 @@ glusterd_op_reset_volume (dict_t *dict, char **op_errstr) quorum_action = _gf_true; ret = glusterd_options_reset (volinfo, key, &is_force); - if (is_force == -1) { - ret = -1; - gf_asprintf(op_errstr, "'%s' is protected. To reset use 'force'.", - key); + if (ret == -1) { + gf_asprintf(op_rspstr, "Volume reset : failed"); + } else if (is_force & GD_OP_PROTECTED) { + if (is_force & GD_OP_UNPROTECTED) { + gf_asprintf (op_rspstr, "All unprotected fields were" + " reset. To reset the protected fields," + " use 'force'."); + } else { + ret = -1; + gf_asprintf (op_rspstr, "'%s' is protected. To reset" + " use 'force'.", key); + } } - gd_update_volume_op_versions (volinfo); - out: GF_FREE (key_fixed); if (quorum_action) @@ -1366,14 +1542,25 @@ glusterd_stop_bricks (glusterd_volinfo_t *volinfo) int glusterd_start_bricks (glusterd_volinfo_t *volinfo) { - glusterd_brickinfo_t *brickinfo = NULL; + int ret = -1; + glusterd_brickinfo_t *brickinfo = NULL; + + GF_ASSERT (volinfo); list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { - if (glusterd_brick_start (volinfo, brickinfo, _gf_false)) - return -1; + ret = glusterd_brick_start (volinfo, brickinfo, _gf_false); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, + "Failed to start %s:%s for %s", + brickinfo->hostname, brickinfo->path, + volinfo->volname); + goto out; + } } - return 0; + ret = 0; +out: + return ret; } static int @@ -1476,6 +1663,7 @@ glusterd_op_set_volume (dict_t *dict) char str[50] = {0, }; char *op_errstr = NULL; gf_boolean_t global_opt = _gf_false; + gf_boolean_t global_opts_set = _gf_false; glusterd_volinfo_t *voliter = NULL; int32_t dict_count = 0; gf_boolean_t check_op_version = _gf_false; @@ -1537,7 +1725,6 @@ glusterd_op_set_volume (dict_t *dict) for (count = 1; ret != -1 ; count++) { - global_opt = _gf_false; sprintf (str, "key%d", count); ret = dict_get_str (dict, str, &key); if (ret) @@ -1585,8 +1772,11 @@ glusterd_op_set_volume (dict_t *dict) } } - if (glusterd_check_globaloption (key)) + global_opt = _gf_false; + if (glusterd_check_globaloption (key)) { global_opt = _gf_true; + global_opts_set = _gf_true; + } if (!global_opt) value = gf_strdup (value); @@ -1642,7 +1832,8 @@ glusterd_op_set_volume (dict_t *dict) } } - if (!global_opt) { + if (!global_opts_set) { + gd_update_volume_op_versions (volinfo); ret = glusterd_create_volfiles_and_notify_services (volinfo); if (ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1664,11 +1855,11 @@ glusterd_op_set_volume (dict_t *dict) goto out; } } - gd_update_volume_op_versions (volinfo); } else { list_for_each_entry (voliter, &priv->volumes, vol_list) { volinfo = voliter; + gd_update_volume_op_versions (volinfo); ret = glusterd_create_volfiles_and_notify_services (volinfo); if (ret) { gf_log (this->name, GF_LOG_ERROR, @@ -1691,7 +1882,6 @@ glusterd_op_set_volume (dict_t *dict) goto out; } } - gd_update_volume_op_versions (volinfo); } } @@ -1731,7 +1921,7 @@ glusterd_op_sync_volume (dict_t *dict, char **op_errstr, goto out; } - if (!glusterd_is_local_addr (hostname)) { + if (!gf_is_local_addr (hostname)) { ret = 0; goto out; } @@ -1902,6 +2092,105 @@ out: } static int +_add_brick_name_to_dict (dict_t *dict, char *key, glusterd_brickinfo_t *brick) +{ + int ret = -1; + char tmp[1024] = {0,}; + char *brickname = NULL; + xlator_t *this = NULL; + + GF_ASSERT (dict); + GF_ASSERT (key); + GF_ASSERT (brick); + + this = THIS; + GF_ASSERT (this); + + snprintf (tmp, sizeof (tmp), "%s:%s", brick->hostname, brick->path); + brickname = gf_strdup (tmp); + if (!brickname) { + gf_log (this->name, GF_LOG_ERROR, "Failed to dup brick name"); + goto out; + } + + ret = dict_set_dynstr (dict, key, brickname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add brick name to dict"); + goto out; + } + brickname = NULL; +out: + if (brickname) + GF_FREE (brickname); + return ret; +} + +static int +_add_remove_bricks_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, + char *prefix) +{ + int ret = -1; + int count = 0; + int i = 0; + char brick_key[1024] = {0,}; + char dict_key[1024] ={0,}; + char *brick = NULL; + xlator_t *this = NULL; + + GF_ASSERT (dict); + GF_ASSERT (volinfo); + GF_ASSERT (prefix); + + this = THIS; + GF_ASSERT (this); + + ret = dict_get_int32 (volinfo->rebal.dict, "count", &count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get brick count"); + goto out; + } + + snprintf (dict_key, sizeof (dict_key), "%s.count", prefix); + ret = dict_set_int32 (dict, dict_key, count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set brick count in dict"); + goto out; + } + + for (i = 1; i <= count; i++) { + memset (brick_key, 0, sizeof (brick_key)); + snprintf (brick_key, sizeof (brick_key), "brick%d", i); + + ret = dict_get_str (volinfo->rebal.dict, brick_key, &brick); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to get %s", brick_key); + goto out; + } + + memset (dict_key, 0, sizeof (dict_key)); + snprintf (dict_key, sizeof (dict_key), "%s.%s", prefix, + brick_key); + ret = dict_set_str (dict, dict_key, brick); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add brick to dict"); + goto out; + } + brick = NULL; + } + +out: + return ret; +} + +/* This adds the respective task-id and all available parameters of a task into + * a dictionary + */ +static int _add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index) { @@ -1918,13 +2207,34 @@ _add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index) GF_ASSERT (this); switch (op) { - case GD_OP_REBALANCE: case GD_OP_REMOVE_BRICK: + snprintf (key, sizeof (key), "task%d", index); + ret = _add_remove_bricks_to_dict (dict, volinfo, key); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add remove bricks to dict"); + goto out; + } + case GD_OP_REBALANCE: uuid_str = gf_strdup (uuid_utoa (volinfo->rebal.rebalance_id)); status = volinfo->rebal.defrag_status; break; case GD_OP_REPLACE_BRICK: + snprintf (key, sizeof (key), "task%d.src-brick", index); + ret = _add_brick_name_to_dict (dict, key, + volinfo->rep_brick.src_brick); + if (ret) + goto out; + memset (key, 0, sizeof (key)); + + snprintf (key, sizeof (key), "task%d.dst-brick", index); + ret = _add_brick_name_to_dict (dict, key, + volinfo->rep_brick.dst_brick); + if (ret) + goto out; + memset (key, 0, sizeof (key)); + uuid_str = gf_strdup (uuid_utoa (volinfo->rep_brick.rb_id)); status = volinfo->rep_brick.rb_status; break; @@ -1937,8 +2247,7 @@ _add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index) } snprintf (key, sizeof (key), "task%d.type", index); - ret = dict_set_str (dict, key, - (char *)gd_op_list[op]); + ret = dict_set_str (dict, key, (char *)gd_op_list[op]); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Error setting task type in dict"); @@ -1948,7 +2257,6 @@ _add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index) memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "task%d.id", index); - if (!uuid_str) goto out; ret = dict_set_dynstr (dict, key, uuid_str); @@ -1975,6 +2283,50 @@ out: } static int +glusterd_aggregate_task_status (dict_t *rsp_dict, glusterd_volinfo_t *volinfo) +{ + int ret = -1; + int tasks = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + + if (!uuid_is_null (volinfo->rebal.rebalance_id)) { + ret = _add_task_to_dict (rsp_dict, volinfo, volinfo->rebal.op, + tasks); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add task details to dict"); + goto out; + } + tasks++; + } + + if (!uuid_is_null (volinfo->rep_brick.rb_id)) { + ret = _add_task_to_dict (rsp_dict, volinfo, GD_OP_REPLACE_BRICK, + tasks); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add task details to dict"); + goto out; + } + tasks++; + } + + ret = dict_set_int32 (rsp_dict, "tasks", tasks); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Error setting tasks count in dict"); + goto out; + } + ret = 0; + +out: + return ret; +} + +static int glusterd_op_status_volume (dict_t *dict, char **op_errstr, dict_t *rsp_dict) { @@ -1994,7 +2346,6 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr, gf_boolean_t nfs_disabled = _gf_false; gf_boolean_t shd_enabled = _gf_true; gf_boolean_t origin_glusterd = _gf_false; - int tasks = 0; this = THIS; GF_ASSERT (this); @@ -2004,13 +2355,13 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr, GF_ASSERT (dict); - origin_glusterd = is_origin_glusterd (); + origin_glusterd = is_origin_glusterd (dict); ret = dict_get_uint32 (dict, "cmd", &cmd); if (ret) goto out; - if (is_origin_glusterd ()) { + if (origin_glusterd) { ret = 0; if ((cmd & GF_CLI_STATUS_ALL)) { ret = glusterd_get_all_volnames (rsp_dict); @@ -2077,6 +2428,10 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr, brick_index); node_count++; + } else if ((cmd & GF_CLI_STATUS_TASKS) != 0) { + ret = glusterd_aggregate_task_status (rsp_dict, volinfo); + goto out; + } else { list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { brick_index++; @@ -2150,35 +2505,18 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr, } /* Active tasks */ + /* Tasks are added only for normal volume status request for either a + * single volume or all volumes, and only by the origin glusterd + */ if (((cmd & GF_CLI_STATUS_MASK) != GF_CLI_STATUS_NONE) || + !(cmd & (GF_CLI_STATUS_VOL | GF_CLI_STATUS_ALL)) || !origin_glusterd) goto out; - if (!uuid_is_null (volinfo->rebal.rebalance_id)) { - ret = _add_task_to_dict (rsp_dict, volinfo, volinfo->rebal.op, - tasks); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to add task details to dict"); - goto out; - } - tasks++; - } - if (!uuid_is_null (volinfo->rep_brick.rb_id)) { - ret = _add_task_to_dict (rsp_dict, volinfo, GD_OP_REPLACE_BRICK, - tasks); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to add task details to dict"); - goto out; - } - tasks++; - } - - ret = dict_set_int32 (rsp_dict, "tasks", tasks); + ret = glusterd_aggregate_task_status (rsp_dict, volinfo); if (ret) - gf_log (this->name, GF_LOG_ERROR, - "Error setting tasks count in dict"); + goto out; + ret = 0; out: gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -2205,6 +2543,7 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx) xlator_t *this = NULL; glusterd_peerinfo_t *peerinfo = NULL; uint32_t pending_count = 0; + dict_t *dict = NULL; this = THIS; priv = this->private; @@ -2219,27 +2558,60 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx) (glusterd_op_get_op() != GD_OP_SYNC_VOLUME)) continue; - proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_LOCK]; - if (proc->fn) { - ret = proc->fn (NULL, this, peerinfo); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "Failed to " - "send lock request for operation " - "'Volume %s' to peer %s", - gd_op_list[opinfo.op], - peerinfo->hostname); - continue; + /* Based on the op_version, acquire a cluster or mgmt_v3 lock */ + if (priv->op_version < 3) { + proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_LOCK]; + if (proc->fn) { + ret = proc->fn (NULL, this, peerinfo); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Failed to send lock request " + "for operation 'Volume %s' to " + "peer %s", + gd_op_list[opinfo.op], + peerinfo->hostname); + continue; + } + pending_count++; + } + } else { + dict = glusterd_op_get_ctx (); + dict_ref (dict); + + proc = &peerinfo->mgmt_v3->proctable + [GLUSTERD_MGMT_V3_LOCK]; + if (proc->fn) { + ret = dict_set_static_ptr (dict, "peerinfo", + peerinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set peerinfo"); + dict_unref (dict); + goto out; + } + + ret = proc->fn (NULL, this, dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Failed to send mgmt_v3 lock " + "request for operation " + "'Volume %s' to peer %s", + gd_op_list[opinfo.op], + peerinfo->hostname); + dict_unref (dict); + continue; + } + pending_count++; } - pending_count++; } } opinfo.pending_count = pending_count; if (!opinfo.pending_count) - ret = glusterd_op_sm_inject_all_acc (); + ret = glusterd_op_sm_inject_all_acc (&event->txn_id); +out: gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret); - return ret; } @@ -2252,17 +2624,12 @@ glusterd_op_ac_send_unlock (glusterd_op_sm_event_t *event, void *ctx) xlator_t *this = NULL; glusterd_peerinfo_t *peerinfo = NULL; uint32_t pending_count = 0; + dict_t *dict = NULL; this = THIS; priv = this->private; GF_ASSERT (priv); - /*ret = glusterd_unlock (MY_UUID); - - if (ret) - goto out; - */ - list_for_each_entry (peerinfo, &priv->peers, uuid_list) { GF_ASSERT (peerinfo); @@ -2272,29 +2639,62 @@ glusterd_op_ac_send_unlock (glusterd_op_sm_event_t *event, void *ctx) (glusterd_op_get_op() != GD_OP_SYNC_VOLUME)) continue; - proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_UNLOCK]; - if (proc->fn) { - ret = proc->fn (NULL, this, peerinfo); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, "Failed to " - "send unlock request for operation " - "'Volume %s' to peer %s", - gd_op_list[opinfo.op], - peerinfo->hostname); - continue; + /* Based on the op_version, release the * + * cluster or mgmt_v3 lock */ + if (priv->op_version < 3) { + proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_UNLOCK]; + if (proc->fn) { + ret = proc->fn (NULL, this, peerinfo); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Failed to send unlock request " + "for operation 'Volume %s' to " + "peer %s", + gd_op_list[opinfo.op], + peerinfo->hostname); + continue; + } + pending_count++; + } + } else { + dict = glusterd_op_get_ctx (); + dict_ref (dict); + + proc = &peerinfo->mgmt_v3->proctable + [GLUSTERD_MGMT_V3_UNLOCK]; + if (proc->fn) { + ret = dict_set_static_ptr (dict, "peerinfo", + peerinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set peerinfo"); + dict_unref (dict); + goto out; + } + + ret = proc->fn (NULL, this, dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Failed to send mgmt_v3 unlock " + "request for operation " + "'Volume %s' to peer %s", + gd_op_list[opinfo.op], + peerinfo->hostname); + dict_unref (dict); + continue; + } + pending_count++; } - pending_count++; } } opinfo.pending_count = pending_count; if (!opinfo.pending_count) - ret = glusterd_op_sm_inject_all_acc (); + ret = glusterd_op_sm_inject_all_acc (&event->txn_id); +out: gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret); - return ret; - } static int @@ -2306,7 +2706,8 @@ glusterd_op_ac_ack_drain (glusterd_op_sm_event_t *event, void *ctx) opinfo.pending_count--; if (!opinfo.pending_count) - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, + &event->txn_id, NULL); gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret); @@ -2322,43 +2723,94 @@ glusterd_op_ac_send_unlock_drain (glusterd_op_sm_event_t *event, void *ctx) static int glusterd_op_ac_lock (glusterd_op_sm_event_t *event, void *ctx) { - glusterd_op_lock_ctx_t *lock_ctx = NULL; - int32_t ret = 0; + int32_t ret = 0; + char *volname = NULL; + glusterd_op_lock_ctx_t *lock_ctx = NULL; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; GF_ASSERT (event); GF_ASSERT (ctx); + this = THIS; + priv = this->private; + lock_ctx = (glusterd_op_lock_ctx_t *)ctx; - ret = glusterd_lock (lock_ctx->uuid); + /* If the req came from a node running on older op_version + * the dict won't be present. Based on it acquiring a cluster + * or mgmt_v3 lock */ + if (lock_ctx->dict == NULL) { + ret = glusterd_lock (lock_ctx->uuid); + glusterd_op_lock_send_resp (lock_ctx->req, ret); + } else { + ret = dict_get_str (lock_ctx->dict, "volname", &volname); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to acquire volname"); + else { + ret = glusterd_mgmt_v3_lock (volname, lock_ctx->uuid, + "vol"); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to acquire lock for %s", + volname); + } - gf_log (THIS->name, GF_LOG_DEBUG, "Lock Returned %d", ret); + glusterd_op_mgmt_v3_lock_send_resp (lock_ctx->req, + &event->txn_id, ret); - glusterd_op_lock_send_resp (lock_ctx->req, ret); + dict_unref (lock_ctx->dict); + } + gf_log (THIS->name, GF_LOG_DEBUG, "Lock Returned %d", ret); return ret; } static int glusterd_op_ac_unlock (glusterd_op_sm_event_t *event, void *ctx) { - int ret = 0; - glusterd_op_lock_ctx_t *lock_ctx = NULL; - xlator_t *this = NULL; - glusterd_conf_t *priv = NULL; + int32_t ret = 0; + char *volname = NULL; + glusterd_op_lock_ctx_t *lock_ctx = NULL; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + GF_ASSERT (event); GF_ASSERT (ctx); this = THIS; priv = this->private; + lock_ctx = (glusterd_op_lock_ctx_t *)ctx; - ret = glusterd_unlock (lock_ctx->uuid); + /* If the req came from a node running on older op_version + * the dict won't be present. Based on it releasing the cluster + * or mgmt_v3 lock */ + if (lock_ctx->dict == NULL) { + ret = glusterd_unlock (lock_ctx->uuid); + glusterd_op_unlock_send_resp (lock_ctx->req, ret); + } else { + ret = dict_get_str (lock_ctx->dict, "volname", &volname); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to acquire volname"); + else { + ret = glusterd_mgmt_v3_unlock (volname, lock_ctx->uuid, + "vol"); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to release lock for %s", volname); + } + + glusterd_op_mgmt_v3_unlock_send_resp (lock_ctx->req, + &event->txn_id, ret); - gf_log (this->name, GF_LOG_DEBUG, "Unlock Returned %d", ret); + dict_unref (lock_ctx->dict); + } - glusterd_op_unlock_send_resp (lock_ctx->req, ret); + gf_log (this->name, GF_LOG_DEBUG, "Unlock Returned %d", ret); if (priv->pending_quorum_action) glusterd_do_quorum_action (); @@ -2396,7 +2848,8 @@ glusterd_op_ac_rcvd_lock_acc (glusterd_op_sm_event_t *event, void *ctx) if (opinfo.pending_count > 0) goto out; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, + &event->txn_id, NULL); gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -2504,12 +2957,13 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) } break; + case GD_OP_GSYNC_CREATE: case GD_OP_GSYNC_SET: { ret = glusterd_op_gsync_args_get (dict, &errstr, &volname, - NULL); + NULL, NULL); if (ret == 0) { ret = glusterd_dict_set_volid (dict, volname, op_errstr); @@ -2599,9 +3053,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) case GD_OP_STATEDUMP_VOLUME: case GD_OP_CLEARLOCKS_VOLUME: case GD_OP_DEFRAG_BRICK_VOLUME: -#ifdef HAVE_BD_XLATOR - case GD_OP_BD_OP: -#endif { ret = dict_get_str (dict, "volname", &volname); if (ret) { @@ -2622,6 +3073,18 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) } break; + case GD_OP_COPY_FILE: + { + dict_copy (dict, req_dict); + break; + } + + case GD_OP_SYS_EXEC: + { + dict_copy (dict, req_dict); + break; + } + default: break; } @@ -2818,7 +3281,8 @@ out: if (dict) dict_unref (dict); if (ret) { - glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL); + glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, + &event->txn_id, NULL); opinfo.op_ret = ret; } @@ -2827,7 +3291,7 @@ out: opinfo.pending_count); if (!opinfo.pending_count) - ret = glusterd_op_sm_inject_all_acc (); + ret = glusterd_op_sm_inject_all_acc (&event->txn_id); gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret); @@ -2836,10 +3300,10 @@ out: } static int32_t -glusterd_op_start_rb_timer (dict_t *dict) +glusterd_op_start_rb_timer (dict_t *dict, uuid_t *txn_id) { int32_t op = 0; - struct timeval timeout = {0, }; + struct timespec timeout = {0, }; glusterd_conf_t *priv = NULL; int32_t ret = -1; dict_t *rb_ctx = NULL; @@ -2855,12 +3319,12 @@ glusterd_op_start_rb_timer (dict_t *dict) } if (op != GF_REPLACE_OP_START) { - ret = glusterd_op_sm_inject_all_acc (); + ret = glusterd_op_sm_inject_all_acc (txn_id); goto out; } timeout.tv_sec = 5; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; rb_ctx = dict_copy (dict, rb_ctx); @@ -2870,6 +3334,17 @@ glusterd_op_start_rb_timer (dict_t *dict) ret = -1; goto out; } + + ret = dict_set_bin (rb_ctx, "transaction_id", + txn_id, sizeof(*txn_id)); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Failed to set transaction id."); + goto out; + } else + gf_log ("", GF_LOG_DEBUG, + "transaction_id = %s", uuid_utoa (*txn_id)); + priv->timer = gf_timer_call_after (THIS->ctx, timeout, glusterd_do_replace_brick, (void *) rb_ctx); @@ -2938,6 +3413,97 @@ out: return ret; } +static int +reassign_defrag_status (dict_t *dict, char *key, gf_defrag_status_t *status) +{ + int ret = 0; + + if (!*status) + return ret; + + switch (*status) { + case GF_DEFRAG_STATUS_STARTED: + *status = GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED; + break; + + case GF_DEFRAG_STATUS_STOPPED: + *status = GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED; + break; + + case GF_DEFRAG_STATUS_COMPLETE: + *status = GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE; + break; + + case GF_DEFRAG_STATUS_FAILED: + *status = GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED; + break; + default: + break; + } + + ret = dict_set_int32(dict, key, *status); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to reset defrag %s in dict", key); + + return ret; +} + +/* Check and reassign the defrag_status enum got from the rebalance process + * of all peers so that the rebalance-status CLI command can display if a + * full-rebalance or just a fix-layout was carried out. + */ +static int +glusterd_op_check_peer_defrag_status (dict_t *dict, int count) +{ + glusterd_volinfo_t *volinfo = NULL; + gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED; + char key[256] = {0,}; + char *volname = NULL; + int ret = -1; + int i = 1; + + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, "Unable to get volume name"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, FMTSTR_CHECK_VOL_EXISTS, + volname); + goto out; + } + + if (volinfo->rebal.defrag_cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + /* Fix layout was not issued; we don't need to reassign + the status */ + ret = 0; + goto out; + } + + do { + memset (key, 0, 256); + snprintf (key, 256, "status-%d", i); + ret = dict_get_int32 (dict, key, (int32_t *)&status); + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "failed to get defrag %s", key); + goto out; + } + ret = reassign_defrag_status (dict, key, &status); + if (ret) + goto out; + i++; + } while (i <= count); + + ret = 0; +out: + return ret; + +} + /* This function is used to modify the op_ctx dict before sending it back * to cli. This is useful in situations like changing the peer uuids to * hostnames etc. @@ -3041,12 +3607,38 @@ glusterd_op_modify_op_ctx (glusterd_op_t op, void *ctx) goto out; } + /* add 'node-name-%d' into op_ctx with value uuid_str. + this will be used to convert to hostname later */ + { + char key[1024]; + char *uuid_str = NULL; + int i; + + for (i = 1; i <= count; i++) { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "node-uuid-%d", i); + ret = dict_get_str (op_ctx, key, &uuid_str); + if (!ret) { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), + "node-name-%d", i); + ret = dict_set_str (op_ctx, key, + uuid_str); + } + } + } + ret = glusterd_op_volume_dict_uuid_to_hostname (op_ctx, - "node-uuid-%d", + "node-name-%d", 1, (count + 1)); if (ret) gf_log (this->name, GF_LOG_WARNING, "Failed uuid to hostname conversion"); + + ret = glusterd_op_check_peer_defrag_status (op_ctx, count); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to reset defrag status for fix-layout"); break; default: @@ -3199,17 +3791,19 @@ out: if (dict) dict_unref (dict); if (ret) { - glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL); + glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, + &event->txn_id, NULL); opinfo.op_ret = ret; } if (!opinfo.pending_count) { if (op == GD_OP_REPLACE_BRICK) { - ret = glusterd_op_start_rb_timer (op_dict); + ret = glusterd_op_start_rb_timer (op_dict, + &event->txn_id); } else { glusterd_op_modify_op_ctx (op, NULL); - ret = glusterd_op_sm_inject_all_acc (); + ret = glusterd_op_sm_inject_all_acc (&event->txn_id); } goto err; } @@ -3234,7 +3828,8 @@ glusterd_op_ac_rcvd_stage_op_acc (glusterd_op_sm_event_t *event, void *ctx) if (opinfo.pending_count > 0) goto out; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_ACC, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_ACC, + &event->txn_id, NULL); out: gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -3255,7 +3850,8 @@ glusterd_op_ac_stage_op_failed (glusterd_op_sm_event_t *event, void *ctx) if (opinfo.pending_count > 0) goto out; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, + &event->txn_id, NULL); out: gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -3276,7 +3872,8 @@ glusterd_op_ac_commit_op_failed (glusterd_op_sm_event_t *event, void *ctx) if (opinfo.pending_count > 0) goto out; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, + &event->txn_id, NULL); out: gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -3319,7 +3916,8 @@ glusterd_op_ac_brick_op_failed (glusterd_op_sm_event_t *event, void *ctx) if (opinfo.brick_pending_count > 0) goto out; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, ev_ctx->commit_ctx); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, + &event->txn_id, ev_ctx->commit_ctx); out: if (ev_ctx->rsp_dict) @@ -3361,7 +3959,7 @@ glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx) goto out; } - ret = glusterd_op_start_rb_timer (op_ctx); + ret = glusterd_op_start_rb_timer (op_ctx, &event->txn_id); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Couldn't start " "replace-brick operation."); @@ -3376,10 +3974,12 @@ glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx) out: if (commit_ack_inject) { if (ret) - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, + &event->txn_id, NULL); else if (!opinfo.pending_count) { glusterd_op_modify_op_ctx (op, NULL); - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, + &event->txn_id, NULL); } /*else do nothing*/ } @@ -3400,7 +4000,8 @@ glusterd_op_ac_rcvd_unlock_acc (glusterd_op_sm_event_t *event, void *ctx) if (opinfo.pending_count > 0) goto out; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, + &event->txn_id, NULL); gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -3434,7 +4035,7 @@ glusterd_op_reset_ctx () } int32_t -glusterd_op_txn_complete () +glusterd_op_txn_complete (uuid_t *txn_id) { int32_t ret = -1; glusterd_conf_t *priv = NULL; @@ -3444,6 +4045,7 @@ glusterd_op_txn_complete () rpcsvc_request_t *req = NULL; void *ctx = NULL; char *op_errstr = NULL; + char *volname = NULL; xlator_t *this = NULL; this = THIS; @@ -3466,14 +4068,29 @@ glusterd_op_txn_complete () glusterd_op_reset_ctx (); glusterd_op_clear_errstr (); - ret = glusterd_unlock (MY_UUID); - - /* unlock cant/shouldnt fail here!! */ - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "Unable to clear local lock, ret: %d", ret); + /* Based on the op-version, we release the cluster or mgmt_v3 lock */ + if (priv->op_version < 3) { + ret = glusterd_unlock (MY_UUID); + /* unlock cant/shouldnt fail here!! */ + if (ret) + gf_log (this->name, GF_LOG_CRITICAL, + "Unable to clear local lock, ret: %d", ret); + else + gf_log (this->name, GF_LOG_DEBUG, "Cleared local lock"); } else { - gf_log (this->name, GF_LOG_DEBUG, "Cleared local lock"); + ret = dict_get_str (ctx, "volname", &volname); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to acquire volname"); + + if (volname) { + ret = glusterd_mgmt_v3_unlock (volname, MY_UUID, + "vol"); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to release lock for %s", + volname); + } } ret = glusterd_op_send_cli_response (op, op_ret, @@ -3492,6 +4109,13 @@ glusterd_op_txn_complete () if (priv->pending_quorum_action) glusterd_do_quorum_action (); + + /* Clearing the transaction opinfo */ + ret = glusterd_clear_txn_opinfo (txn_id); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to clear transaction's opinfo"); + gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); return ret; } @@ -3503,7 +4127,7 @@ glusterd_op_ac_unlocked_all (glusterd_op_sm_event_t *event, void *ctx) GF_ASSERT (event); - ret = glusterd_op_txn_complete (); + ret = glusterd_op_txn_complete (&event->txn_id); gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -3520,6 +4144,7 @@ glusterd_op_ac_stage_op (glusterd_op_sm_event_t *event, void *ctx) char *op_errstr = NULL; dict_t *dict = NULL; xlator_t *this = NULL; + uuid_t *txn_id = NULL; this = THIS; GF_ASSERT (this); @@ -3545,6 +4170,19 @@ glusterd_op_ac_stage_op (glusterd_op_sm_event_t *event, void *ctx) status); } + txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); + + if (txn_id) + uuid_copy (*txn_id, event->txn_id); + else + gf_log (this->name, GF_LOG_ERROR, "Out of Memory"); + + ret = dict_set_bin (rsp_dict, "transaction_id", + txn_id, sizeof(*txn_id)); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to set transaction id."); + ret = glusterd_op_stage_send_resp (req_ctx->req, req_ctx->op, status, op_errstr, rsp_dict); @@ -3609,6 +4247,7 @@ glusterd_op_ac_commit_op (glusterd_op_sm_event_t *event, void *ctx) dict_t *dict = NULL; dict_t *rsp_dict = NULL; xlator_t *this = NULL; + uuid_t *txn_id = NULL; this = THIS; GF_ASSERT (this); @@ -3638,10 +4277,22 @@ glusterd_op_ac_commit_op (glusterd_op_sm_event_t *event, void *ctx) "'Volume %s' failed: %d", gd_op_list[req_ctx->op], status); + txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); + + if (txn_id) + uuid_copy (*txn_id, event->txn_id); + else + gf_log (this->name, GF_LOG_ERROR, "Out of Memory"); + + ret = dict_set_bin (rsp_dict, "transaction_id", + txn_id, sizeof(*txn_id)); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to set transaction id."); + ret = glusterd_op_commit_send_resp (req_ctx->req, req_ctx->op, status, op_errstr, rsp_dict); - glusterd_op_fini_ctx (); if (op_errstr && (strcmp (op_errstr, ""))) GF_FREE (op_errstr); @@ -3670,7 +4321,6 @@ glusterd_op_ac_send_commit_failed (glusterd_op_sm_event_t *event, void *ctx) opinfo.op_ret, opinfo.op_errstr, op_ctx); - glusterd_op_fini_ctx (); if (opinfo.op_errstr && (strcmp (opinfo.op_errstr, ""))) { GF_FREE (opinfo.op_errstr); opinfo.op_errstr = NULL; @@ -3755,6 +4405,10 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr, ret = glusterd_op_stage_sync_volume (dict, op_errstr); break; + case GD_OP_GSYNC_CREATE: + ret = glusterd_op_stage_gsync_create (dict, op_errstr); + break; + case GD_OP_GSYNC_SET: ret = glusterd_op_stage_gsync_set (dict, op_errstr); break; @@ -3788,18 +4442,21 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr, ret = glusterd_op_stage_clearlocks_volume (dict, op_errstr); break; -#ifdef HAVE_BD_XLATOR - case GD_OP_BD_OP: - ret = glusterd_op_stage_bd (dict, op_errstr); + + case GD_OP_COPY_FILE: + ret = glusterd_op_stage_copy_file (dict, op_errstr); break; -#endif + + case GD_OP_SYS_EXEC: + ret = glusterd_op_stage_sys_exec (dict, op_errstr); + break; + default: gf_log (this->name, GF_LOG_ERROR, "Unknown op %s", gd_op_list[op]); } - gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); - + gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret); return ret; } @@ -3857,6 +4514,11 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr, ret = glusterd_op_sync_volume (dict, op_errstr, rsp_dict); break; + case GD_OP_GSYNC_CREATE: + ret = glusterd_op_gsync_create (dict, op_errstr, + rsp_dict); + break; + case GD_OP_GSYNC_SET: ret = glusterd_op_gsync_set (dict, op_errstr, rsp_dict); break; @@ -3891,11 +4553,15 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr, ret = glusterd_op_clearlocks_volume (dict, op_errstr, rsp_dict); break; -#ifdef HAVE_BD_XLATOR - case GD_OP_BD_OP: - ret = 0; + + case GD_OP_COPY_FILE: + ret = glusterd_op_copy_file (dict, op_errstr); break; -#endif + + case GD_OP_SYS_EXEC: + ret = glusterd_op_sys_exec (dict, op_errstr, rsp_dict); + break; + default: gf_log (this->name, GF_LOG_ERROR, "Unknown op %s", gd_op_list[op]); @@ -3904,11 +4570,12 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr, if (ret == 0) glusterd_op_commit_hook (op, dict, GD_COMMIT_HOOK_POST); - gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); + gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); return ret; } + static int glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr, struct list_head *selected) @@ -4225,24 +4892,95 @@ out: } int +get_replica_index_for_per_replica_cmd (glusterd_volinfo_t *volinfo, + dict_t *dict) { + int ret = 0; + char *hostname = NULL; + char *path = NULL; + int index = 0; + glusterd_brickinfo_t *brickinfo = NULL; + int cmd_replica_index = -1; + int replica_count = -1; + + + if (!dict) { + ret = -1; + goto out; + } + + ret = dict_get_str (dict, "per-replica-cmd-hostname", &hostname); + if (ret) + goto out; + ret = dict_get_str (dict, "per-replica-cmd-path", &path); + if (ret) + goto out; + + replica_count = volinfo->replica_count; + + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + if (uuid_is_null (brickinfo->uuid)) + (void)glusterd_resolve_brick (brickinfo); + if (!strcmp (brickinfo->path, path) && + !strcmp (brickinfo->hostname, hostname)) { + cmd_replica_index = index/(replica_count); + goto out; + } + index++; + } + + +out: + if (ret) + cmd_replica_index = -1; + + return cmd_replica_index; +} + +int _select_rxlators_with_local_bricks (xlator_t *this, glusterd_volinfo_t *volinfo, - dict_t *dict) + dict_t *dict, cli_cmd_type type) { glusterd_brickinfo_t *brickinfo = NULL; glusterd_conf_t *priv = NULL; - int index = 1; + int index = 0; int rxlator_count = 0; int replica_count = 0; gf_boolean_t add = _gf_false; + int ret = 0; + int cmd_replica_index = -1; priv = this->private; replica_count = volinfo->replica_count; + + if (type == PER_REPLICA) { + + cmd_replica_index = get_replica_index_for_per_replica_cmd + (volinfo, dict); + if (cmd_replica_index == -1) { + ret = -1; + goto err; + } + } + + index = 1; + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { if (uuid_is_null (brickinfo->uuid)) (void)glusterd_resolve_brick (brickinfo); - if (!uuid_compare (MY_UUID, brickinfo->uuid)) - add = _gf_true; + switch (type) { + case ALL_REPLICA: + if (!uuid_compare (MY_UUID, brickinfo->uuid)) + add = _gf_true; + break; + case PER_REPLICA: + if (!uuid_compare (MY_UUID, brickinfo->uuid) && + ((index-1)/replica_count == cmd_replica_index)) + + add = _gf_true; + break; + } + if (index % replica_count == 0) { if (add) { _add_rxlator_to_dict (dict, volinfo->volname, @@ -4255,6 +4993,10 @@ _select_rxlators_with_local_bricks (xlator_t *this, glusterd_volinfo_t *volinfo, index++; } +err: + if (ret) + rxlator_count = -1; + return rxlator_count; } @@ -4295,9 +5037,10 @@ _select_rxlators_for_full_self_heal (xlator_t *this, return rxlator_count; } -#ifdef HAVE_BD_XLATOR + static int -glusterd_bricks_select_bd (dict_t *dict, char **op_errstr) +glusterd_bricks_select_snap (dict_t *dict, char **op_errstr, + struct list_head *selected) { int ret = -1; glusterd_conf_t *priv = NULL; @@ -4315,31 +5058,31 @@ glusterd_bricks_select_bd (dict_t *dict, char **op_errstr) ret = dict_get_str (dict, "volname", &volname); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Unable to get volname"); + gf_log (this->name, GF_LOG_ERROR, "Unable to get" + " volname"); goto out; } ret = glusterd_volinfo_find (volname, &volinfo); if (ret) goto out; - pending_node = GF_CALLOC (1, sizeof (*pending_node), - gf_gld_mt_pending_node_t); - if (!pending_node) { - ret = -1; - goto out; - } - list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { brick_index++; if (uuid_compare (brickinfo->uuid, MY_UUID) || !glusterd_is_brick_started (brickinfo)) { continue; } + pending_node = GF_CALLOC (1, sizeof (*pending_node), + gf_gld_mt_pending_node_t); + if (!pending_node) { + ret = -1; + goto out; + } pending_node->node = brickinfo; pending_node->type = GD_NODE_BRICK; pending_node->index = brick_index; list_add_tail (&pending_node->list, - &opinfo.pending_bricks); + selected); pending_node = NULL; } @@ -4349,10 +5092,10 @@ out: gf_log (THIS->name, GF_LOG_DEBUG, "Returning ret %d", ret); return ret; } -#endif static int -fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo) +fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo, + cli_cmd_type type, dict_t *req_dict) { glusterd_brickinfo_t *brickinfo = NULL; char msg[1024] = {0,}; @@ -4361,10 +5104,22 @@ fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo) int index = 0; int ret = 0; xlator_t *this = NULL; + int cmd_replica_index = -1; this = THIS; snprintf (msg, sizeof (msg), "self-heal-daemon is not running on"); + if (type == PER_REPLICA) { + cmd_replica_index = get_replica_index_for_per_replica_cmd + (volinfo, req_dict); + if (cmd_replica_index == -1) { + gf_log (THIS->name, GF_LOG_ERROR, "Could not find the " + "replica index for per replica type command"); + ret = -1; + goto out; + } + } + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { if (uuid_is_null (brickinfo->uuid)) (void)glusterd_resolve_brick (brickinfo); @@ -4373,6 +5128,14 @@ fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo) index++; continue; } + + if (type == PER_REPLICA) { + if (cmd_replica_index != (index/volinfo->replica_count)) { + index++; + continue; + } + + } snprintf (key, sizeof (key), "%d-status",index); snprintf (value, sizeof (value), "%s %s",msg, uuid_utoa(MY_UUID)); @@ -4441,21 +5204,49 @@ glusterd_bricks_select_heal_volume (dict_t *dict, char **op_errstr, goto out; } + switch (heal_op) { + case GF_AFR_OP_INDEX_SUMMARY: + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + if (!glusterd_is_nodesvc_online ("glustershd")) { + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, "Received " + "empty ctx."); + goto out; + } - if (!glusterd_is_nodesvc_online ("glustershd") && - (heal_op == GF_AFR_OP_INDEX_SUMMARY)) { - - if (!rsp_dict) { - gf_log (this->name, GF_LOG_ERROR, "Received empty " - "ctx."); + ret = fill_shd_status_for_local_bricks (rsp_dict, + volinfo, + ALL_REPLICA, + dict); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Unable to " + "fill the shd status for the local " + "bricks"); goto out; + } + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + if (!glusterd_is_nodesvc_online ("glustershd")) { + if (!rsp_dict) { + gf_log (this->name, GF_LOG_ERROR, "Received " + "empty ctx."); + goto out; + } + ret = fill_shd_status_for_local_bricks (rsp_dict, + volinfo, + PER_REPLICA, + dict); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Unable to " + "fill the shd status for the local" + " bricks."); + goto out; - ret = fill_shd_status_for_local_bricks (rsp_dict, volinfo); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Unable to fill the shd" - " status for the local bricks"); - goto out; + } + break; + default: + break; } @@ -4465,14 +5256,28 @@ glusterd_bricks_select_heal_volume (dict_t *dict, char **op_errstr, volinfo, dict); break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + rxlator_count = _select_rxlators_with_local_bricks (this, + volinfo, + dict, + PER_REPLICA); + break; default: rxlator_count = _select_rxlators_with_local_bricks (this, volinfo, - dict); + dict, + ALL_REPLICA); break; } if (!rxlator_count) goto out; + if (rxlator_count == -1){ + gf_log (this->name, GF_LOG_ERROR, "Could not determine the" + "translator count"); + ret = -1; + goto out; + } + ret = dict_set_int32 (dict, "count", rxlator_count); if (ret) goto out; @@ -4734,7 +5539,8 @@ glusterd_op_ac_send_brick_op (glusterd_op_sm_event_t *event, void *ctx) if (!opinfo.pending_count && !opinfo.brick_pending_count) { glusterd_clear_pending_nodes (&opinfo.pending_bricks); - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, req_ctx); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, + &event->txn_id, req_ctx); } out: @@ -4788,7 +5594,8 @@ glusterd_op_ac_rcvd_brick_op_acc (glusterd_op_sm_event_t *event, void *ctx) if (opinfo.brick_pending_count > 0) goto out; - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, ev_ctx->commit_ctx); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, &event->txn_id, + ev_ctx->commit_ctx); out: if (ev_ctx->rsp_dict) @@ -4840,11 +5647,9 @@ glusterd_op_bricks_select (glusterd_op_t op, dict_t *dict, char **op_errstr, ret = glusterd_bricks_select_rebalance_volume (dict, op_errstr, selected); break; -#ifdef HAVE_BD_XLATOR - case GD_OP_BD_OP: - ret = glusterd_bricks_select_bd (dict, op_errstr); + case GD_OP_SNAP: + ret = glusterd_bricks_select_snap (dict, op_errstr, selected); break; -#endif default: break; } @@ -5166,7 +5971,7 @@ glusterd_op_sm_new_event (glusterd_op_sm_event_type_t event_type, int glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type, - void *ctx) + uuid_t *txn_id, void *ctx) { int32_t ret = -1; glusterd_op_sm_event_t *event = NULL; @@ -5181,6 +5986,9 @@ glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type, event->ctx = ctx; + if (txn_id) + uuid_copy (event->txn_id, *txn_id); + gf_log (THIS->name, GF_LOG_DEBUG, "Enqueue event: '%s'", glusterd_op_sm_event_name_get (event->event)); list_add_tail (&event->list, &gd_op_sm_queue); @@ -5241,6 +6049,7 @@ glusterd_op_sm () glusterd_op_sm_t *state = NULL; glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE; xlator_t *this = NULL; + glusterd_op_info_t txn_op_info; this = THIS; GF_ASSERT (this); @@ -5261,6 +6070,20 @@ glusterd_op_sm () "type: '%s'", glusterd_op_sm_event_name_get(event_type)); + gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", + uuid_utoa (event->txn_id)); + + ret = glusterd_get_txn_opinfo (&event->txn_id, + &txn_op_info); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to get transaction's opinfo"); + glusterd_destroy_op_event_ctx (event); + GF_FREE (event); + continue; + } else + opinfo = txn_op_info; + state = glusterd_op_state_table[opinfo.state.state]; GF_ASSERT (state); @@ -5291,8 +6114,27 @@ glusterd_op_sm () return ret; } + if ((state[event_type].next_state == + GD_OP_STATE_DEFAULT) && + (event_type == GD_OP_EVENT_UNLOCK)) { + /* Clearing the transaction opinfo */ + ret = glusterd_clear_txn_opinfo(&event->txn_id); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to clear " + "transaction's opinfo"); + } else { + ret = glusterd_set_txn_opinfo (&event->txn_id, + &opinfo); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to set " + "transaction's opinfo"); + } + glusterd_destroy_op_event_ctx (event); GF_FREE (event); + } } @@ -5346,52 +6188,6 @@ glusterd_op_clear_op (glusterd_op_t op) } int32_t -glusterd_op_init_ctx (glusterd_op_t op) -{ - int ret = 0; - dict_t *dict = NULL; - xlator_t *this = NULL; - - this = THIS; - GF_ASSERT (this); - GF_ASSERT (GD_OP_NONE < op && op < GD_OP_MAX); - - if (_gf_false == glusterd_need_brick_op (op)) { - gf_log (this->name, GF_LOG_DEBUG, "Received op: %s, returning", - gd_op_list[op]); - goto out; - } - dict = dict_new (); - if (dict == NULL) { - ret = -1; - goto out; - } - ret = glusterd_op_set_ctx (dict); - if (ret) - goto out; -out: - gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); - return ret; -} - - - -int32_t -glusterd_op_fini_ctx () -{ - dict_t *dict = NULL; - - dict = glusterd_op_get_ctx (); - if (dict) - dict_unref (dict); - - glusterd_op_reset_ctx (); - return 0; -} - - - -int32_t glusterd_op_free_ctx (glusterd_op_t op, void *ctx) { @@ -5417,9 +6213,6 @@ glusterd_op_free_ctx (glusterd_op_t op, void *ctx) case GD_OP_STATEDUMP_VOLUME: case GD_OP_CLEARLOCKS_VOLUME: case GD_OP_DEFRAG_BRICK_VOLUME: -#ifdef HAVE_BD_XLATOR - case GD_OP_BD_OP: -#endif dict_unref (ctx); break; default: @@ -5448,4 +6241,3 @@ glusterd_op_sm_init () pthread_mutex_init (&gd_op_sm_lock, NULL); return 0; } - diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.h b/xlators/mgmt/glusterd/src/glusterd-op-sm.h index df8b8c141..cf57b78e0 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.h +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.h @@ -15,8 +15,8 @@ #include "config.h" #endif -#ifndef GSYNC_CONF -#define GSYNC_CONF GEOREP"/gsyncd.conf" +#ifndef GSYNC_CONF_TEMPLATE +#define GSYNC_CONF_TEMPLATE GEOREP"/gsyncd_template.conf" #endif #include <pthread.h> @@ -32,6 +32,8 @@ #include "protocol-common.h" #define GD_VOLUME_NAME_MAX 256 +#define GD_OP_PROTECTED (0x02) +#define GD_OP_UNPROTECTED (0x04) typedef enum glusterd_op_sm_state_ { GD_OP_STATE_DEFAULT = 0, @@ -75,6 +77,7 @@ struct glusterd_op_sm_event_ { struct list_head list; void *ctx; glusterd_op_sm_event_type_t event; + uuid_t txn_id; }; typedef struct glusterd_op_sm_event_ glusterd_op_sm_event_t; @@ -117,6 +120,7 @@ typedef struct glusterd_op_log_filename_ctx_ glusterd_op_log_filename_ctx_t; struct glusterd_op_lock_ctx_ { uuid_t uuid; + dict_t *dict; rpcsvc_request_t *req; }; @@ -162,12 +166,18 @@ typedef struct glusterd_gsync_status_temp { glusterd_volinfo_t *volinfo; char *node; }glusterd_gsync_status_temp_t; + +typedef enum cli_cmd_type_ { + PER_REPLICA, + ALL_REPLICA, + } cli_cmd_type; + int glusterd_op_sm_new_event (glusterd_op_sm_event_type_t event_type, glusterd_op_sm_event_t **new_event); int glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type, - void *ctx); + uuid_t *txn_id, void *ctx); int glusterd_op_sm_init (); @@ -251,10 +261,7 @@ glusterd_op_init_commit_rsp_dict (glusterd_op_t op); void glusterd_op_modify_op_ctx (glusterd_op_t op, void *op_ctx); -int32_t -glusterd_op_init_ctx (glusterd_op_t op); -int32_t -glusterd_op_fini_ctx (); + int32_t glusterd_volume_stats_read_perf (char *brick_path, int32_t blk_size, int32_t blk_count, double *throughput, double *time); @@ -270,7 +277,7 @@ glusterd_are_all_volumes_stopped (); int glusterd_stop_bricks (glusterd_volinfo_t *volinfo); int -gsync_status (char *master, char *slave, int *status); +gsync_status (char *master, char *slave, char *conf_path, int *status); int glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag); @@ -278,4 +285,15 @@ glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag); int glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict, dict_t *op_ctx); +int +glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg); + +int32_t +glusterd_get_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo); + +int32_t +glusterd_set_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo); + +int32_t +glusterd_clear_txn_opinfo (uuid_t *txn_id); #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c index aab6744a4..a153ca1a9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-pmap.c +++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -52,8 +52,8 @@ pmap_port_isfree (int port) } -struct pmap_registry * -pmap_registry_new (void) +static struct pmap_registry * +pmap_registry_new (xlator_t *this) { struct pmap_registry *pmap = NULL; int i = 0; @@ -69,8 +69,8 @@ pmap_registry_new (void) pmap->ports[i].type = GF_PMAP_PORT_FOREIGN; } - pmap->base_port = GF_IANA_PRIV_PORTS_START; - pmap->last_alloc = GF_IANA_PRIV_PORTS_START; + pmap->base_port = pmap->last_alloc = + ((glusterd_conf_t *)(this->private))->base_port; return pmap; } @@ -86,7 +86,7 @@ pmap_registry_get (xlator_t *this) pmap = priv->pmap; if (!pmap) { - pmap = pmap_registry_new (); + pmap = pmap_registry_new (this); if (!pmap) return NULL; priv->pmap = pmap; @@ -474,17 +474,12 @@ gluster_pmap_signout (rpcsvc_request_t *req) } rpcsvc_actor_t gluster_pmap_actors[] = { - [GF_PMAP_NULL] = {"NULL", GF_PMAP_NULL, NULL, NULL, 0}, - [GF_PMAP_PORTBYBRICK] = {"PORTBYBRICK", GF_PMAP_PORTBYBRICK, - gluster_pmap_portbybrick, NULL, 0}, - [GF_PMAP_BRICKBYPORT] = {"BRICKBYPORT", GF_PMAP_BRICKBYPORT, - gluster_pmap_brickbyport, NULL, 0}, - [GF_PMAP_SIGNIN] = {"SIGNIN", GF_PMAP_SIGNIN, - gluster_pmap_signin, NULL, 0}, - [GF_PMAP_SIGNOUT] = {"SIGNOUT", GF_PMAP_SIGNOUT, - gluster_pmap_signout, NULL, 0}, - [GF_PMAP_SIGNUP] = {"SIGNUP", GF_PMAP_SIGNUP, - gluster_pmap_signup, NULL, 0}, + [GF_PMAP_NULL] = {"NULL", GF_PMAP_NULL, NULL, NULL, 0, DRC_NA}, + [GF_PMAP_PORTBYBRICK] = {"PORTBYBRICK", GF_PMAP_PORTBYBRICK, gluster_pmap_portbybrick, NULL, 0, DRC_NA}, + [GF_PMAP_BRICKBYPORT] = {"BRICKBYPORT", GF_PMAP_BRICKBYPORT, gluster_pmap_brickbyport, NULL, 0, DRC_NA}, + [GF_PMAP_SIGNIN] = {"SIGNIN", GF_PMAP_SIGNIN, gluster_pmap_signin, NULL, 0, DRC_NA}, + [GF_PMAP_SIGNOUT] = {"SIGNOUT", GF_PMAP_SIGNOUT, gluster_pmap_signout, NULL, 0, DRC_NA}, + [GF_PMAP_SIGNUP] = {"SIGNUP", GF_PMAP_SIGNUP, gluster_pmap_signup, NULL, 0, DRC_NA}, }; diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index ebb9e7dd4..b7b974c68 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -42,12 +42,27 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe); int glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr, - size_t len) + size_t len, glusterd_op_t op) { - int ret = -1; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + + /* Check only if operation is not remove-brick */ + if ((GD_OP_REMOVE_BRICK != op) && + !gd_is_remove_brick_committed (volinfo)) { + gf_log (this->name, GF_LOG_DEBUG, "A remove-brick task on " + "volume %s is not yet committed", volinfo->volname); + snprintf (op_errstr, len, "A remove-brick task on volume %s is" + " not yet committed. Either commit or stop the " + "remove-brick task.", volinfo->volname); + goto out; + } if (glusterd_is_defrag_on (volinfo)) { - gf_log ("glusterd", GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_DEBUG, "rebalance on volume %s already started", volinfo->volname); snprintf (op_errstr, len, "Rebalance on %s is already started", @@ -57,7 +72,7 @@ glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr, if (glusterd_is_rb_started (volinfo) || glusterd_is_rb_paused (volinfo)) { - gf_log ("glusterd", GF_LOG_DEBUG, + gf_log (this->name, GF_LOG_DEBUG, "Rebalance failed as replace brick is in progress on volume %s", volinfo->volname); snprintf (op_errstr, len, "Rebalance failed as replace brick is in progress on " @@ -66,7 +81,7 @@ glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr, } ret = 0; out: - gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret); + gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); return ret; } @@ -131,8 +146,6 @@ __glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata, GF_DEFRAG_STATUS_STARTED) { volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_FAILED; - } else { - volinfo->rebal.defrag_cmd = 0; } } @@ -190,7 +203,7 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, GF_ASSERT (volinfo); GF_ASSERT (op_errstr); - ret = glusterd_defrag_start_validate (volinfo, op_errstr, len); + ret = glusterd_defrag_start_validate (volinfo, op_errstr, len, op); if (ret) goto out; if (!volinfo->rebal.defrag) @@ -204,6 +217,7 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, defrag->cmd = cmd; + volinfo->rebal.defrag_cmd = cmd; volinfo->rebal.op = op; LOCK_INIT (&defrag->lock); @@ -518,7 +532,7 @@ glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr) case GF_DEFRAG_CMD_START: case GF_DEFRAG_CMD_START_LAYOUT_FIX: case GF_DEFRAG_CMD_START_FORCE: - if (is_origin_glusterd ()) { + if (is_origin_glusterd (dict)) { op_ctx = glusterd_op_get_ctx (); if (!op_ctx) { ret = -1; @@ -544,8 +558,9 @@ glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr) ret = 0; } } - ret = glusterd_defrag_start_validate (volinfo, - msg, sizeof (msg)); + ret = glusterd_defrag_start_validate (volinfo, msg, + sizeof (msg), + GD_OP_REBALANCE); if (ret) { gf_log (this->name, GF_LOG_DEBUG, "start validate failed"); @@ -648,15 +663,18 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) ret = 0; } else { uuid_parse (task_id_str, volinfo->rebal.rebalance_id) ; + volinfo->rebal.op = GD_OP_REBALANCE; } ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg), cmd, NULL, GD_OP_REBALANCE); break; case GF_DEFRAG_CMD_STOP: - /* Clear task-id only on explicitly stopping the - * rebalance process. + /* Clear task-id only on explicitly stopping rebalance. + * Also clear the stored operation, so it doesn't cause trouble + * with future rebalance/remove-brick starts */ uuid_clear (volinfo->rebal.rebalance_id); + volinfo->rebal.op = GD_OP_NONE; /* Fall back to the old volume file in case of decommission*/ list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks, diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c index c1506033b..54b830870 100644 --- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c +++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c @@ -31,6 +31,7 @@ DEFAULT_VAR_RUN_DIRECTORY"/%s-"RB_CLIENT_MOUNTPOINT, \ volinfo->volname); +extern uuid_t global_txn_id; int glusterd_get_replace_op_str (gf1_cli_replace_op op, char *op_str) @@ -268,13 +269,6 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr, goto out; } - if (volinfo->backend == GD_VOL_BK_BD) { - snprintf (msg, sizeof (msg), "replace brick not supported " - "for Block backend volume"); - *op_errstr = gf_strdup (msg); - goto out; - } - if (GLUSTERD_STATUS_STARTED != volinfo->status) { ret = -1; snprintf (msg, sizeof (msg), "volume: %s is not started", @@ -332,7 +326,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr, ret = -1; goto out; } - if (is_origin_glusterd ()) { + if (is_origin_glusterd (dict)) { if (!ctx) { ret = -1; gf_log (this->name, GF_LOG_ERROR, @@ -441,7 +435,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr, } } - if (glusterd_is_local_addr (src_brickinfo->hostname)) { + if (gf_is_local_addr (src_brickinfo->hostname)) { gf_log (this->name, GF_LOG_DEBUG, "I AM THE SOURCE HOST"); if (src_brickinfo->port && rsp_dict) { @@ -518,7 +512,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr, } if (!glusterd_is_rb_ongoing (volinfo) && - glusterd_is_local_addr (host)) { + gf_is_local_addr (host)) { ret = glusterd_validate_and_create_brickpath (dst_brickinfo, volinfo->volume_id, op_errstr, is_force); @@ -526,7 +520,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr, goto out; } - if (!glusterd_is_local_addr (host)) { + if (!gf_is_local_addr (host)) { ret = glusterd_friend_find (NULL, host, &peerinfo); if (ret) { snprintf (msg, sizeof (msg), "%s, is not a friend", @@ -553,7 +547,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr, } if (replace_op == GF_REPLACE_OP_START && - glusterd_is_local_addr (volinfo->rep_brick.dst_brick->hostname)) { + gf_is_local_addr (volinfo->rep_brick.dst_brick->hostname)) { port = pmap_registry_alloc (THIS); if (!port) { gf_log (THIS->name, GF_LOG_CRITICAL, @@ -694,7 +688,7 @@ rb_src_brick_restart (glusterd_volinfo_t *volinfo, sleep (2); ret = glusterd_volume_start_glusterfs (volinfo, src_brickinfo, - _gf_false); + _gf_false); if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to start " "glusterfs, ret: %d", ret); @@ -1414,7 +1408,7 @@ rb_update_srcbrick_port (glusterd_brickinfo_t *src_brickinfo, dict_t *rsp_dict, if (src_port) src_brickinfo->port = src_port; - if (glusterd_is_local_addr (src_brickinfo->hostname)) { + if (gf_is_local_addr (src_brickinfo->hostname)) { gf_log ("", GF_LOG_INFO, "adding src-brick port no"); @@ -1468,7 +1462,7 @@ rb_update_dstbrick_port (glusterd_brickinfo_t *dst_brickinfo, dict_t *rsp_dict, dst_brickinfo->port = dst_port; - if (glusterd_is_local_addr (dst_brickinfo->hostname)) { + if (gf_is_local_addr (dst_brickinfo->hostname)) { gf_log ("", GF_LOG_INFO, "adding dst-brick port no"); @@ -1638,7 +1632,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict) /* Set task-id, if available, in op_ctx dict for operations * other than start */ - if (is_origin_glusterd ()) { + if (is_origin_glusterd (dict)) { ctx = glusterd_op_get_ctx(); if (!ctx) { gf_log (this->name, GF_LOG_ERROR, "Failed to " @@ -1676,7 +1670,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict) uuid_parse (task_id_str, volinfo->rep_brick.rb_id); } - if (glusterd_is_local_addr (dst_brickinfo->hostname)) { + if (gf_is_local_addr (dst_brickinfo->hostname)) { gf_log (this->name, GF_LOG_INFO, "I AM THE DESTINATION HOST"); if (!glusterd_is_rb_paused (volinfo)) { @@ -1696,7 +1690,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict) } - if (glusterd_is_local_addr (src_brickinfo->hostname)) { + if (gf_is_local_addr (src_brickinfo->hostname)) { ret = rb_src_brick_restart (volinfo, src_brickinfo, 1); if (ret) { @@ -1706,7 +1700,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict) } } - if (glusterd_is_local_addr (dst_brickinfo->hostname)) { + if (gf_is_local_addr (dst_brickinfo->hostname)) { gf_log (this->name, GF_LOG_INFO, "adding dst-brick port no"); @@ -1737,7 +1731,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict) /* fall through */ case GF_REPLACE_OP_COMMIT_FORCE: { - if (glusterd_is_local_addr (dst_brickinfo->hostname)) { + if (gf_is_local_addr (dst_brickinfo->hostname)) { gf_log (this->name, GF_LOG_DEBUG, "I AM THE DESTINATION HOST"); ret = rb_kill_destination_brick (volinfo, @@ -1818,7 +1812,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict) } } - if (glusterd_is_local_addr (src_brickinfo->hostname)) { + if (gf_is_local_addr (src_brickinfo->hostname)) { ret = rb_src_brick_restart (volinfo, src_brickinfo, 0); if (ret) { @@ -1829,7 +1823,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict) } } - if (glusterd_is_local_addr (dst_brickinfo->hostname)) { + if (gf_is_local_addr (dst_brickinfo->hostname)) { gf_log (this->name, GF_LOG_INFO, "I AM THE DESTINATION HOST"); ret = rb_kill_destination_brick (volinfo, dst_brickinfo); @@ -1901,6 +1895,7 @@ glusterd_do_replace_brick (void *data) glusterd_brickinfo_t *src_brickinfo = NULL; glusterd_brickinfo_t *dst_brickinfo = NULL; glusterd_conf_t *priv = NULL; + uuid_t *txn_id = &global_txn_id; int ret = 0; @@ -1920,6 +1915,10 @@ glusterd_do_replace_brick (void *data) gf_log ("", GF_LOG_DEBUG, "Replace brick operation detected"); + ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id); + + gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id)); + ret = dict_get_int32 (dict, "operation", &op); if (ret) { gf_log ("", GF_LOG_DEBUG, @@ -2015,9 +2014,11 @@ glusterd_do_replace_brick (void *data) out: if (ret) - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, + txn_id, NULL); else - ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, NULL); + ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, + txn_id, NULL); glusterd_op_sm (); } diff --git a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c index 01fb77775..d5200a4ae 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c @@ -33,6 +33,7 @@ extern glusterd_op_info_t opinfo; +extern uuid_t global_txn_id; int32_t glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret, @@ -84,6 +85,7 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret, } break; } + case GD_OP_GSYNC_CREATE: case GD_OP_GSYNC_SET: { if (ctx) { @@ -141,11 +143,26 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret, case GD_OP_LIST_VOLUME: case GD_OP_CLEARLOCKS_VOLUME: case GD_OP_HEAL_VOLUME: - case GD_OP_BD_OP: + case GD_OP_SNAP: { /*nothing specific to be done*/ break; } + case GD_OP_COPY_FILE: + { + if (ctx) + ret = dict_get_str (ctx, "errstr", &errstr); + break; + } + case GD_OP_SYS_EXEC: + { + if (ctx) { + ret = dict_get_str (ctx, "errstr", &errstr); + ret = dict_set_str (ctx, "glusterd_workdir", + conf->workdir); + } + break; + } } rsp.op_ret = op_ret; @@ -233,7 +250,8 @@ __glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov, glusterd_xfer_cli_probe_resp (ctx->req, rsp.op_ret, rsp.op_errno, rsp.op_errstr, - ctx->hostname, ctx->port); + ctx->hostname, ctx->port, + ctx->dict); } glusterd_destroy_probe_ctx (ctx); @@ -261,7 +279,8 @@ __glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov, glusterd_xfer_cli_probe_resp (ctx->req, rsp.op_ret, rsp.op_errno, rsp.op_errstr, - ctx->hostname, ctx->port); + ctx->hostname, ctx->port, + ctx->dict); } glusterd_destroy_probe_ctx (ctx); @@ -390,7 +409,7 @@ out: if (ctx->req)//reverse probe doesn't have req ret = glusterd_xfer_cli_probe_resp (ctx->req, op_ret, op_errno, NULL, ctx->hostname, - ctx->port); + ctx->port, ctx->dict); if (!ret) { glusterd_friend_sm (); glusterd_op_sm (); @@ -488,7 +507,7 @@ inject: respond: ret = glusterd_xfer_cli_deprobe_resp (ctx->req, op_ret, op_errno, NULL, - ctx->hostname); + ctx->hostname, ctx->dict); if (!ret && move_sm_now) { glusterd_friend_sm (); glusterd_op_sm (); @@ -563,6 +582,7 @@ __glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov, glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE; glusterd_peerinfo_t *peerinfo = NULL; xlator_t *this = NULL; + uuid_t *txn_id = &global_txn_id; this = THIS; GF_ASSERT (this); @@ -607,7 +627,7 @@ out: event_type = GD_OP_EVENT_RCVD_ACC; } - ret = glusterd_op_sm_inject_event (event_type, NULL); + ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL); if (!ret) { glusterd_friend_sm (); @@ -626,6 +646,164 @@ glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov, __glusterd_cluster_lock_cbk); } +static int32_t +glusterd_mgmt_v3_lock_peers_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + gd1_mgmt_v3_lock_rsp rsp = {{0},}; + int ret = -1; + int32_t op_ret = -1; + glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE; + glusterd_peerinfo_t *peerinfo = NULL; + xlator_t *this = NULL; + uuid_t *txn_id = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; + rsp.op_errno = EINVAL; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to decode mgmt_v3 lock " + "response received from peer"); + rsp.op_ret = -1; + rsp.op_errno = EINVAL; + goto out; + } + +out: + op_ret = rsp.op_ret; + + txn_id = &rsp.txn_id; + + gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG, + "Received mgmt_v3 lock %s from uuid: %s", + (op_ret) ? "RJT" : "ACC", uuid_utoa (rsp.uuid)); + + ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, + "mgmt_v3 lock response received " + "from unknown peer: %s", uuid_utoa (rsp.uuid)); + } + + if (op_ret) { + event_type = GD_OP_EVENT_RCVD_RJT; + opinfo.op_ret = op_ret; + opinfo.op_errstr = gf_strdup ("Another transaction could be in " + "progress. Please try again after" + " sometime."); + } else { + event_type = GD_OP_EVENT_RCVD_ACC; + } + + ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL); + + if (!ret) { + glusterd_friend_sm (); + glusterd_op_sm (); + } + + GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe)); + return ret; +} + +int32_t +glusterd_mgmt_v3_lock_peers_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + glusterd_mgmt_v3_lock_peers_cbk_fn); +} + +static int32_t +glusterd_mgmt_v3_unlock_peers_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + gd1_mgmt_v3_unlock_rsp rsp = {{0},}; + int ret = -1; + int32_t op_ret = -1; + glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE; + glusterd_peerinfo_t *peerinfo = NULL; + xlator_t *this = NULL; + uuid_t *txn_id = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; + rsp.op_errno = EINVAL; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to decode mgmt_v3 unlock " + "response received from peer"); + rsp.op_ret = -1; + rsp.op_errno = EINVAL; + goto out; + } + +out: + op_ret = rsp.op_ret; + + txn_id = &rsp.txn_id; + + gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG, + "Received mgmt_v3 unlock %s from uuid: %s", + (op_ret) ? "RJT" : "ACC", + uuid_utoa (rsp.uuid)); + + ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo); + + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, + "mgmt_v3 unlock response received " + "from unknown peer: %s", uuid_utoa (rsp.uuid)); + } + + if (op_ret) { + event_type = GD_OP_EVENT_RCVD_RJT; + opinfo.op_ret = op_ret; + opinfo.op_errstr = gf_strdup ("Another transaction could be in " + "progress. Please try again after" + " sometime."); + } else { + event_type = GD_OP_EVENT_RCVD_ACC; + } + + ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL); + + if (!ret) { + glusterd_friend_sm (); + glusterd_op_sm (); + } + + GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe)); + return ret; +} + +int32_t +glusterd_mgmt_v3_unlock_peers_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + glusterd_mgmt_v3_unlock_peers_cbk_fn); +} + int32_t __glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe) @@ -636,6 +814,7 @@ __glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov, glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE; glusterd_peerinfo_t *peerinfo = NULL; xlator_t *this = NULL; + uuid_t *txn_id = &global_txn_id; this = THIS; GF_ASSERT (this); @@ -677,7 +856,7 @@ out: event_type = GD_OP_EVENT_RCVD_ACC; } - ret = glusterd_op_sm_inject_event (event_type, NULL); + ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL); if (!ret) { glusterd_friend_sm (); @@ -709,6 +888,7 @@ __glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov, char err_str[2048] = {0}; char *peer_str = NULL; xlator_t *this = NULL; + uuid_t *txn_id = NULL; this = THIS; GF_ASSERT (this); @@ -761,6 +941,10 @@ out: "Received stage %s from uuid: %s", (op_ret) ? "RJT" : "ACC", uuid_utoa (rsp.uuid)); + ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id); + + gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id)); + ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo); if (ret) { @@ -796,7 +980,7 @@ out: break; } - ret = glusterd_op_sm_inject_event (event_type, NULL); + ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL); if (!ret) { glusterd_friend_sm (); @@ -836,6 +1020,8 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov, char err_str[2048] = {0}; char *peer_str = NULL; xlator_t *this = NULL; + uuid_t *txn_id = NULL; + this = THIS; GF_ASSERT (this); @@ -889,6 +1075,10 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov, "Received commit %s from uuid: %s", (op_ret)?"RJT":"ACC", uuid_utoa (rsp.uuid)); + ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id); + + gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id)); + ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo); if (ret) { @@ -968,7 +1158,7 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov, } out: - ret = glusterd_op_sm_inject_event (event_type, NULL); + ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL); if (!ret) { glusterd_friend_sm (); @@ -1205,6 +1395,134 @@ out: } int32_t +glusterd_mgmt_v3_lock_peers (call_frame_t *frame, xlator_t *this, + void *data) +{ + gd1_mgmt_v3_lock_req req = {{0},}; + int ret = -1; + glusterd_peerinfo_t *peerinfo = NULL; + glusterd_conf_t *priv = NULL; + call_frame_t *dummy_frame = NULL; + dict_t *dict = NULL; + uuid_t *txn_id = NULL; + + if (!this) + goto out; + + dict = data; + + priv = this->private; + GF_ASSERT (priv); + + ret = dict_get_ptr (dict, "peerinfo", VOID (&peerinfo)); + if (ret) + goto out; + + //peerinfo should not be in payload + dict_del (dict, "peerinfo"); + + glusterd_get_uuid (&req.uuid); + + ret = dict_allocate_and_serialize (dict, &req.dict.dict_val, + &req.dict.dict_len); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to serialize dict " + "to request buffer"); + goto out; + } + + /* Sending valid transaction ID to peers */ + ret = dict_get_bin (dict, "transaction_id", + (void **)&txn_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get transaction id."); + goto out; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "Transaction_id = %s", uuid_utoa (*txn_id)); + uuid_copy (req.txn_id, *txn_id); + } + + dummy_frame = create_frame (this, this->ctx->pool); + if (!dummy_frame) + goto out; + + ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame, + peerinfo->mgmt_v3, + GLUSTERD_MGMT_V3_LOCK, NULL, + this, glusterd_mgmt_v3_lock_peers_cbk, + (xdrproc_t)xdr_gd1_mgmt_v3_lock_req); +out: + gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int32_t +glusterd_mgmt_v3_unlock_peers (call_frame_t *frame, xlator_t *this, + void *data) +{ + gd1_mgmt_v3_unlock_req req = {{0},}; + int ret = -1; + glusterd_peerinfo_t *peerinfo = NULL; + glusterd_conf_t *priv = NULL; + call_frame_t *dummy_frame = NULL; + dict_t *dict = NULL; + uuid_t *txn_id = NULL; + + if (!this) + goto out; + + dict = data; + + priv = this->private; + GF_ASSERT (priv); + + ret = dict_get_ptr (dict, "peerinfo", VOID (&peerinfo)); + if (ret) + goto out; + + //peerinfo should not be in payload + dict_del (dict, "peerinfo"); + + glusterd_get_uuid (&req.uuid); + + ret = dict_allocate_and_serialize (dict, &req.dict.dict_val, + &req.dict.dict_len); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to serialize dict " + "to request buffer"); + goto out; + } + + /* Sending valid transaction ID to peers */ + ret = dict_get_bin (dict, "transaction_id", + (void **)&txn_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get transaction id."); + goto out; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "Transaction_id = %s", uuid_utoa (*txn_id)); + uuid_copy (req.txn_id, *txn_id); + } + + dummy_frame = create_frame (this, this->ctx->pool); + if (!dummy_frame) + goto out; + + ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame, + peerinfo->mgmt_v3, + GLUSTERD_MGMT_V3_UNLOCK, NULL, + this, glusterd_mgmt_v3_unlock_peers_cbk, + (xdrproc_t)xdr_gd1_mgmt_v3_unlock_req); +out: + gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int32_t glusterd_cluster_unlock (call_frame_t *frame, xlator_t *this, void *data) { @@ -1367,6 +1685,7 @@ __glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov, glusterd_req_ctx_t *req_ctx = NULL; glusterd_pending_node_t *node = NULL; xlator_t *this = NULL; + uuid_t *txn_id = &global_txn_id; this = THIS; GF_ASSERT (this); @@ -1429,6 +1748,11 @@ __glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov, } } out: + + ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id); + + gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id)); + ev_ctx = GF_CALLOC (1, sizeof (*ev_ctx), gf_gld_mt_brick_rsp_ctx_t); GF_ASSERT (ev_ctx); if (op_ret) { @@ -1441,7 +1765,7 @@ out: ev_ctx->pending_node = frame->cookie; ev_ctx->rsp_dict = dict; ev_ctx->commit_ctx = frame->local; - ret = glusterd_op_sm_inject_event (event_type, ev_ctx); + ret = glusterd_op_sm_inject_event (event_type, txn_id, ev_ctx); if (!ret) { glusterd_friend_sm (); glusterd_op_sm (); @@ -1466,6 +1790,7 @@ int32_t glusterd_brick_op (call_frame_t *frame, xlator_t *this, void *data) { + gd1_mgmt_brick_op_req *req = NULL; int ret = 0; glusterd_conf_t *priv = NULL; @@ -1476,6 +1801,7 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this, glusterd_req_ctx_t *req_ctx = NULL; struct rpc_clnt *rpc = NULL; dict_t *op_ctx = NULL; + uuid_t *txn_id = &global_txn_id; if (!this) { ret = -1; @@ -1498,6 +1824,11 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this, goto out; } + + ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id); + + gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id)); + list_for_each_entry (pending_node, &opinfo.pending_bricks, list) { dummy_frame = create_frame (this, this->ctx->pool); if (!dummy_frame) @@ -1575,7 +1906,8 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this, out: if (ret) { - glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, data); + glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, + txn_id, data); opinfo.op_ret = ret; } gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -1603,6 +1935,12 @@ struct rpc_clnt_procedure gd_mgmt_actors[GLUSTERD_MGMT_MAXVALUE] = { [GLUSTERD_MGMT_COMMIT_OP] = {"COMMIT_OP", glusterd_commit_op}, }; +struct rpc_clnt_procedure gd_mgmt_v3_actors[GLUSTERD_MGMT_V3_MAXVALUE] = { + [GLUSTERD_MGMT_V3_NULL] = {"NULL", NULL }, + [GLUSTERD_MGMT_V3_LOCK] = {"MGMT_V3_LOCK", glusterd_mgmt_v3_lock_peers}, + [GLUSTERD_MGMT_V3_UNLOCK] = {"MGMT_V3_UNLOCK", glusterd_mgmt_v3_unlock_peers}, +}; + struct rpc_clnt_program gd_mgmt_prog = { .progname = "glusterd mgmt", .prognum = GD_MGMT_PROGRAM, @@ -1627,4 +1965,10 @@ struct rpc_clnt_program gd_peer_prog = { .numproc = GLUSTERD_FRIEND_MAXVALUE, }; - +struct rpc_clnt_program gd_mgmt_v3_prog = { + .progname = "glusterd mgmt v3", + .prognum = GD_MGMT_V3_PROGRAM, + .progver = GD_MGMT_V3_VERSION, + .proctable = gd_mgmt_v3_actors, + .numproc = GLUSTERD_MGMT_V3_MAXVALUE, +}; diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.c b/xlators/mgmt/glusterd/src/glusterd-sm.c index 86ce86a3e..c671edf68 100644 --- a/xlators/mgmt/glusterd/src/glusterd-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-sm.c @@ -400,7 +400,8 @@ glusterd_ac_send_friend_remove_req (glusterd_friend_sm_event_t *event, if (ctx) ret = glusterd_xfer_cli_deprobe_resp (ctx->req, ret, 0, NULL, - ctx->hostname); + ctx->hostname, + ctx->dict); glusterd_friend_sm (); glusterd_op_sm (); diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.h b/xlators/mgmt/glusterd/src/glusterd-sm.h index 99eabfa86..b9bedbe69 100644 --- a/xlators/mgmt/glusterd/src/glusterd-sm.h +++ b/xlators/mgmt/glusterd/src/glusterd-sm.h @@ -27,14 +27,7 @@ #include "byte-order.h" //#include "glusterd.h" #include "rpcsvc.h" - -struct glusterd_store_handle_ { - char *path; - int fd; - FILE *read; -}; - -typedef struct glusterd_store_handle_ glusterd_store_handle_t; +#include "store.h" typedef enum gd_quorum_contribution_ { QUORUM_NONE, @@ -104,8 +97,9 @@ struct glusterd_peerinfo_ { struct rpc_clnt *rpc; rpc_clnt_prog_t *mgmt; rpc_clnt_prog_t *peer; + rpc_clnt_prog_t *mgmt_v3; int connected; - glusterd_store_handle_t *shandle; + gf_store_handle_t *shandle; glusterd_sm_tr_log_t sm_log; gf_boolean_t quorum_action; gd_quorum_contrib_t quorum_contrib; @@ -123,6 +117,7 @@ typedef enum glusterd_ev_gen_mode_ { typedef struct glusterd_peer_ctx_args_ { rpcsvc_request_t *req; glusterd_ev_gen_mode_t mode; + dict_t *dict; } glusterd_peerctx_args_t; typedef struct glusterd_peer_ctx_ { @@ -189,6 +184,7 @@ typedef struct glusterd_probe_ctx_ { char *hostname; rpcsvc_request_t *req; int port; + dict_t *dict; } glusterd_probe_ctx_t; int glusterd_friend_sm_new_event (glusterd_friend_sm_event_type_t event_type, diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c new file mode 100644 index 000000000..9b811cd05 --- /dev/null +++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c @@ -0,0 +1,5590 @@ +/* + Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <inttypes.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/resource.h> +#include <sys/statvfs.h> +#include <sys/mount.h> + +#include "globals.h" +#include "compat.h" +#include "protocol-common.h" +#include "xlator.h" +#include "logging.h" +#include "timer.h" +#include "glusterd-mem-types.h" +#include "glusterd.h" +#include "glusterd-sm.h" +#include "glusterd-op-sm.h" +#include "glusterd-utils.h" +#include "glusterd-store.h" +#include "run.h" +#include "glusterd-volgen.h" +#include "glusterd-mgmt.h" +#include "glusterd-syncop.h" + +#include "syscall.h" +#include "cli1-xdr.h" +#include "xdr-generic.h" + +#ifdef GF_LINUX_HOST_OS +#include <mntent.h> +#endif + +char snap_mount_folder[PATH_MAX]; + +static int32_t +glusterd_find_missed_snap (dict_t *rsp_dict, glusterd_volinfo_t *vol, + char *snap_uuid, struct list_head *peers, + int32_t op); + +/* This function will restore a snapshot volumes + * + * @param dict dictionary containing snapshot restore request + * @param op_errstr In case of any failure error message will be returned + * in this variable + * @return Negative value on Failure and 0 in success + */ +int +glusterd_snapshot_restore (dict_t *dict, char **op_errstr, dict_t *rsp_dict) +{ + int ret = -1; + char *volname = NULL; + char *snapname = NULL; + xlator_t *this = NULL; + glusterd_volinfo_t *snap_volinfo = NULL; + glusterd_volinfo_t *volinfo = NULL; + glusterd_snap_t *snap = NULL; + glusterd_conf_t *priv = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + + priv = this->private; + GF_ASSERT (priv); + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "snap name"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (NULL == snap) { + ret = gf_asprintf (op_errstr, "Snap (%s) not found", + snapname); + if (ret < 0) { + goto out; + } + gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr); + ret = -1; + goto out; + } + + /* TODO : As of now there is only volume in snapshot. + * Change this when multiple volume snapshot is introduced + */ + snap_volinfo = list_entry (snap->volumes.next, glusterd_volinfo_t, + vol_list); + + ret = glusterd_volinfo_find (snap_volinfo->parent_volname, &volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not get volinfo of " + "%s", snap_volinfo->parent_volname); + goto out; + } + + if (is_origin_glusterd (dict) == _gf_true) { + /* From origin glusterd check if * + * any peers with snap bricks is down */ + ret = glusterd_find_missed_snap (rsp_dict, snap_volinfo, + snap_volinfo->volname, + &priv->peers, + GF_SNAP_OPTION_TYPE_RESTORE); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to find missed snap restores"); + goto out; + } + } + + ret = gd_restore_snap_volume (rsp_dict, volinfo, snap_volinfo); + if (ret) { + /* No need to update op_errstr because it is assumed + * that the called function will do that in case of + * failure. + */ + gf_log (this->name, GF_LOG_ERROR, "Failed to restore " + "snap for %s volume", volname); + goto out; + } + + ret = 0; + + /* TODO: Need to check if we need to delete the snap after the + * operation is successful or not. Also need to persist the state + * of restore operation in the store. + */ +out: + return ret; +} + +/* This function is called before actual restore is taken place. This function + * will validate whether the snapshot volumes are ready to be restored or not. + * + * @param dict dictionary containing snapshot restore request + * @param op_errstr In case of any failure error message will be returned + * in this variable + * @param rsp_dict response dictionary + * @return Negative value on Failure and 0 in success + */ +int +glusterd_snapshot_restore_prevalidate (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + int ret = -1; + int32_t i = 0; + int32_t volcount = 0; + gf_boolean_t snap_restored = _gf_false; + char key[PATH_MAX] = {0, }; + char *volname = NULL; + char *snapname = NULL; + glusterd_volinfo_t *volinfo = NULL; + glusterd_snap_t *snap = NULL; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "snap name"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (NULL == snap) { + ret = gf_asprintf (op_errstr, "Snap (%s) not found", + snapname); + if (ret < 0) { + goto out; + } + gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr); + ret = -1; + goto out; + } + + snap_restored = snap->snap_restored; + + if (snap_restored) { + ret = gf_asprintf (op_errstr, "Snap (%s) is already " + "restored", snapname); + if (ret < 0) { + goto out; + } + gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr); + ret = -1; + goto out; + } + + ret = dict_set_str (rsp_dict, "snapname", snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "snap name"); + goto out; + } + + ret = dict_get_int32 (dict, "volcount", &volcount); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get volume count"); + goto out; + } + + /* Snapshot restore will only work if all the volumes, + that are part of the snapshot, are stopped. */ + for (i = 1; i <= volcount; ++i) { + snprintf (key, sizeof (key), "volname%d", i); + ret = dict_get_str (dict, key, &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get volume name"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + ret = gf_asprintf (op_errstr, "Volume (%s) not found", + volname); + if (ret < 0) { + goto out; + } + gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr); + ret = -1; + goto out; + } + + if (glusterd_is_volume_started (volinfo)) { + ret = gf_asprintf (op_errstr, "Volume (%s) has been " + "started. Volume needs to be stopped before restoring " + "a snapshot.", volname); + if (ret < 0) { + goto out; + } + gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr); + ret = -1; + goto out; + } + } + + ret = 0; +out: + return ret; +} + +int +snap_max_hard_limits_validate (dict_t *dict, char *volname, + uint64_t value, char **op_errstr) +{ + char err_str[PATH_MAX] = ""; + glusterd_conf_t *conf = NULL; + glusterd_volinfo_t *volinfo = NULL; + int ret = -1; + uint64_t max_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + + conf = this->private; + + GF_ASSERT (conf); + + if (volname) { + ret = glusterd_volinfo_find (volname, &volinfo); + if (!ret) { + if (volinfo->is_snap_volume) { + ret = -1; + snprintf (err_str, PATH_MAX, + "%s is a snap volume. Configuring " + "snap-max-hard-limit for a snap " + "volume is prohibited.", volname); + goto out; + } + } + } + + if (value) { + /* Max limit for the system is GLUSTERD_SNAPS_MAX_HARD_LIMIT + * but max limit for a volume is conf->snap_max_hard_limit. + */ + if (volname) { + max_limit = conf->snap_max_hard_limit; + } else { + max_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT; + } + } + + if ((value < 0) || (value > max_limit)) { + ret = -1; + snprintf (err_str, PATH_MAX, "Invalid snap-max-hard-limit" + "%"PRIu64 ". Expected range 0 - %"PRIu64, + value, max_limit); + goto out; + } + + ret = 0; +out: + if (ret) { + *op_errstr = gf_strdup (err_str); + gf_log (this->name, GF_LOG_ERROR, "%s", err_str); + } + return ret; +} + +int +glusterd_snapshot_config_prevalidate (dict_t *dict, char **op_errstr) +{ + char *volname = NULL; + glusterd_volinfo_t *volinfo = NULL; + xlator_t *this = NULL; + int ret = -1; + int config_command = 0; + char err_str[PATH_MAX] = {0,}; + glusterd_conf_t *conf = NULL; + uint64_t value = 0; + uint64_t hard_limit = 0; + uint64_t soft_limit = 0; + gf_loglevel_t loglevel = GF_LOG_ERROR; + uint64_t max_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + + conf = this->private; + + GF_ASSERT (conf); + + ret = dict_get_int32 (dict, "config-command", &config_command); + if (ret) { + snprintf (err_str, sizeof (err_str), + "failed to get config-command type"); + goto out; + } + + ret = dict_get_uint64 (dict, "snap-max-hard-limit", &hard_limit); + + ret = dict_get_uint64 (dict, "snap-max-soft-limit", &soft_limit); + + ret = dict_get_str (dict, "volname", &volname); + + if (volname) { + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + snprintf (err_str, sizeof (err_str), + "Volume %s does not exist.", volname); + goto out; + } + } + + switch (config_command) { + case GF_SNAP_CONFIG_TYPE_SET: + if (hard_limit) { + /* Validations for snap-max-hard-limits */ + ret = snap_max_hard_limits_validate (dict, volname, + hard_limit, op_errstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "snap-max-hard-limit validation " + "failed."); + goto out; + } + } + + if (soft_limit) { + max_limit = GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT; + if ((soft_limit < 0) || (soft_limit > max_limit)) { + ret = -1; + snprintf (err_str, PATH_MAX, "Invalid " + "snap-max-soft-limit ""%" + PRIu64 ". Expected range 0 - %"PRIu64, + value, max_limit); + goto out; + } + break; + } + default: + break; + } + + ret = 0; +out: + + if (ret && err_str[0] != '\0') { + gf_log (this->name, loglevel, "%s", err_str); + *op_errstr = gf_strdup (err_str); + } + + return ret; +} + +int +glusterd_snap_create_pre_val_use_rsp_dict (dict_t *dst, dict_t *src) +{ + char *snap_brick_dir = NULL; + char *snap_device = NULL; + char *tmpstr = NULL; + char key[PATH_MAX] = ""; + char snapbrckcnt[PATH_MAX] = ""; + char snapbrckord[PATH_MAX] = ""; + int ret = -1; + int64_t i = -1; + int64_t j = -1; + int64_t volume_count = 0; + int64_t brick_count = 0; + int64_t brick_order = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (dst); + GF_ASSERT (src); + + ret = dict_get_int64 (src, "volcount", &volume_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "get the volume count"); + goto out; + } + + for (i = 0; i < volume_count; i++) { + memset (snapbrckcnt, '\0', sizeof(snapbrckcnt)); + ret = snprintf (snapbrckcnt, sizeof(snapbrckcnt) - 1, + "vol%ld_brickcount", i+1); + ret = dict_get_int64 (src, snapbrckcnt, &brick_count); + if (ret) { + gf_log (this->name, GF_LOG_TRACE, + "No bricks for this volume in this dict"); + continue; + } + + for (j = 0; j < brick_count; j++) { + /* Fetching data from source dict */ + snprintf (key, sizeof(key) - 1, + "vol%ld.brickdir%ld", i+1, j); + + ret = dict_get_ptr (src, key, + (void **)&snap_brick_dir); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to fetch %s", key); + continue; + } + + snprintf (key, sizeof(key) - 1, + "vol%ld.brick_snapdevice%ld", i+1, j); + + ret = dict_get_ptr (src, key, + (void **)&snap_device); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to fetch snap_device"); + goto out; + } + + snprintf (snapbrckord, sizeof(snapbrckord) - 1, + "vol%ld.brick%ld.order", i+1, j); + + ret = dict_get_int64 (src, snapbrckord, &brick_order); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get brick order"); + goto out; + } + + /* Adding the data in the dst dict */ + snprintf (key, sizeof(key) - 1, + "vol%ld.brickdir%ld", i+1, brick_order); + + tmpstr = gf_strdup (snap_brick_dir); + if (!tmpstr) { + gf_log (this->name, GF_LOG_ERROR, + "Out Of Memory"); + ret = -1; + goto out; + } + ret = dict_set_dynstr (dst, key, tmpstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set %s", key); + GF_FREE (tmpstr); + goto out; + } + + snprintf (key, sizeof(key) - 1, + "vol%ld.brick_snapdevice%ld", + i+1, brick_order); + + tmpstr = gf_strdup (snap_device); + if (!tmpstr) { + ret = -1; + goto out; + } + ret = dict_set_dynstr (dst, key, tmpstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set %s", key); + GF_FREE (tmpstr); + goto out; + } + + } + } + + ret = 0; +out: + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_snap_pre_validate_use_rsp_dict (dict_t *dst, dict_t *src) +{ + int ret = -1; + int32_t snap_command = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + + if (!dst || !src) { + gf_log (this->name, GF_LOG_ERROR, "Source or Destination " + "dict is empty."); + goto out; + } + + ret = dict_get_int32 (dst, "type", &snap_command); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "unable to get the type of " + "the snapshot command"); + goto out; + } + + switch (snap_command) { + case GF_SNAP_OPTION_TYPE_CREATE: + ret = glusterd_snap_create_pre_val_use_rsp_dict (dst, src); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to use " + "rsp dict"); + goto out; + } + break; + default: + break; + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int +glusterd_snapshot_create_prevalidate (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + char *volname = NULL; + char *snapname = NULL; + char *device = NULL; + char *tmpstr = NULL; + char *brick_dir = NULL; + char snap_brick_dir[PATH_MAX] = ""; + char *mnt_pt = NULL; + char key[PATH_MAX] = ""; + char snap_mount[PATH_MAX] = ""; + char snap_volname[64] = ""; + char err_str[PATH_MAX] = ""; + int ret = -1; + int64_t i = 0; + int64_t volcount = 0; + int64_t brick_count = 0; + int64_t brick_order = 0; + glusterd_brickinfo_t *brickinfo = NULL; + glusterd_volinfo_t *volinfo = NULL; + xlator_t *this = NULL; + uuid_t *snap_volid = NULL; + gf_loglevel_t loglevel = GF_LOG_ERROR; + glusterd_conf_t *conf = NULL; + int64_t effective_max_limit = 0; + + this = THIS; + GF_ASSERT (op_errstr); + conf = this->private; + GF_ASSERT (conf); + + ret = dict_get_int64 (dict, "volcount", &volcount); + if (ret) { + snprintf (err_str, sizeof (err_str), "Failed to " + "get the volume count"); + goto out; + } + if (volcount <= 0) { + snprintf (err_str, sizeof (err_str), "Invalid volume count %ld " + "supplied", volcount); + ret = -1; + goto out; + } + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + snprintf (err_str, sizeof (err_str), "Failed to get snapname"); + goto out; + } + + if (glusterd_find_snap_by_name (snapname)) { + ret = -1; + snprintf (err_str, sizeof (err_str), "Snap %s already exists", + snapname); + goto out; + } + + for (i = 1; i <= volcount; i++) { + snprintf (key, sizeof (key), "volname%ld", i); + ret = dict_get_str (dict, key, &volname); + if (ret) { + snprintf (err_str, sizeof (err_str), + "failed to get volume name"); + goto out; + } + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + snprintf (err_str, sizeof (err_str), + "Volume (%s) does not exist ", volname); + goto out; + } + + ret = -1; + if (!glusterd_is_volume_started (volinfo)) { + snprintf (err_str, sizeof (err_str), "volume %s is " + "not started", volinfo->volname); + loglevel = GF_LOG_WARNING; + goto out; + } + if (glusterd_is_defrag_on (volinfo)) { + snprintf (err_str, sizeof (err_str), + "rebalance process is running for the " + "volume %s", volname); + loglevel = GF_LOG_WARNING; + goto out; + } + /* TODO: Also check whether geo replication is running */ + + if (volinfo->is_snap_volume == _gf_true) { + snprintf (err_str, sizeof (err_str), + "Volume %s is a snap volume", volname); + loglevel = GF_LOG_WARNING; + goto out; + } + + if (volinfo->snap_max_hard_limit < conf->snap_max_hard_limit) + effective_max_limit = volinfo->snap_max_hard_limit; + else + effective_max_limit = conf->snap_max_hard_limit; + + if (volinfo->snap_count >= effective_max_limit) { + snprintf (err_str, sizeof (err_str), + "The number of existing snaps has reached " + "the effective maximum limit of %"PRIu64" ," + "for the volume %s", effective_max_limit, + volname); + loglevel = GF_LOG_WARNING; + goto out; + } + + snprintf (key, sizeof(key) - 1, "vol%ld_volid", i); + ret = dict_get_bin (dict, key, (void **)&snap_volid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to fetch snap_volid"); + goto out; + } + + /* snap volume uuid is used as lvm snapshot name. + This will avoid restrictions on snapshot names + provided by user */ + GLUSTERD_GET_UUID_NOHYPHEN (snap_volname, *snap_volid); + + brick_count = 0; + brick_order = 0; + /* Adding snap bricks mount paths to the dict */ + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + if (uuid_compare (brickinfo->uuid, MY_UUID)) { + brick_order++; + continue; + } + + if (!glusterd_is_brick_started (brickinfo)) { + gf_log (this->name, GF_LOG_WARNING, + "brick %s:%s is not started", + brickinfo->hostname, + brickinfo->path); + brick_order++; + brick_count++; + continue; + } + + device = glusterd_get_brick_mount_details (brickinfo); + if (!device) { + snprintf (err_str, sizeof (err_str), + "getting device name for the brick " + "%s:%s failed", brickinfo->hostname, + brickinfo->path); + ret = -1; + goto out; + } + + device = glusterd_build_snap_device_path (device, + snap_volname); + if (!device) { + snprintf (err_str, sizeof (err_str), + "cannot copy the snapshot device " + "name (volname: %s, snapname: %s)", + volinfo->volname, snapname); + loglevel = GF_LOG_WARNING; + ret = -1; + goto out; + } + + snprintf (key, sizeof(key), + "vol%ld.brick_snapdevice%ld", i, + brick_count); + ret = dict_set_dynstr (rsp_dict, key, device); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set %s", key); + GF_FREE (device); + goto out; + } + + ret = glusterd_get_brick_root (brickinfo->path, + &mnt_pt); + if (ret) { + snprintf (err_str, sizeof (err_str), + "could not get the root of the brick path %s", + brickinfo->path); + loglevel = GF_LOG_WARNING; + goto out; + } + if (strncmp (brickinfo->path, mnt_pt, strlen(mnt_pt))) { + snprintf (err_str, sizeof (err_str), + "brick: %s brick mount: %s", + brickinfo->path, mnt_pt); + loglevel = GF_LOG_WARNING; + goto out; + } + + brick_dir = &brickinfo->path[strlen (mnt_pt)]; + brick_dir++; + + snprintf (snap_brick_dir, sizeof (snap_brick_dir), + "/%s", brick_dir); + + tmpstr = gf_strdup (snap_brick_dir); + if (!tmpstr) { + ret = -1; + goto out; + } + snprintf (key, sizeof(key), "vol%ld.brickdir%ld", i, + brick_count); + ret = dict_set_dynstr (rsp_dict, key, tmpstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set %s", snap_mount); + goto out; + } + tmpstr = NULL; + + snprintf (key, sizeof(key) - 1, "vol%ld.brick%ld.order", + i, brick_count); + ret = dict_set_int64 (rsp_dict, key, brick_order); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set %s", key); + goto out; + } + + brick_count++; + brick_order++; + } + snprintf (key, sizeof(key) - 1, "vol%ld_brickcount", i); + ret = dict_set_int64 (rsp_dict, key, brick_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set %s", + key); + goto out; + } + } + + ret = dict_set_int64 (rsp_dict, "volcount", volcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set volcount"); + goto out; + } + + ret = 0; +out: + if (ret) + GF_FREE (tmpstr); + + if (ret && err_str[0] != '\0') { + gf_log (this->name, loglevel, "%s", err_str); + *op_errstr = gf_strdup (err_str); + } + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +glusterd_snap_t* +glusterd_new_snap_object() +{ + glusterd_snap_t *snap = NULL; + + snap = GF_CALLOC (1, sizeof (*snap), gf_gld_mt_snap_t); + + if (snap) { + if (LOCK_INIT (&snap->lock)) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed initiating" + " snap lock"); + GF_FREE (snap); + return NULL; + } + + INIT_LIST_HEAD (&snap->snap_list); + INIT_LIST_HEAD (&snap->volumes); + snap->snapname[0] = 0; + snap->snap_status = GD_SNAP_STATUS_INIT; + } + + return snap; + +}; + +/* Function glusterd_list_add_snapvol adds the volinfo object (snapshot volume) + to the snapshot object list and to the parent volume list */ +int32_t +glusterd_list_add_snapvol (glusterd_volinfo_t *origin_vol, + glusterd_volinfo_t *snap_vol) +{ + int ret = -1; + glusterd_snap_t *snap = NULL; + + GF_VALIDATE_OR_GOTO ("glusterd", origin_vol, out); + GF_VALIDATE_OR_GOTO ("glusterd", snap_vol, out); + + snap = snap_vol->snapshot; + GF_ASSERT (snap); + + list_add_tail (&snap_vol->vol_list, &snap->volumes); + LOCK (&origin_vol->lock); + { + list_add_order (&snap_vol->snapvol_list, + &origin_vol->snap_volumes, + glusterd_compare_snap_vol_time); + origin_vol->snap_count++; + } + UNLOCK (&origin_vol->lock); + + gf_log (THIS->name, GF_LOG_DEBUG, "Snap %s added to the list", + snap->snapname); + ret = 0; +out: + return ret; +} + +glusterd_snap_t* +glusterd_find_snap_by_name (char *snapname) +{ + glusterd_snap_t *snap = NULL; + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + GF_ASSERT (priv); + GF_ASSERT (snapname); + + + list_for_each_entry (snap, &priv->snapshots, snap_list) { + if (!strcmp (snap->snapname, snapname)) { + gf_log (THIS->name, GF_LOG_DEBUG, "Found " + "snap %s (%s)", snap->snapname, + uuid_utoa (snap->snap_id)); + goto out; + } + } + snap = NULL; +out: + return snap; +} + +glusterd_snap_t* +glusterd_find_snap_by_id (uuid_t snap_id) +{ + glusterd_snap_t *snap = NULL; + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + GF_ASSERT (priv); + + if (uuid_is_null(snap_id)) + goto out; + + list_for_each_entry (snap, &priv->snapshots, snap_list) { + if (!uuid_compare (snap->snap_id, snap_id)) { + gf_log (THIS->name, GF_LOG_DEBUG, "Found " + "snap %s (%s)", snap->snapname, + uuid_utoa (snap->snap_id)); + goto out; + } + } + snap = NULL; +out: + return snap; +} + +int +glusterd_do_lvm_snapshot_remove (glusterd_volinfo_t *snap_vol, + glusterd_brickinfo_t *brickinfo, + const char *mount_pt, const char *snap_device) +{ + int ret = -1; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + runner_t runner = {0,}; + char msg[1024] = {0, }; + char pidfile[PATH_MAX] = {0, }; + pid_t pid = -1; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + + if (!brickinfo) { + gf_log (this->name, GF_LOG_ERROR, "brickinfo NULL"); + goto out; + } + + GF_ASSERT (snap_vol); + GF_ASSERT (mount_pt); + GF_ASSERT (snap_device); + + GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_vol, brickinfo, priv); + if (glusterd_is_service_running (pidfile, &pid)) { + ret = kill (pid, SIGKILL); + if (ret && errno != ESRCH) { + gf_log (this->name, GF_LOG_ERROR, "Unable to kill pid " + "%d reason : %s", pid, strerror(errno)); + goto out; + } + } + + runinit (&runner); + snprintf (msg, sizeof (msg), "umount the snapshot mounted path %s", + mount_pt); + runner_add_args (&runner, "umount", mount_pt, NULL); + runner_log (&runner, "", GF_LOG_DEBUG, msg); + + /* We need not do synclock_unlock => runner_run => synclock_lock here. + Because it is needed if we are running a glusterfs process in + runner_run, so that when the glusterfs process started wants to + communicate to glusterd, glusterd wont be able to respond if it + has held the big lock. So we do unlock, run glusterfs process + (thus communicate to glusterd), lock. But since this is not a + glusterfs command that is being run, unlocking and then relocking + is not needed. + */ + ret = runner_run (&runner); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "unmounting the " + "path %s (brick: %s) failed (%s)", mount_pt, + brickinfo->path, strerror (errno)); + goto out; + } + + runinit (&runner); + snprintf (msg, sizeof(msg), "remove snapshot of the brick %s:%s, " + "device: %s", brickinfo->hostname, brickinfo->path, + snap_device); + runner_add_args (&runner, "/sbin/lvremove", "-f", snap_device, NULL); + runner_log (&runner, "", GF_LOG_DEBUG, msg); + + ret = runner_run (&runner); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "removing snapshot of the " + "brick (%s:%s) of device %s failed", + brickinfo->hostname, brickinfo->path, snap_device); + goto out; + } + +out: + return ret; +} + +int32_t +glusterd_lvm_snapshot_remove (dict_t *rsp_dict, glusterd_volinfo_t *snap_vol) +{ + char *mnt_pt = NULL; + struct mntent *entry = NULL; + int32_t brick_count = -1; + int32_t ret = -1; + glusterd_brickinfo_t *brickinfo = NULL; + xlator_t *this = NULL; + FILE *mtab = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (rsp_dict); + GF_ASSERT (snap_vol); + + if (!snap_vol) { + gf_log (this->name, GF_LOG_ERROR, "snap volinfo is NULL"); + goto out; + } + + brick_count = -1; + list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) { + brick_count++; + if (uuid_compare (brickinfo->uuid, MY_UUID)) + continue; + + if (brickinfo->snap_status == -1) { + gf_log (this->name, GF_LOG_INFO, + "snapshot was pending. lvm not present " + "for brick %s:%s of the snap %s.", + brickinfo->hostname, brickinfo->path, + snap_vol->snapshot->snapname); + + /* Adding missed delete to the dict */ + ret = glusterd_add_missed_snaps_to_dict + (rsp_dict, + snap_vol->volname, + brickinfo, + brick_count + 1, + GF_SNAP_OPTION_TYPE_DELETE); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add missed snapshot info " + "for %s:%s in the rsp_dict", + brickinfo->hostname, + brickinfo->path); + goto out; + } + + continue; + } + + ret = glusterd_get_brick_root (brickinfo->path, &mnt_pt); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "getting the root " + "of the brick for volume %s (snap %s) failed ", + snap_vol->volname, snap_vol->snapshot->snapname); + goto out; + } + + entry = glusterd_get_mnt_entry_info (mnt_pt, mtab); + if (!entry) { + gf_log (this->name, GF_LOG_WARNING, "getting the mount" + " entry for the brick %s:%s of the snap %s " + "(volume: %s) failed", brickinfo->hostname, + brickinfo->path, snap_vol->snapshot->snapname, + snap_vol->volname); + ret = -1; + goto out; + } + ret = glusterd_do_lvm_snapshot_remove (snap_vol, brickinfo, + mnt_pt, + entry->mnt_fsname); + if (mtab) + endmntent (mtab); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "remove the snapshot %s (%s)", + brickinfo->path, entry->mnt_fsname); + goto out; + } + + } + + ret = 0; +out: + return ret; +} + +int32_t +glusterd_snap_volume_remove (dict_t *rsp_dict, + glusterd_volinfo_t *snap_vol, + gf_boolean_t remove_lvm, + gf_boolean_t force) +{ + int ret = -1; + int save_ret = 0; + glusterd_brickinfo_t *brickinfo = NULL; + glusterd_volinfo_t *origin_vol = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (rsp_dict); + GF_ASSERT (snap_vol); + + if (!snap_vol) { + gf_log(this->name, GF_LOG_WARNING, "snap_vol in NULL"); + ret = -1; + goto out; + } + + list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) { + if (uuid_compare (brickinfo->uuid, MY_UUID)) + continue; + + ret = glusterd_brick_stop (snap_vol, brickinfo, _gf_false); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "Failed to stop " + "brick for volume %s", snap_vol->volname); + save_ret = ret; + + /* Continue to cleaning up the snap in case of error + if force flag is enabled */ + if (!force) + goto out; + } + } + + /* Only remove the backend lvm when required */ + if (remove_lvm) { + ret = glusterd_lvm_snapshot_remove (rsp_dict, snap_vol); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "Failed to remove " + "lvm snapshot volume %s", snap_vol->volname); + save_ret = ret; + if (!force) + goto out; + } + } + + ret = glusterd_store_delete_volume (snap_vol); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "Failed to remove volume %s " + "from store", snap_vol->volname); + save_ret = ret; + if (!force) + goto out; + } + + if (!list_empty(&snap_vol->snapvol_list)) { + ret = glusterd_volinfo_find (snap_vol->parent_volname, + &origin_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "parent volinfo %s for volume %s", + snap_vol->parent_volname, snap_vol->volname); + save_ret = ret; + if (!force) + goto out; + } + origin_vol->snap_count--; + } + + ret = glusterd_volinfo_delete (snap_vol); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "Failed to remove volinfo " + "%s ", snap_vol->volname); + save_ret = ret; + if (!force) + goto out; + } + + if (save_ret) + ret = save_ret; +out: + gf_log (this->name, GF_LOG_TRACE, "returning %d", ret); + return ret; +} + +int32_t +glusterd_snapobject_delete (glusterd_snap_t *snap) +{ + if (snap == NULL) { + gf_log(THIS->name, GF_LOG_WARNING, "snap is NULL"); + return -1; + } + + list_del_init (&snap->snap_list); + list_del_init (&snap->volumes); + if (LOCK_DESTROY(&snap->lock)) + gf_log (THIS->name, GF_LOG_WARNING, "Failed destroying lock" + "of snap %s", snap->snapname); + + GF_FREE (snap->description); + GF_FREE (snap); + + return 0; +} + +int32_t +glusterd_snap_remove (dict_t *rsp_dict, + glusterd_snap_t *snap, + gf_boolean_t remove_lvm, + gf_boolean_t force) +{ + int ret = -1; + int save_ret = 0; + glusterd_volinfo_t *snap_vol = NULL; + glusterd_volinfo_t *tmp = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (rsp_dict); + GF_ASSERT (snap); + + if (!snap) { + gf_log(this->name, GF_LOG_WARNING, "snap is NULL"); + ret = -1; + goto out; + } + + list_for_each_entry_safe (snap_vol, tmp, &snap->volumes, vol_list) { + ret = glusterd_snap_volume_remove (rsp_dict, snap_vol, + remove_lvm, force); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "Failed to remove " + "volinfo %s for snap %s", snap_vol->volname, + snap->snapname); + save_ret = ret; + + /* Continue to cleaning up the snap in case of error + if force flag is enabled */ + if (!force) + goto out; + } + } + + ret = glusterd_store_delete_snap (snap); + if (ret) { + gf_log(this->name, GF_LOG_WARNING, "Failed to remove snap %s " + "from store", snap->snapname); + save_ret = ret; + if (!force) + goto out; + } + + ret = glusterd_snapobject_delete (snap); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "Failed to delete " + "snap object %s", snap->snapname); + + if (save_ret) + ret = save_ret; +out: + gf_log (THIS->name, GF_LOG_TRACE, "returning %d", ret); + return ret; +} + +static int +glusterd_snapshot_get_snapvol_detail (dict_t *dict, + glusterd_volinfo_t *snap_vol, + char *keyprefix, int detail) +{ + int ret = -1; + int snap_limit = 0; + char key[PATH_MAX] = {0,}; + char *value = NULL; + glusterd_volinfo_t *origin_vol = NULL; + glusterd_conf_t *conf = NULL; + xlator_t *this = NULL; + + this = THIS; + conf = this->private; + GF_ASSERT (conf); + + GF_ASSERT (dict); + GF_ASSERT (snap_vol); + GF_ASSERT (keyprefix); + + /* Volume Name */ + value = gf_strdup (snap_vol->volname); + if (!value) + goto out; + + snprintf (key, sizeof (key), "%s.volname", keyprefix); + ret = dict_set_dynstr (dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "volume name in dictionary: %s", key); + goto out; + } + + /* Volume ID */ + value = gf_strdup (uuid_utoa (snap_vol->volume_id)); + if (NULL == value) { + ret = -1; + goto out; + } + + snprintf (key, sizeof (key), "%s.vol-id", keyprefix); + ret = dict_set_dynstr (dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "volume id in dictionary: %s", key); + goto out; + } + value = NULL; + + /* volume status */ + snprintf (key, sizeof (key), "%s.vol-status", keyprefix); + switch (snap_vol->status) { + case GLUSTERD_STATUS_STARTED: + ret = dict_set_str (dict, key, "Started"); + break; + case GLUSTERD_STATUS_STOPPED: + ret = dict_set_str (dict, key, "Stopped"); + break; + case GD_SNAP_STATUS_NONE: + ret = dict_set_str (dict, key, "None"); + break; + default: + gf_log (this->name, GF_LOG_ERROR, "Invalid volume status"); + ret = -1; + goto out; + } + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set volume status" + " in dictionary: %s", key); + goto out; + } + + + ret = glusterd_volinfo_find (snap_vol->parent_volname, &origin_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the parent " + "volinfo for the volume %s", snap_vol->volname); + goto out; + } + + /* Snaps available */ + if (conf->snap_max_hard_limit < origin_vol->snap_max_hard_limit) { + snap_limit = conf->snap_max_hard_limit; + gf_log(this->name, GF_LOG_DEBUG, "system snap-max-hard-limit is" + " lesser than volume snap-max-hard-limit, " + "snap-max-hard-limit value is set to %d", snap_limit); + } else { + snap_limit = origin_vol->snap_max_hard_limit; + gf_log(this->name, GF_LOG_DEBUG, "volume snap-max-hard-limit is" + " lesser than system snap-max-hard-limit, " + "snap-max-hard-limit value is set to %d", snap_limit); + } + + snprintf (key, sizeof (key), "%s.snaps-available", keyprefix); + if (snap_limit > origin_vol->snap_count) + ret = dict_set_int32 (dict, key, + snap_limit - origin_vol->snap_count); + else + ret = dict_set_int32 (dict, key, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set available snaps"); + goto out; + } + + snprintf (key, sizeof (key), "%s.snapcount", keyprefix); + ret = dict_set_int32 (dict, key, origin_vol->snap_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not save snapcount"); + goto out; + } + + if (!detail) + goto out; + + /* Parent volume name */ + value = gf_strdup (snap_vol->parent_volname); + if (!value) + goto out; + + snprintf (key, sizeof (key), "%s.origin-volname", keyprefix); + ret = dict_set_dynstr (dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set parent " + "volume name in dictionary: %s", key); + goto out; + } + value = NULL; + + ret = 0; +out: + if (value) + GF_FREE (value); + + return ret; +} + +static int +glusterd_snapshot_get_snap_detail (dict_t *dict, glusterd_snap_t *snap, + char *keyprefix, glusterd_volinfo_t *volinfo) +{ + int ret = -1; + int volcount = 0; + char key[PATH_MAX] = {0,}; + char *value = NULL; + char *timestr = NULL; + struct tm *tmptr = NULL; + glusterd_volinfo_t *snap_vol = NULL; + glusterd_volinfo_t *tmp_vol = NULL; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (dict); + GF_ASSERT (snap); + GF_ASSERT (keyprefix); + + /* Snap Name */ + value = gf_strdup (snap->snapname); + if (!value) + goto out; + + snprintf (key, sizeof (key), "%s.snapname", keyprefix); + ret = dict_set_dynstr (dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "snap name in dictionary"); + goto out; + } + + /* Snap ID */ + value = gf_strdup (uuid_utoa (snap->snap_id)); + if (NULL == value) { + ret = -1; + goto out; + } + + snprintf (key, sizeof (key), "%s.snap-id", keyprefix); + ret = dict_set_dynstr (dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "snap id in dictionary"); + goto out; + } + value = NULL; + + tmptr = localtime (&(snap->time_stamp)); + if (NULL == tmptr) { + gf_log (this->name, GF_LOG_ERROR, "Failed to convert " + "time_t to *tm"); + ret = -1; + goto out; + } + + timestr = GF_CALLOC (1, PATH_MAX, gf_gld_mt_char); + if (NULL == timestr) { + ret = -1; + goto out; + } + + ret = strftime (timestr, PATH_MAX, "%Y-%m-%d %H:%M:%S", tmptr); + if (0 == ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to convert time_t " + "to string"); + ret = -1; + goto out; + } + + snprintf (key, sizeof (key), "%s.snap-time", keyprefix); + ret = dict_set_dynstr (dict, key, timestr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "snap time stamp in dictionary"); + goto out; + } + timestr = NULL; + + /* If snap description is provided then add that into dictionary */ + if (NULL != snap->description) { + value = gf_strdup (snap->description); + if (NULL == value) { + ret = -1; + goto out; + } + + snprintf (key, sizeof (key), "%s.snap-desc", keyprefix); + ret = dict_set_dynstr (dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "snap description in dictionary"); + goto out; + } + value = NULL; + } + + snprintf (key, sizeof (key), "%s.snap-status", keyprefix); + switch (snap->snap_status) { + case GD_SNAP_STATUS_INIT: + ret = dict_set_str (dict, key, "Init"); + break; + case GD_SNAP_STATUS_IN_USE: + ret = dict_set_str (dict, key, "In-use"); + break; + case GD_SNAP_STATUS_DECOMMISSION: + ret = dict_set_str (dict, key, "Decommisioned"); + break; + case GD_SNAP_STATUS_RESTORED: + ret = dict_set_str (dict, key, "Restored"); + break; + case GD_SNAP_STATUS_NONE: + ret = dict_set_str (dict, key, "None"); + break; + default: + gf_log (this->name, GF_LOG_ERROR, "Invalid snap status"); + ret = -1; + goto out; + } + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set snap status " + "in dictionary"); + goto out; + } + + if (volinfo) { + volcount = 1; + snprintf (key, sizeof (key), "%s.vol%d", keyprefix, volcount); + ret = glusterd_snapshot_get_snapvol_detail (dict, + volinfo, key, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get volume detail %s for snap %s", + snap_vol->volname, snap->snapname); + goto out; + } + goto done; + } + + list_for_each_entry_safe (snap_vol, tmp_vol, &snap->volumes, vol_list) { + volcount++; + snprintf (key, sizeof (key), "%s.vol%d", keyprefix, volcount); + ret = glusterd_snapshot_get_snapvol_detail (dict, + snap_vol, key, 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get volume detail %s for snap %s", + snap_vol->volname, snap->snapname); + goto out; + } + } + +done: + snprintf (key, sizeof (key), "%s.vol-count", keyprefix); + ret = dict_set_int32 (dict, key, volcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set %s", + key); + goto out; + } + + ret = 0; +out: + if (value) + GF_FREE (value); + + if (timestr) + GF_FREE(timestr); + + return ret; +} + +static int +glusterd_snapshot_get_all_snap_info (dict_t *dict) +{ + int ret = -1; + int snapcount = 0; + char key[PATH_MAX] = {0,}; + glusterd_snap_t *snap = NULL; + glusterd_snap_t *tmp_snap = NULL; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + priv = this->private; + GF_ASSERT (priv); + + /* General parameter validation */ + GF_ASSERT (dict); + + list_for_each_entry_safe (snap, tmp_snap, &priv->snapshots, snap_list) { + snapcount++; + snprintf (key, sizeof (key), "snap%d", snapcount); + ret = glusterd_snapshot_get_snap_detail (dict, snap, key, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "snapdetail for snap %s", snap->snapname); + goto out; + } + } + + ret = dict_set_int32 (dict, "snap-count", snapcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount"); + goto out; + } + + ret = 0; +out: + return ret; +} + +int +glusterd_snapshot_get_info_by_volume (dict_t *dict, char *volname, + char *err_str, size_t len) +{ + int ret = -1; + int snapcount = 0; + int snap_limit = 0; + char *value = NULL; + char key[PATH_MAX] = ""; + glusterd_volinfo_t *volinfo = NULL; + glusterd_volinfo_t *snap_vol = NULL; + glusterd_volinfo_t *tmp_vol = NULL; + glusterd_conf_t *conf = NULL; + xlator_t *this = NULL; + + this = THIS; + conf = this->private; + GF_ASSERT (conf); + + GF_ASSERT (dict); + GF_ASSERT (volname); + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + snprintf (err_str, len, "Volume (%s) does not exist", volname); + gf_log (this->name, GF_LOG_ERROR, "%s", err_str); + goto out; + } + + /* Snaps available */ + if (conf->snap_max_hard_limit < volinfo->snap_max_hard_limit) { + snap_limit = conf->snap_max_hard_limit; + gf_log(this->name, GF_LOG_DEBUG, "system snap-max-hard-limit is" + " lesser than volume snap-max-hard-limit, " + "snap-max-hard-limit value is set to %d", snap_limit); + } else { + snap_limit = volinfo->snap_max_hard_limit; + gf_log(this->name, GF_LOG_DEBUG, "volume snap-max-hard-limit is" + " lesser than system snap-max-hard-limit, " + "snap-max-hard-limit value is set to %d", snap_limit); + } + + if (snap_limit > volinfo->snap_count) + ret = dict_set_int32 (dict, "snaps-available", + snap_limit - volinfo->snap_count); + else + ret = dict_set_int32 (dict, "snaps-available", 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set available snaps"); + goto out; + } + + /* Origin volume name */ + value = gf_strdup (volinfo->volname); + if (!value) + goto out; + + ret = dict_set_dynstr (dict, "origin-volname", value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set parent " + "volume name in dictionary: %s", key); + goto out; + } + value = NULL; + + list_for_each_entry_safe (snap_vol, tmp_vol, &volinfo->snap_volumes, + snapvol_list) { + snapcount++; + snprintf (key, sizeof (key), "snap%d", snapcount); + ret = glusterd_snapshot_get_snap_detail (dict, + snap_vol->snapshot, + key, snap_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "snapdetail for snap %s", + snap_vol->snapshot->snapname); + goto out; + } + } + ret = dict_set_int32 (dict, "snap-count", snapcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount"); + goto out; + } + + ret = 0; +out: + if (value) + GF_FREE (value); + + return ret; +} + +/* This function will be called from RPC handler routine. + * This function is responsible for getting the requested + * snapshot info into the dictionary. + * + * @param req RPC request object. Required for sending a response back. + * @param op glusterd operation. Required for sending a response back. + * @param dict pointer to dictionary which will contain both + * request and response key-pair values. + * @return -1 on error and 0 on success + */ +int +glusterd_handle_snapshot_info (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict, char *err_str, size_t len) +{ + int ret = -1; + int8_t snap_driven = 1; + char *volname = NULL; + char *snapname = NULL; + glusterd_snap_t *snap = NULL; + xlator_t *this = NULL; + int32_t cmd = GF_SNAP_INFO_TYPE_ALL; + + this = THIS; + GF_ASSERT (this); + + GF_VALIDATE_OR_GOTO (this->name, req, out); + GF_VALIDATE_OR_GOTO (this->name, dict, out); + + + ret = dict_get_int32 (dict, "cmd", &cmd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get type " + "of snapshot info"); + goto out; + } + + switch (cmd) { + case GF_SNAP_INFO_TYPE_ALL: + { + ret = glusterd_snapshot_get_all_snap_info (dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get info of all snaps"); + goto out; + } + break; + } + + case GF_SNAP_INFO_TYPE_SNAP: + { + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get snap name"); + goto out; + } + + ret = dict_set_int32 (dict, "snap-count", 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set snapcount"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (!snap) { + snprintf (err_str, len, + "Snap (%s) does not exist", snapname); + gf_log (this->name, GF_LOG_ERROR, + "%s", err_str); + ret = -1; + goto out; + } + ret = glusterd_snapshot_get_snap_detail (dict, snap, + "snap1", NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get snap detail of snap " + "%s", snap->snapname); + goto out; + } + break; + } + + case GF_SNAP_INFO_TYPE_VOL: + { + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get volname"); + goto out; + } + ret = glusterd_snapshot_get_info_by_volume (dict, + volname, err_str, len); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get volume info of volume " + "%s", volname); + goto out; + } + snap_driven = 0; + break; + } + } + + ret = dict_set_int8 (dict, "snap-driven", snap_driven); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set snap-driven"); + goto out; + } + + /* If everything is successful then send the response back to cli. + * In case of failure the caller of this function will take care + of the response */ + ret = glusterd_op_send_cli_response (op, 0, 0, req, dict, err_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to send cli " + "response"); + goto out; + } + + ret = 0; + +out: + return ret; +} + +/* This function sets all the snapshot names in the dictionary */ +int +glusterd_snapshot_get_all_snapnames (dict_t *dict) +{ + int ret = -1; + int snapcount = 0; + char *snapname = NULL; + char key[PATH_MAX] = {0,}; + glusterd_snap_t *snap = NULL; + glusterd_snap_t *tmp_snap = NULL; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (dict); + + list_for_each_entry_safe (snap, tmp_snap, &priv->snapshots, snap_list) { + snapcount++; + snapname = gf_strdup (snap->snapname); + if (!snapname) { + gf_log (this->name, GF_LOG_ERROR, "strdup failed"); + ret = -1; + goto out; + } + snprintf (key, sizeof (key), "snapname%d", snapcount); + ret = dict_set_dynstr (dict, key, snapname); + if (ret) { + GF_FREE (snapname); + gf_log (this->name, GF_LOG_ERROR, "Failed to set %s", + key); + goto out; + } + } + + ret = dict_set_int32 (dict, "snap-count", snapcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount"); + goto out; + } + + ret = 0; +out: + + return ret; +} + +/* This function sets all the snapshot names + under a given volume in the dictionary */ +int +glusterd_snapshot_get_vol_snapnames (dict_t *dict, glusterd_volinfo_t *volinfo) +{ + int ret = -1; + int snapcount = 0; + char *snapname = NULL; + char key[PATH_MAX] = {0,}; + glusterd_volinfo_t *snap_vol = NULL; + glusterd_volinfo_t *tmp_vol = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (dict); + GF_ASSERT (volinfo); + + list_for_each_entry_safe (snap_vol, tmp_vol, + &volinfo->snap_volumes, snapvol_list) { + snapcount++; + snapname = gf_strdup (snap_vol->snapshot->snapname); + if (!snapname) { + gf_log (this->name, GF_LOG_ERROR, + "strdup failed"); + ret = -1; + goto out; + } + snprintf (key, sizeof (key), "snapname%d", snapcount); + ret = dict_set_dynstr (dict, key, snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "set %s", key); + GF_FREE (snapname); + goto out; + } + } + + ret = dict_set_int32 (dict, "snap-count", snapcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount"); + goto out; + } + + ret = 0; +out: + + return ret; +} + +int +glusterd_handle_snapshot_list (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict, char *err_str, size_t len) +{ + int ret = -1; + char *volname = NULL; + glusterd_volinfo_t *volinfo = NULL; + xlator_t *this = NULL; + + this = THIS; + + GF_VALIDATE_OR_GOTO (this->name, req, out); + GF_VALIDATE_OR_GOTO (this->name, dict, out); + + /* Ignore error for getting volname as it is optional */ + ret = dict_get_str (dict, "volname", &volname); + + if (NULL == volname) { + ret = glusterd_snapshot_get_all_snapnames (dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get snapshot list"); + goto out; + } + } else { + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + snprintf (err_str, len, + "Volume (%s) does not exist", volname); + gf_log (this->name, GF_LOG_ERROR, + "%s", err_str); + goto out; + } + + ret = glusterd_snapshot_get_vol_snapnames (dict, volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get snapshot list for volume %s", + volname); + goto out; + } + } + + /* If everything is successful then send the response back to cli. + In case of failure the caller of this function will take of response.*/ + ret = glusterd_op_send_cli_response (op, 0, 0, req, dict, err_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to send cli " + "response"); + goto out; + } + + ret = 0; + +out: + return ret; +} + +/* This is a snapshot create handler function. This function will be + * executed in the originator node. This function is responsible for + * calling mgmt_v3 framework to do the actual snap creation on all the bricks + * + * @param req RPC request object + * @param op gluster operation + * @param dict dictionary containing snapshot restore request + * @param err_str In case of an err this string should be populated + * @param len length of err_str buffer + * + * @return Negative value on Failure and 0 in success + */ +int +glusterd_handle_snapshot_create (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict, char *err_str, size_t len) +{ + int ret = -1; + char *volname = NULL; + char *snapname = NULL; + int64_t volcount = 0; + xlator_t *this = NULL; + char key[PATH_MAX] = ""; + char *username = NULL; + char *password = NULL; + uuid_t *uuid_ptr = NULL; + uuid_t tmp_uuid = {0}; + int i = 0; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (req); + GF_ASSERT (dict); + GF_ASSERT (err_str); + + ret = dict_get_int64 (dict, "volcount", &volcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "get the volume count"); + goto out; + } + if (volcount <= 0) { + gf_log (this->name, GF_LOG_ERROR, "Invalid volume count %ld " + "supplied", volcount); + ret = -1; + goto out; + } + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the snapname"); + goto out; + } + + if (strlen(snapname) >= GLUSTERD_MAX_SNAP_NAME) { + snprintf (err_str, len, "snapname cannot exceed 255 " + "characters"); + gf_log (this->name, GF_LOG_ERROR, "%s", err_str); + ret = -1; + goto out; + } + + uuid_ptr = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); + if (!uuid_ptr) { + gf_log (this->name, GF_LOG_ERROR, "Out Of Memory"); + ret = -1; + goto out; + } + + uuid_generate (*uuid_ptr); + ret = dict_set_bin (dict, "snap-id", uuid_ptr, sizeof(uuid_t)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to set snap-id"); + GF_FREE (uuid_ptr); + goto out; + } + uuid_ptr = NULL; + + ret = dict_set_int64 (dict, "snap-time", (int64_t)time(NULL)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to set snap-time"); + goto out; + } + + for (i = 1; i <= volcount; i++) { + snprintf (key, sizeof (key), "volname%d", i); + ret = dict_get_str (dict, key, &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get volume name"); + goto out; + } + + /* generate internal username and password for the snap*/ + uuid_generate (tmp_uuid); + username = gf_strdup (uuid_utoa (tmp_uuid)); + snprintf (key, sizeof(key), "volume%d_username", i); + ret = dict_set_dynstr (dict, key, username); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set snap " + "username for volume %s", volname); + GF_FREE (username); + goto out; + } + + uuid_generate (tmp_uuid); + password = gf_strdup (uuid_utoa (tmp_uuid)); + snprintf (key, sizeof(key), "volume%d_password", i); + ret = dict_set_dynstr (dict, key, password); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set snap " + "password for volume %s", volname); + GF_FREE (password); + goto out; + } + + uuid_ptr = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); + if (!uuid_ptr) { + gf_log (this->name, GF_LOG_ERROR, "Out Of Memory"); + ret = -1; + goto out; + } + + snprintf (key, sizeof(key) - 1, "vol%d_volid", i); + uuid_generate (*uuid_ptr); + ret = dict_set_bin (dict, key, uuid_ptr, sizeof(uuid_t)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set snap_volid"); + GF_FREE (uuid_ptr); + goto out; + } + } + + ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to initiate snap " + "phases"); + } + +out: + return ret; +} + +/* This is a snapshot status handler function. This function will be + * executed in a originator node. This function is responsible for + * calling mgmt v3 framework to get the actual snapshot status from + * all the bricks + * + * @param req RPC request object + * @param op gluster operation + * @param dict dictionary containing snapshot status request + * @param err_str In case of an err this string should be populated + * @param len length of err_str buffer + * + * return : 0 in case of success. + * -1 in case of failure. + * + */ +int +glusterd_handle_snapshot_status (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict, char *err_str, size_t len) +{ + int ret = -1; + char *volname = NULL; + char *snapname = NULL; + char *buf = NULL; + glusterd_conf_t *conf = NULL; + xlator_t *this = NULL; + int32_t cmd = -1; + int i = 0; + dict_t *voldict = NULL; + char key[PATH_MAX] = ""; + glusterd_volinfo_t *volinfo = NULL; + glusterd_snap_t *snap = NULL; + glusterd_volinfo_t *snap_volinfo = NULL; + + this = THIS; + GF_ASSERT (this); + conf = this->private; + + GF_ASSERT (conf); + GF_ASSERT (req); + GF_ASSERT (dict); + GF_ASSERT (err_str); + + ret = dict_get_int32 (dict, "cmd", &cmd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not get status type"); + goto out; + } + switch (cmd) { + case GF_SNAP_STATUS_TYPE_ALL: + { + /* IF we give "gluster snapshot status" + * then lock is held on all snaps. + * This is the place where necessary information + * (snapname and snapcount)is populated in dictionary + * for locking. + */ + ++i; + list_for_each_entry (snap, &conf->snapshots, snap_list) + { + snprintf (key, sizeof (key), "snapname%d", i); + buf = gf_strdup (snap->snapname); + if (!buf) { + ret = -1; + goto out; + } + ret = dict_set_dynstr (dict, key, buf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save snapname (%s) " + "in the dictionary", + snap->snapname); + GF_FREE (buf); + goto out; + } + + buf = NULL; + i++; + } + + ret = dict_set_int32 (dict, "snapcount", i - 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not " + "save snapcount in the dictionary"); + goto out; + } + break; + } + + case GF_SNAP_STATUS_TYPE_SNAP: + { + /* IF we give "gluster snapshot status <snapname>" + * then lock is held on single snap. + * This is the place where necessary information + * (snapname)is populated in dictionary + * for locking. + */ + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to fetch snap name"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (!snap) { + snprintf (err_str, len, "Snap (%s)" + "does not exist", snapname); + gf_log(this->name, GF_LOG_ERROR, + "%s", err_str); + ret = -1; + goto out; + } + break; + } + case GF_SNAP_STATUS_TYPE_VOL: + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to fetch volname"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + snprintf (err_str, len, "Volume (%s) " + "does not exist", volname); + gf_log (this->name, GF_LOG_ERROR, + "%s", err_str); + goto out; + } + + i = 1; + list_for_each_entry (snap_volinfo, + &volinfo->snap_volumes, snapvol_list) { + snprintf (key, sizeof (key), "snapname%d", i); + + buf = gf_strdup + (snap_volinfo->snapshot->snapname); + if (!buf) { + ret = -1; + goto out; + } + + ret = dict_set_dynstr (dict, key, buf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save snapname"); + GF_FREE (buf); + goto out; + } + + buf = NULL; + i++; + } + + ret = dict_set_int32 (dict, "snapcount", i-1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save snapcount"); + goto out; + } + break; + default: + { + gf_log (this->name, GF_LOG_ERROR, "Unknown type"); + ret = -1; + goto out; + } + } + + /* Volume lock is not necessary for snapshot status, hence + * turning it off + */ + ret = dict_set_int8 (dict, "hold_vol_locks", 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Setting volume lock " + "flag failed"); + goto out; + } + + ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to initiate " + "snap phases"); + goto out; + } + + ret = 0; + +out: + if (voldict) { + dict_unref (voldict); + } + return ret; +} + + +/* This is a snapshot restore handler function. This function will be + * executed in the originator node. This function is responsible for + * calling mgmt_v3 framework to do the actual restore on all the bricks + * + * @param req RPC request object + * @param op gluster operation + * @param dict dictionary containing snapshot restore request + * @param err_str In case of an err this string should be populated + * @param len length of err_str buffer + * + * @return Negative value on Failure and 0 in success + */ +int +glusterd_handle_snapshot_restore (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict, char *err_str, size_t len) +{ + int ret = -1; + char *snapname = NULL; + char *buf = NULL; + glusterd_conf_t *conf = NULL; + xlator_t *this = NULL; + glusterd_snap_t *snap = NULL; + glusterd_volinfo_t *snap_volinfo = NULL; + int32_t i = 0; + char key[PATH_MAX] = ""; + + this = THIS; + GF_ASSERT (this); + conf = this->private; + + GF_ASSERT (conf); + GF_ASSERT (req); + GF_ASSERT (dict); + GF_ASSERT (err_str); + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "get snapname"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (!snap) { + snprintf (err_str, len, "Snap (%s) does not exist", snapname); + gf_log (this->name, GF_LOG_ERROR, "%s", err_str); + ret = -1; + goto out; + } + + list_for_each_entry (snap_volinfo, &snap->volumes, vol_list) { + i++; + snprintf (key, sizeof (key), "volname%d", i); + buf = gf_strdup (snap_volinfo->parent_volname); + if (!buf) { + ret = -1; + goto out; + } + ret = dict_set_dynstr (dict, key, buf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not set " + "parent volume name %s in the dict", + snap_volinfo->parent_volname); + GF_FREE (buf); + goto out; + } + buf = NULL; + } + + ret = dict_set_int32 (dict, "volcount", i); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save volume count"); + goto out; + } + + ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to initiate snap " + "phases"); + goto out; + } + + ret = 0; + +out: + return ret; +} + +glusterd_snap_t* +glusterd_create_snap_object (dict_t *dict, dict_t *rsp_dict) +{ + char *snapname = NULL; + uuid_t *snap_id = NULL; + char *description = NULL; + glusterd_snap_t *snap = NULL; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + int ret = -1; + int64_t time_stamp = 0; + + this = THIS; + priv = this->private; + + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + /* Fetch snapname, description, id and time from dict */ + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snapname"); + goto out; + } + + /* Ignore ret value for description*/ + ret = dict_get_str (dict, "description", &description); + + ret = dict_get_bin (dict, "snap-id", (void **)&snap_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snap_id"); + goto out; + } + + ret = dict_get_int64 (dict, "snap-time", &time_stamp); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snap-time"); + goto out; + } + if (time_stamp <= 0) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "Invalid time-stamp: %ld", + time_stamp); + goto out; + } + + list_for_each_entry (snap, &priv->snapshots, snap_list) { + if (!strcmp (snap->snapname, snapname) || + !uuid_compare (snap->snap_id, *snap_id)) { + gf_log (THIS->name, GF_LOG_ERROR, + "Found duplicate snap %s (%s)", + snap->snapname, uuid_utoa (snap->snap_id)); + ret = -1; + break; + } + } + if (ret) { + snap = NULL; + goto out; + } + + snap = glusterd_new_snap_object (); + if (!snap) { + gf_log (this->name, GF_LOG_ERROR, "Could not create " + "the snap object for snap %s", snapname); + goto out; + } + + strcpy (snap->snapname, snapname); + uuid_copy (snap->snap_id, *snap_id); + snap->time_stamp = (time_t)time_stamp; + /* Set the status as GD_SNAP_STATUS_INIT and once the backend snapshot + is taken and snap is really ready to use, set the status to + GD_SNAP_STATUS_IN_USE. This helps in identifying the incomplete + snapshots and cleaning them up. + */ + snap->snap_status = GD_SNAP_STATUS_INIT; + if (description) { + snap->description = gf_strdup (description); + if (snap->description == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "Saving the Snap Description Failed"); + ret = -1; + goto out; + } + } + + ret = glusterd_store_snap (snap); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Could not store snap" + "object %s", snap->snapname); + goto out; + } + + list_add_order (&snap->snap_list, &priv->snapshots, + glusterd_compare_snap_time); + + gf_log (this->name, GF_LOG_TRACE, "Snap %s added to the list", + snap->snapname); + + ret = 0; + +out: + if (ret) { + if (snap) + glusterd_snap_remove (rsp_dict, snap, + _gf_true, _gf_true); + snap = NULL; + } + + return snap; +} + +/* This function is called to get the device path of the snap lvm. Usually + if /dev/mapper/<group-name>-<lvm-name> is the device for the lvm, + then the snap device will be /dev/<group-name>/<snapname>. + This function takes care of building the path for the snap device. +*/ +char * +glusterd_build_snap_device_path (char *device, char *snapname) +{ + char snap[PATH_MAX] = ""; + char msg[1024] = ""; + char volgroup[PATH_MAX] = ""; + char *snap_device = NULL; + xlator_t *this = NULL; + runner_t runner = {0,}; + char *ptr = NULL; + int ret = -1; + + this = THIS; + GF_ASSERT (this); + if (!device) { + gf_log (this->name, GF_LOG_ERROR, "device is NULL"); + goto out; + } + if (!snapname) { + gf_log (this->name, GF_LOG_ERROR, "snapname is NULL"); + goto out; + } + + runinit (&runner); + runner_add_args (&runner, "/sbin/lvs", "--noheadings", "-o", "vg_name", + device, NULL); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + snprintf (msg, sizeof (msg), "Get volume group for device %s", device); + runner_log (&runner, this->name, GF_LOG_DEBUG, msg); + ret = runner_start (&runner); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get volume group " + "for device %s", device); + runner_end (&runner); + goto out; + } + ptr = fgets(volgroup, sizeof(volgroup), + runner_chio (&runner, STDOUT_FILENO)); + if (!ptr || !strlen(volgroup)) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get volume group " + "for snap %s", snapname); + runner_end (&runner); + ret = -1; + goto out; + } + runner_end (&runner); + + snprintf (snap, sizeof(snap), "/dev/%s/%s", gf_trim(volgroup), + snapname); + snap_device = gf_strdup (snap); + if (!snap_device) { + gf_log (this->name, GF_LOG_WARNING, "Cannot copy the " + "snapshot device name for snapname: %s)", snapname); + } + +out: + return snap_device; +} + +/* This function actually calls the command (or the API) for taking the + snapshot of the backend brick filesystem. If this is successful, + then call the glusterd_snap_create function to create the snap object + for glusterd +*/ +char * +glusterd_take_lvm_snapshot (glusterd_volinfo_t *snap_vol, + glusterd_brickinfo_t *brickinfo) +{ + char msg[NAME_MAX] = ""; + char buf[PATH_MAX] = ""; + char *snap_device = NULL; + char *ptr = NULL; + char *device = NULL; + int ret = -1; + gf_boolean_t match = _gf_false; + runner_t runner = {0,}; + xlator_t *this = NULL; + + this = THIS; + + if (!brickinfo) { + gf_log (this->name, GF_LOG_ERROR, "brickinfo NULL"); + goto out; + } + + device = glusterd_get_brick_mount_details (brickinfo); + if (!device) { + gf_log (this->name, GF_LOG_ERROR, "getting device name for " + "the brick %s:%s failed", brickinfo->hostname, + brickinfo->path); + goto out; + } + + /* Figuring out if setactivationskip flag is supported or not */ + runinit (&runner); + snprintf (msg, sizeof (msg), "running lvcreate help"); + runner_add_args (&runner, "/sbin/lvcreate", "--help", NULL); + runner_log (&runner, "", GF_LOG_DEBUG, msg); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + ret = runner_start (&runner); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to run lvcreate help"); + runner_end (&runner); + goto out; + } + + /* Looking for setactivationskip in lvcreate --help */ + do { + ptr = fgets(buf, sizeof(buf), + runner_chio (&runner, STDOUT_FILENO)); + if (ptr) { + if (strstr(buf, "setactivationskip")) { + match = _gf_true; + break; + } + } + } while (ptr != NULL); + runner_end (&runner); + + /* Takng the actual snapshot */ + runinit (&runner); + snprintf (msg, sizeof (msg), "taking snapshot of the brick %s:%s", + brickinfo->hostname, brickinfo->path); + if (match == _gf_true) + runner_add_args (&runner, "/sbin/lvcreate", "-s", device, + "--setactivationskip", "n", "--name", + snap_vol->volname, NULL); + else + runner_add_args (&runner, "/sbin/lvcreate", "-s", device, + "--name", snap_vol->volname, NULL); + runner_log (&runner, "", GF_LOG_DEBUG, msg); + ret = runner_start (&runner); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "taking snapshot of the " + "brick (%s:%s) of device %s failed", + brickinfo->hostname, brickinfo->path, device); + runner_end (&runner); + goto out; + } + runner_end (&runner); + + snap_device = glusterd_build_snap_device_path (device, + snap_vol->volname); + if (!snap_device) { + gf_log (this->name, GF_LOG_WARNING, "Cannot copy the snapshot " + "device name for snap %s (volume id: %s)", + snap_vol->snapshot->snapname, snap_vol->volname); + ret = -1; + goto out; + } + +out: + return snap_device; +} + +int32_t +glusterd_snap_brick_create (char *device, glusterd_volinfo_t *snap_volinfo, + glusterd_brickinfo_t *original_brickinfo, + int32_t brick_count, char *snap_brick_dir) +{ + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + char snap_brick_mount_path[PATH_MAX] = ""; + char snap_brick_path[PATH_MAX] = ""; + char msg[1024] = ""; + struct stat statbuf = {0, }; + runner_t runner = {0, }; + + this = THIS; + priv = this->private; + + GF_ASSERT (device); + GF_ASSERT (snap_volinfo); + GF_ASSERT (original_brickinfo); + GF_ASSERT (snap_brick_dir); + + snprintf (snap_brick_mount_path, sizeof (snap_brick_mount_path), + "%s/%s/brick%d", snap_mount_folder, snap_volinfo->volname, + brick_count+1); + + snprintf (snap_brick_path, sizeof (snap_brick_path), "%s%s", + snap_brick_mount_path, snap_brick_dir); + + ret = mkdir_p (snap_brick_mount_path, 0777, _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "creating the brick directory" + " %s for the snapshot %s(device: %s) failed", + snap_brick_mount_path, snap_volinfo->volname, device); + goto out; + } + /* mount the snap logical device on the directory inside + /run/gluster/snaps/<snapname>/@snap_brick_mount_path + Way to mount the snap brick via mount api is this. + ret = mount (device, snap_brick_mount_path, entry->mnt_type, + MS_MGC_VAL, "nouuid"); + But for now, mounting using runner apis. + */ + runinit (&runner); + snprintf (msg, sizeof (msg), "mounting snapshot of the brick %s:%s", + original_brickinfo->hostname, original_brickinfo->path); + runner_add_args (&runner, "mount", "-o", "nouuid", device, + snap_brick_mount_path, NULL); + runner_log (&runner, "", GF_LOG_DEBUG, msg); + + /* let glusterd get blocked till snapshot is over */ + synclock_unlock (&priv->big_lock); + ret = runner_run (&runner); + synclock_lock (&priv->big_lock); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "mounting the snapshot " + "logical device %s failed (error: %s)", device, + strerror (errno)); + goto out; + } else + gf_log (this->name, GF_LOG_DEBUG, "mounting the snapshot " + "logical device %s successful", device); + + ret = stat (snap_brick_path, &statbuf); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "stat of the brick %s" + "(brick mount: %s) failed (%s)", snap_brick_path, + snap_brick_mount_path, strerror (errno)); + goto out; + } + ret = sys_lsetxattr (snap_brick_path, + GF_XATTR_VOL_ID_KEY, + snap_volinfo->volume_id, 16, + XATTR_REPLACE); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "extended attribute %s on %s. Reason: " + "%s, snap: %s", GF_XATTR_VOL_ID_KEY, + snap_brick_path, strerror (errno), + snap_volinfo->volname); + goto out; + } + +out: + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "unmounting the snap brick" + " mount %s", snap_brick_mount_path); + umount (snap_brick_mount_path); + } + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* Added missed_snap_entry to rsp_dict */ +int32_t +glusterd_add_missed_snaps_to_dict (dict_t *rsp_dict, char *snap_uuid, + glusterd_brickinfo_t *brickinfo, + int32_t brick_number, int32_t op) +{ + char *buf = NULL; + char missed_snap_entry[PATH_MAX] = ""; + char name_buf[PATH_MAX] = ""; + int32_t missed_snap_count = -1; + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (rsp_dict); + GF_ASSERT (snap_uuid); + GF_ASSERT (brickinfo); + + snprintf (missed_snap_entry, sizeof(missed_snap_entry), + "%s:%s=%d:%s:%d:%d", uuid_utoa(brickinfo->uuid), + snap_uuid, brick_number, brickinfo->path, op, + GD_MISSED_SNAP_PENDING); + + buf = gf_strdup (missed_snap_entry); + if (!buf) { + ret = -1; + goto out; + } + + /* Fetch the missed_snap_count from the dict */ + ret = dict_get_int32 (rsp_dict, "missed_snap_count", + &missed_snap_count); + if (ret) { + /* Initialize the missed_snap_count for the first time */ + missed_snap_count = 0; + } + + /* Setting the missed_snap_entry in the rsp_dict */ + snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d", + missed_snap_count); + ret = dict_set_dynstr (rsp_dict, name_buf, buf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set missed_snap_entry (%s) " + "in the rsp_dict.", buf); + GF_FREE (buf); + goto out; + } + missed_snap_count++; + + /* Setting the new missed_snap_count in the dict */ + ret = dict_set_int32 (rsp_dict, "missed_snap_count", + missed_snap_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set missed_snap_count for %s " + "in the rsp_dict.", missed_snap_entry); + goto out; + } + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int32_t +glusterd_add_bricks_to_snap_volume (dict_t *dict, dict_t *rsp_dict, + glusterd_volinfo_t *snap_vol, + glusterd_brickinfo_t *original_brickinfo, + glusterd_brickinfo_t *snap_brickinfo, + char **snap_brick_dir, int64_t volcount, + int32_t brick_count) +{ + char key[PATH_MAX] = ""; + char snap_brick_path[PATH_MAX] = ""; + char *snap_device = NULL; + gf_boolean_t add_missed_snap = _gf_false; + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + GF_ASSERT (snap_vol); + GF_ASSERT (original_brickinfo); + GF_ASSERT (snap_brickinfo); + GF_ASSERT (snap_brick_dir); + + snprintf (key, sizeof(key) - 1, "vol%ld.brickdir%d", volcount, + brick_count); + ret = dict_get_ptr (dict, key, (void **)snap_brick_dir); + if (ret) { + /* Using original brickinfo here because it will be a + * pending snapshot and storing the original brickinfo + * will help in mapping while recreating the missed snapshot + */ + gf_log (this->name, GF_LOG_WARNING, "Unable to fetch " + "snap mount path (%s). Using original brickinfo", key); + snap_brickinfo->snap_status = -1; + strcpy (snap_brick_path, original_brickinfo->path); + + /* In origiator node add snaps missed + * from different nodes to the dict + */ + if (is_origin_glusterd (dict) == _gf_true) + add_missed_snap = _gf_true; + } else { + /* Create brick-path in the format /var/run/gluster/snaps/ * + * <snap-uuid>/<original-brick#>/snap-brick-dir * + */ + snprintf (snap_brick_path, sizeof(snap_brick_path), + "%s/%s/brick%d%s", snap_mount_folder, + snap_vol->volname, brick_count+1, + *snap_brick_dir); + } + + if ((snap_brickinfo->snap_status != -1) && + (!uuid_compare (original_brickinfo->uuid, MY_UUID)) && + (!glusterd_is_brick_started (original_brickinfo))) { + /* In case if the brick goes down after prevalidate. */ + gf_log (this->name, GF_LOG_WARNING, "brick %s:%s is not" + " started (snap: %s)", + original_brickinfo->hostname, + original_brickinfo->path, + snap_vol->snapshot->snapname); + + snap_brickinfo->snap_status = -1; + strcpy (snap_brick_path, original_brickinfo->path); + add_missed_snap = _gf_true; + } + + if (add_missed_snap) { + ret = glusterd_add_missed_snaps_to_dict (rsp_dict, + snap_vol->volname, + original_brickinfo, + brick_count + 1, + GF_SNAP_OPTION_TYPE_CREATE); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to add missed" + " snapshot info for %s:%s in the rsp_dict", + original_brickinfo->hostname, + original_brickinfo->path); + goto out; + } + } + + snprintf (key, sizeof(key), "vol%ld.brick_snapdevice%d", + volcount, brick_count); + ret = dict_get_ptr (dict, key, (void **)&snap_device); + if (ret) { + /* If the device name is empty, so will be the brick path + * Hence the missed snap has already been added above + */ + gf_log (this->name, GF_LOG_ERROR, "Unable to fetch " + "snap device (%s). Leaving empty", key); + } else + strcpy (snap_brickinfo->device_path, snap_device); + + ret = gf_canonicalize_path (snap_brick_path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to canonicalize path"); + goto out; + } + + strcpy (snap_brickinfo->hostname, original_brickinfo->hostname); + strcpy (snap_brickinfo->path, snap_brick_path); + uuid_copy (snap_brickinfo->uuid, original_brickinfo->uuid); + list_add_tail (&snap_brickinfo->brick_list, &snap_vol->bricks); + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +static int32_t +glusterd_take_brick_snapshot (glusterd_volinfo_t *origin_vol, + glusterd_volinfo_t *snap_vol, dict_t *rsp_dict, + glusterd_brickinfo_t *original_brickinfo, + glusterd_brickinfo_t *snap_brickinfo, + char *snap_brick_dir, int32_t brick_count) +{ + char *device = NULL; + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (origin_vol); + GF_ASSERT (snap_vol); + GF_ASSERT (rsp_dict); + GF_ASSERT (original_brickinfo); + GF_ASSERT (snap_brickinfo); + GF_ASSERT (snap_brick_dir); + + device = glusterd_take_lvm_snapshot (snap_vol, original_brickinfo); + /* Fail the snapshot even though snapshot on one of + the bricks fails. At the end when we check whether + the snapshot volume meets quorum or not, then the + the snapshot can either be treated as success, or + in case of failure we can undo the changes and return + failure to cli. */ + if (!device) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to take snapshot of %s:%s", + original_brickinfo->hostname, + original_brickinfo->path); + goto out; + } + + /* create the complete brick here */ + ret = glusterd_snap_brick_create (device, snap_vol, + original_brickinfo, + brick_count, snap_brick_dir); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "not able to" + " create the brickinfo for the snap %s" + ", volume %s", snap_vol->snapshot->snapname, + origin_vol->volname); + goto out; + } + +out: + if (device) + GF_FREE (device); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* Look for disconnected peers, for missed snap creates or deletes */ +static int32_t +glusterd_find_missed_snap (dict_t *rsp_dict, glusterd_volinfo_t *vol, + char *snap_uuid, struct list_head *peers, + int32_t op) +{ + int32_t brick_count = -1; + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + glusterd_brickinfo_t *brickinfo = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (rsp_dict); + GF_ASSERT (peers); + GF_ASSERT (vol); + GF_ASSERT (snap_uuid); + + brick_count = 0; + list_for_each_entry (brickinfo, &vol->bricks, brick_list) { + if (!uuid_compare (brickinfo->uuid, MY_UUID)) { + /* If the brick belongs to the same node */ + brick_count++; + continue; + } + + list_for_each_entry (peerinfo, peers, uuid_list) { + if (uuid_compare (peerinfo->uuid, brickinfo->uuid)) { + /* If the brick doesnt belong to this peer */ + continue; + } + + /* Found peer who owns the brick, * + * if peer is not connected or not * + * friend add it to missed snap list */ + if (!(peerinfo->connected) || + (peerinfo->state.state != + GD_FRIEND_STATE_BEFRIENDED)) { + ret = glusterd_add_missed_snaps_to_dict + (rsp_dict, + snap_uuid, + brickinfo, + brick_count + 1, + op); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add missed snapshot " + "info for %s:%s in the " + "rsp_dict", brickinfo->hostname, + brickinfo->path); + goto out; + } + } + } + brick_count++; + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +glusterd_volinfo_t * +glusterd_do_snap_vol (glusterd_volinfo_t *origin_vol, glusterd_snap_t *snap, + dict_t *dict, dict_t *rsp_dict, int64_t volcount) +{ + char key[PATH_MAX] = ""; + char *snap_brick_dir = NULL; + char *username = NULL; + char *password = NULL; + glusterd_brickinfo_t *brickinfo = NULL; + glusterd_conf_t *priv = NULL; + glusterd_volinfo_t *snap_vol = NULL; + uuid_t *snap_volid = NULL; + int32_t ret = -1; + int32_t brick_count = 0; + glusterd_brickinfo_t *snap_brickinfo = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (origin_vol); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + /* fetch username, password and vol_id from dict*/ + snprintf (key, sizeof(key), "volume%ld_username", volcount); + ret = dict_get_str (dict, key, &username); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get %s for " + "snap %s", key, snap->snapname); + goto out; + } + + snprintf (key, sizeof(key), "volume%ld_password", volcount); + ret = dict_get_str (dict, key, &password); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get %s for " + "snap %s", key, snap->snapname); + goto out; + } + snprintf (key, sizeof(key) - 1, "vol%ld_volid", volcount); + ret = dict_get_bin (dict, key, (void **)&snap_volid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to fetch snap_volid"); + goto out; + } + + /* We are not setting the username and password here as + * we need to set the user name and password passed in + * the dictionary + */ + ret = glusterd_volinfo_dup (origin_vol, &snap_vol, _gf_false); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to duplicate volinfo " + "for the snapshot %s", snap->snapname); + goto out; + } + + /* uuid is used as lvm snapshot name. + This will avoid restrictions on snapshot names provided by user */ + GLUSTERD_GET_UUID_NOHYPHEN (snap_vol->volname, *snap_volid); + uuid_copy (snap_vol->volume_id, *snap_volid); + snap_vol->is_snap_volume = _gf_true; + strcpy (snap_vol->parent_volname, origin_vol->volname); + snap_vol->snapshot = snap; + + glusterd_auth_set_username (snap_vol, username); + glusterd_auth_set_password (snap_vol, password); + + /* Adding snap brickinfos to the snap volinfo */ + brick_count = 0; + list_for_each_entry (brickinfo, &origin_vol->bricks, brick_list) { + snap_brickinfo = NULL; + + ret = glusterd_brickinfo_new (&snap_brickinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "initializing the brick for the snap " + "volume failed (snapname: %s)", snap->snapname); + goto out; + } + + ret = glusterd_add_bricks_to_snap_volume (dict, rsp_dict, + snap_vol, + brickinfo, + snap_brickinfo, + &snap_brick_dir, + volcount, + brick_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add the snap brick for " + "%s:%s to the snap volume", + brickinfo->hostname, brickinfo->path); + GF_FREE (snap_brickinfo); + goto out; + } + + /* Take snapshot of the brick */ + if ((uuid_compare (brickinfo->uuid, MY_UUID)) || + (snap_brickinfo->snap_status == -1)) { + brick_count++; + continue; + } + + ret = glusterd_take_brick_snapshot (origin_vol, snap_vol, + rsp_dict, brickinfo, + snap_brickinfo, + snap_brick_dir, + brick_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to take snapshot for %s:%s", + brickinfo->hostname, brickinfo->path); + goto out; + } + + brick_count++; + } + + /*TODO: the quorum check of the snap volume here */ + + ret = glusterd_store_volinfo (snap_vol, + GLUSTERD_VOLINFO_VER_AC_INCREMENT); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to store snapshot " + "volinfo (%s) for snap %s", snap_vol->volname, + snap->snapname); + goto out; + } + + ret = generate_brick_volfiles (snap_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "generating the brick " + "volfiles for the snap %s (volume: %s) failed", + snap->snapname, origin_vol->volname); + goto out; + } + + ret = generate_client_volfiles (snap_vol, GF_CLIENT_TRUSTED); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "generating the trusted " + "client volfiles for the snap %s (volume: %s) failed", + snap->snapname, origin_vol->volname); + goto out; + } + ret = generate_client_volfiles (snap_vol, GF_CLIENT_OTHER); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "generating the client " + "volfiles for the snap %s (volume: %s) failed", + snap->snapname, origin_vol->volname); + goto out; + } + + ret = glusterd_list_add_snapvol (origin_vol, snap_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "could not add the snap " + "volume %s to the list", snap_vol->volname); + goto out; + } + + list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) { + if (uuid_compare (brickinfo->uuid, MY_UUID)) + continue; + + if (brickinfo->snap_status == -1) { + gf_log (this->name, GF_LOG_INFO, + "not starting snap brick %s:%s for " + "for the snap %s (volume: %s)", + brickinfo->hostname, brickinfo->path, + snap->snapname, origin_vol->volname); + continue; + } + + ret = glusterd_brick_start (snap_vol, brickinfo, _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "starting the " + "brick %s:%s for the snap %s (volume: %s) " + "failed", brickinfo->hostname, brickinfo->path, + snap->snapname, origin_vol->volname); + goto out; + } + } + + snap_vol->status = GLUSTERD_STATUS_STARTED; + ret = glusterd_store_volinfo (snap_vol, + GLUSTERD_VOLINFO_VER_AC_INCREMENT); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to store snap volinfo"); + goto out; + } + +out: + if (ret) { + if (snap_vol) + glusterd_snap_volume_remove (rsp_dict, snap_vol, + _gf_true, _gf_true); + snap_vol = NULL; + } + + return snap_vol; +} + +/* This is a snapshot remove handler function. This function will be + * executed in the originator node. This function is responsible for + * calling mgmt v3 framework to do the actual remove on all the bricks + * + * @param req RPC request object + * @param op gluster operation + * @param dict dictionary containing snapshot remove request + * @param err_str In case of an err this string should be populated + * @param len length of err_str buffer + * + * @return Negative value on Failure and 0 in success + */ +int +glusterd_handle_snapshot_remove (rpcsvc_request_t *req, glusterd_op_t op, + dict_t *dict, char *err_str, size_t len) +{ + int ret = -1; + int64_t volcount = 0; + char *snapname = NULL; + char *volname = NULL; + char key[PATH_MAX] = ""; + glusterd_snap_t *snap = NULL; + glusterd_volinfo_t *snap_vol = NULL; + glusterd_volinfo_t *tmp = NULL; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (req); + GF_ASSERT (dict); + GF_ASSERT (err_str); + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get snapname"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (!snap) { + snprintf (err_str, len, "Snap (%s) does not exist", snapname); + gf_log (this->name, GF_LOG_ERROR, + "%s", err_str); + ret = -1; + goto out; + } + + /* Set volnames in the dict to get mgmt_v3 lock */ + list_for_each_entry_safe (snap_vol, tmp, &snap->volumes, vol_list) { + volcount++; + volname = gf_strdup (snap_vol->parent_volname); + if (!volname) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "strdup failed"); + goto out; + } + + snprintf (key, sizeof (key), "volname%ld", volcount); + ret = dict_set_dynstr (dict, key, volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "volume name in dictionary"); + GF_FREE (volname); + goto out; + } + volname = NULL; + } + ret = dict_set_int64 (dict, "volcount", volcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set volcount"); + goto out; + } + + ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to initiate snap " + "phases"); + goto out; + } + + ret = 0; +out: + return ret; +} + +int +glusterd_snapshot_remove_prevalidate (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + int32_t ret = -1; + char *snapname = NULL; + xlator_t *this = NULL; + glusterd_snap_t *snap = NULL; + + this = THIS; + + if (!dict || !op_errstr) { + gf_log (this->name, GF_LOG_ERROR, "input parameters NULL"); + goto out; + } + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Getting the snap name " + "failed"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (!snap) { + gf_log (this->name, GF_LOG_ERROR, "Snap %s does not exist", + snapname); + ret = -1; + goto out; + } + ret = 0; +out: + return ret; +} + +int +glusterd_snapshot_status_prevalidate (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + int ret = -1; + char *snapname = NULL; + glusterd_conf_t *conf = NULL; + xlator_t *this = NULL; + int32_t cmd = -1; + glusterd_volinfo_t *volinfo = NULL; + char *volname = NULL; + + this = THIS; + GF_ASSERT (this); + conf = this->private; + + GF_ASSERT (conf); + GF_ASSERT (op_errstr); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, "Input dict is NULL"); + goto out; + } + + ret = dict_get_int32 (dict, "cmd", &cmd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not fetch status cmd"); + goto out; + } + + switch (cmd) { + case GF_SNAP_STATUS_TYPE_ALL: + { + break; + } + case GF_SNAP_STATUS_TYPE_SNAP: + { + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not fetch snapname"); + goto out; + } + + if (!glusterd_find_snap_by_name (snapname)) { + ret = gf_asprintf (op_errstr, "Snap (%s) " + "not found", snapname); + if (ret < 0) { + goto out; + } + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "Snap (%s) " + "not found", snapname); + goto out; + } + break; + } + case GF_SNAP_STATUS_TYPE_VOL: + { + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not fetch volname"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + ret = gf_asprintf (op_errstr, "Volume (%s)" + "not found", volname); + if (ret < 0) { + goto out; + } + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "Volume " + "%s not present", volname); + goto out; + } + break; + + } + default: + { + gf_log (this->name, GF_LOG_ERROR, "Invalid command"); + break; + } + } + ret = 0; + +out: + return ret; +} + +int32_t +glusterd_snapshot_remove_commit (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + int32_t ret = -1; + char *snapname = NULL; + char *dup_snapname = NULL; + glusterd_snap_t *snap = NULL; + glusterd_conf_t *priv = NULL; + glusterd_volinfo_t *snap_volinfo = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + GF_ASSERT (op_errstr); + + priv = this->private; + GF_ASSERT (priv); + + if (!dict || !op_errstr) { + gf_log (this->name, GF_LOG_ERROR, "input parameters NULL"); + goto out; + } + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Getting the snap name " + "failed"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (!snap) { + gf_log (this->name, GF_LOG_ERROR, "Snap %s does not exist", + snapname); + ret = -1; + goto out; + } + + if (is_origin_glusterd (dict) == _gf_true) { + /* TODO : As of now there is only volume in snapshot. + * Change this when multiple volume snapshot is introduced + */ + snap_volinfo = list_entry (snap->volumes.next, + glusterd_volinfo_t, + vol_list); + if (!snap_volinfo) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to fetch snap_volinfo"); + ret = -1; + goto out; + } + + /* From origin glusterd check if * + * any peers with snap bricks is down */ + ret = glusterd_find_missed_snap (rsp_dict, snap_volinfo, + snap_volinfo->volname, + &priv->peers, + GF_SNAP_OPTION_TYPE_DELETE); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to find missed snap deletes"); + goto out; + } + } + + ret = glusterd_snap_remove (rsp_dict, snap, _gf_true, _gf_false); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to remove snap %s", + snapname); + goto out; + } + + dup_snapname = gf_strdup (snapname); + if (!dup_snapname) { + gf_log (this->name, GF_LOG_ERROR, "Strdup failed"); + ret = -1; + goto out; + } + + ret = dict_set_dynstr (rsp_dict, "snapname", dup_snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set the snapname"); + GF_FREE (dup_snapname); + goto out; + } + + ret = 0; +out: + return ret; +} + +int32_t +glusterd_do_snap_cleanup (dict_t *dict, char **op_errstr, dict_t *rsp_dict) +{ + int32_t ret = -1; + char *name = NULL; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; + glusterd_volinfo_t *volinfo = NULL; + glusterd_snap_t *snap = NULL; + + this = THIS; + GF_ASSERT (this); + conf = this->private; + GF_ASSERT (conf); + + if (!dict || !op_errstr) { + gf_log (this->name, GF_LOG_ERROR, "input parameters NULL"); + goto out; + } + + ret = dict_get_str (dict, "snapname", &name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "getting the snap " + "name failed (volume: %s)", volinfo->volname); + goto out; + } + + /* + If the snapname is not found that means the failure happened at + staging, or in commit, before the snap object is created, in which + case there is nothing to cleanup. So set ret to 0. + */ + snap = glusterd_find_snap_by_name (name); + if (!snap) { + gf_log (this->name, GF_LOG_INFO, "snap %s is not found", name); + ret = 0; + goto out; + } + + ret = glusterd_snap_remove (rsp_dict, snap, _gf_true, _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "removing the snap %s failed", + name); + goto out; + } + + name = NULL; + + ret = 0; + +out: + + return ret; +} + +/* In case of a successful, delete or create operation, during post_validate * + * look for missed snap operations and update the missed snap lists */ +int32_t +glusterd_snapshot_update_snaps_post_validate (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + int32_t ret = -1; + int32_t missed_snap_count = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + GF_ASSERT (op_errstr); + + ret = dict_get_int32 (dict, "missed_snap_count", + &missed_snap_count); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "No missed snaps"); + ret = 0; + goto out; + } + + ret = glusterd_store_update_missed_snaps (dict, missed_snap_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to update missed_snaps_list"); + goto out; + } + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int32_t +glusterd_snapshot_create_commit (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + int ret = -1; + int64_t i = 0; + int64_t volcount = 0; + char *snapname = NULL; + char *volname = NULL; + char *tmp_name = NULL; + char key[PATH_MAX] = ""; + xlator_t *this = NULL; + glusterd_snap_t *snap = NULL; + glusterd_volinfo_t *origin_vol = NULL; + glusterd_volinfo_t *snap_vol = NULL; + glusterd_conf_t *priv = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(dict); + GF_ASSERT(op_errstr); + GF_ASSERT(rsp_dict); + priv = this->private; + GF_ASSERT(priv); + + ret = dict_get_int64 (dict, "volcount", &volcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "get the volume count"); + goto out; + } + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snapname"); + goto out; + } + tmp_name = gf_strdup (snapname); + if (!tmp_name) { + gf_log (this->name, GF_LOG_ERROR, "Out of memory"); + ret = -1; + goto out; + } + + ret = dict_set_dynstr (rsp_dict, "snapname", tmp_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set snapname in rsp_dict"); + GF_FREE (tmp_name); + goto out; + } + tmp_name = NULL; + + snap = glusterd_create_snap_object (dict, rsp_dict); + if (!snap) { + gf_log (this->name, GF_LOG_ERROR, "creating the" + "snap object %s failed", snapname); + ret = -1; + goto out; + } + + for (i = 1; i <= volcount; i++) { + snprintf (key, sizeof (key), "volname%ld", i); + ret = dict_get_str (dict, key, &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get volume name"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &origin_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get the volinfo for " + "the volume %s", volname); + goto out; + } + + /* TODO: Create a stub where the bricks are + added parallely by worker threads so that + the snap creating happens parallely. */ + snap_vol = glusterd_do_snap_vol (origin_vol, snap, dict, + rsp_dict, i); + if (!snap_vol) { + ret = -1; + gf_log (this->name, GF_LOG_WARNING, "taking the " + "snapshot of the volume %s failed", volname); + goto out; + } + } + + snap->snap_status = GD_SNAP_STATUS_IN_USE; + ret = glusterd_store_snap (snap); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Could not store snap" + "object %s", snap->snapname); + goto out; + } + + ret = 0; + +out: + if (ret) { + if (snap) + glusterd_snap_remove (rsp_dict, snap, + _gf_true, _gf_true); + snap = NULL; + } + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +snap_max_hard_limit_set_commit (dict_t *dict, uint64_t value, + char *volname, char **op_errstr) +{ + char err_str[PATH_MAX] = ""; + glusterd_conf_t *conf = NULL; + glusterd_volinfo_t *volinfo = NULL; + int ret = -1; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (volname); + GF_ASSERT (op_errstr); + + conf = this->private; + + GF_ASSERT (conf); + + /* TODO: Initiate auto deletion when there is a limit change */ + if (!volname) { + /* For system limit */ + conf->snap_max_hard_limit = value; + + ret = glusterd_store_global_info (this); + if (ret) { + snprintf (err_str, PATH_MAX, "Failed to store " + "snap-max-hard-limit for system"); + goto out; + } + } else { + /* For one volume */ + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + snprintf (err_str, PATH_MAX, "Failed to get the" + " volinfo for volume %s", volname); + goto out; + } + + volinfo->snap_max_hard_limit = value; + + ret = glusterd_store_volinfo (volinfo, + GLUSTERD_VOLINFO_VER_AC_INCREMENT); + if (ret) { + snprintf (err_str, PATH_MAX, "Failed to store " + "snap-max-hard-limit for volume %s", volname); + goto out; + } + } + + ret = 0; +out: + if (ret) { + *op_errstr = gf_strdup (err_str); + gf_log (this->name, GF_LOG_ERROR, "%s", err_str); + } + return ret; +} + +int +snap_max_limits_display_commit (dict_t *rsp_dict, char *volname, + char **op_errstr) +{ + char err_str[PATH_MAX] = ""; + char buf[PATH_MAX] = ""; + glusterd_conf_t *conf = NULL; + glusterd_volinfo_t *volinfo = NULL; + int ret = -1; + uint64_t active_hard_limit = 0; + uint64_t snap_max_limit = 0; + uint64_t soft_limit_value = -1; + uint64_t count = 0; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (rsp_dict); + GF_ASSERT (volname); + GF_ASSERT (op_errstr); + + conf = this->private; + + GF_ASSERT (conf); + + if (!volname) { + /* For system limit */ + list_for_each_entry (volinfo, &conf->volumes, vol_list) { + if (volinfo->is_snap_volume == _gf_true) + continue; + snap_max_limit = volinfo->snap_max_hard_limit; + if (snap_max_limit > conf->snap_max_hard_limit) + active_hard_limit = conf->snap_max_hard_limit; + else + active_hard_limit = snap_max_limit; + soft_limit_value = (active_hard_limit * + conf->snap_max_soft_limit) / 100; + + snprintf (buf, sizeof(buf), "volume%ld-volname", count); + ret = dict_set_str (rsp_dict, buf, volinfo->volname); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-snap-max-hard-limit", count); + ret = dict_set_uint64 (rsp_dict, buf, snap_max_limit); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-active-hard-limit", count); + ret = dict_set_uint64 (rsp_dict, buf, + active_hard_limit); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-snap-max-soft-limit", count); + ret = dict_set_uint64 (rsp_dict, buf, soft_limit_value); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set %s", buf); + goto out; + } + count++; + } + + ret = dict_set_uint64 (rsp_dict, "voldisplaycount", count); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set voldisplaycount"); + goto out; + } + } else { + /* For one volume */ + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + snprintf (err_str, PATH_MAX, "Failed to get the" + " volinfo for volume %s", volname); + goto out; + } + + snap_max_limit = volinfo->snap_max_hard_limit; + if (snap_max_limit > conf->snap_max_hard_limit) + active_hard_limit = conf->snap_max_hard_limit; + else + active_hard_limit = snap_max_limit; + + soft_limit_value = (active_hard_limit * + conf->snap_max_soft_limit) / 100; + + snprintf (buf, sizeof(buf), "volume%ld-volname", count); + ret = dict_set_str (rsp_dict, buf, volinfo->volname); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-snap-max-hard-limit", count); + ret = dict_set_uint64 (rsp_dict, buf, snap_max_limit); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-active-hard-limit", count); + ret = dict_set_uint64 (rsp_dict, buf, active_hard_limit); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-snap-max-soft-limit", count); + ret = dict_set_uint64 (rsp_dict, buf, soft_limit_value); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set %s", buf); + goto out; + } + + count++; + + ret = dict_set_uint64 (rsp_dict, "voldisplaycount", count); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set voldisplaycount"); + goto out; + } + + } + + ret = dict_set_uint64 (rsp_dict, "snap-max-hard-limit", + conf->snap_max_hard_limit); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set sys-snap-max-hard-limit "); + goto out; + } + + ret = dict_set_uint64 (rsp_dict, "snap-max-soft-limit", + conf->snap_max_soft_limit); + if (ret) { + snprintf (err_str, PATH_MAX, + "Failed to set sys-snap-max-hard-limit "); + goto out; + } + + ret = 0; +out: + if (ret) { + *op_errstr = gf_strdup (err_str); + gf_log (this->name, GF_LOG_ERROR, "%s", err_str); + } + return ret; +} + +int +glusterd_snapshot_config_commit (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + char *volname = NULL; + xlator_t *this = NULL; + int ret = -1; + char err_str[PATH_MAX] = {0,}; + glusterd_conf_t *conf = NULL; + int config_command = 0; + uint64_t hard_limit = 0; + uint64_t soft_limit = 0; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + + conf = this->private; + + GF_ASSERT (conf); + + ret = dict_get_int32 (dict, "config-command", &config_command); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get config-command type"); + goto out; + } + + /* Ignore the return value of the following dict_get, + * as they are optional + */ + ret = dict_get_str (dict, "volname", &volname); + + ret = dict_get_uint64 (dict, "snap-max-hard-limit", &hard_limit); + + ret = dict_get_uint64 (dict, "snap-max-soft-limit", &soft_limit); + + switch (config_command) { + case GF_SNAP_CONFIG_TYPE_SET: + if (hard_limit) { + /* Commit ops for snap-max-hard-limit */ + ret = snap_max_hard_limit_set_commit (dict, hard_limit, + volname, + op_errstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "snap-max-hard-limit set " + "commit failed."); + goto out; + } + } + + if (soft_limit) { + /* For system limit */ + conf->snap_max_soft_limit = soft_limit; + + ret = glusterd_store_global_info (this); + if (ret) { + snprintf (err_str, PATH_MAX, "Failed to store " + "snap-max-soft-limit for system"); + *op_errstr = gf_strdup (err_str); + gf_log (this->name, GF_LOG_ERROR, "%s", + err_str); + goto out; + } + } + break; + + case GF_SNAP_CONFIG_DISPLAY: + /* Reading data from local node only */ + if (!is_origin_glusterd (dict)) { + ret = 0; + break; + } + + ret = snap_max_limits_display_commit (rsp_dict, volname, + op_errstr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "snap-max-limit " + "display commit failed."); + goto out; + } + break; + default: + break; + } + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_get_brick_lvm_details (dict_t *rsp_dict, + glusterd_brickinfo_t *brickinfo, char *volname, + char *device, char *key_prefix) +{ + + int ret = -1; + glusterd_conf_t *priv = NULL; + runner_t runner = {0,}; + xlator_t *this = NULL; + char msg[PATH_MAX] = ""; + char buf[PATH_MAX] = ""; + char *ptr = NULL; + char *token = NULL; + char key[PATH_MAX] = ""; + char *value = NULL; + + GF_ASSERT (rsp_dict); + GF_ASSERT (brickinfo); + GF_ASSERT (volname); + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + + device = glusterd_get_brick_mount_details (brickinfo); + if (!device) { + gf_log (this->name, GF_LOG_ERROR, "Getting device name for " + "the brick %s:%s failed", brickinfo->hostname, + brickinfo->path); + goto out; + } + runinit (&runner); + snprintf (msg, sizeof (msg), "running lvs command, " + "for getting snap status"); + /* Using lvs command fetch the Volume Group name, + * Percentage of data filled and Logical Volume size + * + * "-o" argument is used to get the desired information, + * example : "lvs /dev/VolGroup/thin_vol -o vgname,lv_size", + * will get us Volume Group name and Logical Volume size. + * + * Here separator used is ":", + * for the above given command with separator ":", + * The output will be "vgname:lvsize" + */ + runner_add_args (&runner, "lvs", device, "--noheading", "-o", + "vg_name,data_percent,lv_size", + "--separator", ":", NULL); + runner_redir (&runner, STDOUT_FILENO, RUN_PIPE); + runner_log (&runner, "", GF_LOG_DEBUG, msg); + ret = runner_start (&runner); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not perform lvs action"); + goto end; + } + do { + ptr = fgets (buf, sizeof (buf), + runner_chio (&runner, STDOUT_FILENO)); + + if (ptr == NULL) + break; + token = strtok (buf, ":"); + if (token != NULL) { + while (token && token[0] == ' ') + token++; + if (!token) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "Invalid vg entry"); + goto end; + } + value = gf_strdup (token); + if (!value) { + ret = -1; + goto end; + } + ret = snprintf (key, sizeof (key), "%s.vgname", + key_prefix); + if (ret < 0) { + goto end; + } + + ret = dict_set_dynstr (rsp_dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save vgname "); + goto end; + } + } + + token = strtok (NULL, ":"); + if (token != NULL) { + value = gf_strdup (token); + if (!value) { + ret = -1; + goto end; + } + ret = snprintf (key, sizeof (key), "%s.data", + key_prefix); + if (ret < 0) { + goto end; + } + + ret = dict_set_dynstr (rsp_dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save data percent "); + goto end; + } + } + token = strtok (NULL, ":"); + if (token != NULL) { + value = gf_strdup (token); + if (!value) { + ret = -1; + goto end; + } + ret = snprintf (key, sizeof (key), "%s.lvsize", + key_prefix); + if (ret < 0) { + goto end; + } + + ret = dict_set_dynstr (rsp_dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save meta data percent "); + goto end; + } + } + + } while (ptr != NULL); + + ret = 0; + +end: + runner_end (&runner); + +out: + if (ret && value) { + GF_FREE (value); + } + + return ret; +} + +int +glusterd_get_single_brick_status (char **op_errstr, dict_t *rsp_dict, + char *keyprefix, int index, + glusterd_volinfo_t *snap_volinfo, + glusterd_brickinfo_t *brickinfo) +{ + int ret = -1; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + char key[PATH_MAX] = ""; + char *device = NULL; + char *value = NULL; + char brick_path[PATH_MAX] = ""; + char pidfile[PATH_MAX] = ""; + pid_t pid = -1; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + GF_ASSERT (keyprefix); + GF_ASSERT (snap_volinfo); + GF_ASSERT (brickinfo); + + ret = snprintf (key, sizeof (key), "%s.brick%d.path", keyprefix, + index); + if (ret < 0) { + goto out; + } + + ret = snprintf (brick_path, sizeof (brick_path), + "%s:%s", brickinfo->hostname, brickinfo->path); + if (ret < 0) { + goto out; + } + + value = gf_strdup (brick_path); + if (!value) { + ret = -1; + goto out; + } + + ret = dict_set_dynstr (rsp_dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to store " + "brick_path %s", brickinfo->path); + goto out; + } + + if (brickinfo->snap_status == -1) { + /* Setting vgname as "Pending Snapshot" */ + value = gf_strdup ("Pending Snapshot"); + if (!value) { + ret = -1; + goto out; + } + + snprintf (key, sizeof (key), "%s.brick%d.vgname", + keyprefix, index); + ret = dict_set_dynstr (rsp_dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save vgname "); + goto out; + } + + ret = 0; + goto out; + } + value = NULL; + + ret = snprintf (key, sizeof (key), "%s.brick%d.status", + keyprefix, index); + if (ret < 0) { + goto out; + } + + if (brickinfo->status == GF_BRICK_STOPPED) { + value = gf_strdup ("No"); + if (!value) { + ret = -1; + goto out; + } + ret = dict_set_str (rsp_dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save brick status"); + goto out; + } + value = NULL; + } else { + value = gf_strdup ("Yes"); + if (!value) { + ret = -1; + goto out; + } + ret = dict_set_str (rsp_dict, key, value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save brick status"); + goto out; + } + value = NULL; + + GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_volinfo, + brickinfo, priv); + ret = glusterd_is_service_running (pidfile, &pid); + + ret = snprintf (key, sizeof (key), "%s.brick%d.pid", + keyprefix, index); + if (ret < 0) { + goto out; + } + + ret = dict_set_int32 (rsp_dict, key, pid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save pid %d", pid); + goto out; + } + } + + ret = snprintf (key, sizeof (key), "%s.brick%d", + keyprefix, index); + if (ret < 0) { + goto out; + } + + ret = glusterd_get_brick_lvm_details (rsp_dict, brickinfo, + snap_volinfo->volname, + device, key); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "brick LVM details"); + goto out; + } +out: + if (ret && value) { + GF_FREE (value); + } + + return ret; +} + +int +glusterd_get_single_snap_status (char **op_errstr, dict_t *rsp_dict, + char *keyprefix, glusterd_snap_t *snap) +{ + int ret = -1; + xlator_t *this = NULL; + char key[PATH_MAX] = ""; + char brickkey[PATH_MAX] = ""; + glusterd_volinfo_t *snap_volinfo = NULL; + glusterd_volinfo_t *tmp_volinfo = NULL; + glusterd_brickinfo_t *brickinfo = NULL; + int volcount = 0; + int brickcount = 0; + + this = THIS; + GF_ASSERT (this); + + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + GF_ASSERT (keyprefix); + GF_ASSERT (snap); + + list_for_each_entry_safe (snap_volinfo, tmp_volinfo, &snap->volumes, + vol_list) { + ret = snprintf (key, sizeof (key), "%s.vol%d", keyprefix, + volcount); + if (ret < 0) { + goto out; + } + list_for_each_entry (brickinfo, &snap_volinfo->bricks, + brick_list) { + if (!glusterd_is_local_brick (this, snap_volinfo, + brickinfo)) { + brickcount++; + continue; + } + + ret = glusterd_get_single_brick_status (op_errstr, + rsp_dict, key, brickcount, + snap_volinfo, brickinfo); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Getting " + "single snap status failed"); + goto out; + } + brickcount++; + } + ret = snprintf (brickkey, sizeof (brickkey), "%s.brickcount", + key); + if (ret < 0) { + goto out; + } + + ret = dict_set_int32 (rsp_dict, brickkey, brickcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save brick count"); + goto out; + } + volcount++; + } + + ret = snprintf (key, sizeof (key), "%s.volcount", keyprefix); + if (ret < 0) { + goto out; + } + + ret = dict_set_int32 (rsp_dict, key, volcount); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save volcount"); + goto out; + } + +out: + + return ret; +} + +int +glusterd_get_each_snap_object_status (char **op_errstr, dict_t *rsp_dict, + glusterd_snap_t *snap, char *keyprefix) +{ + int ret = -1; + char key[PATH_MAX] = ""; + char *temp = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + GF_ASSERT (snap); + GF_ASSERT (keyprefix); + + /* TODO : Get all the snap volume info present in snap object, + * as of now, There will be only one snapvolinfo per snap object + */ + ret = snprintf (key, sizeof (key), "%s.snapname", keyprefix); + if (ret < 0) { + goto out; + } + + temp = gf_strdup (snap->snapname); + if (temp == NULL) { + ret = -1; + goto out; + } + ret = dict_set_dynstr (rsp_dict, key, temp); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not save " + "snap name"); + goto out; + } + + temp = NULL; + + ret = snprintf (key, sizeof (key), "%s.uuid", keyprefix); + if (ret < 0) { + goto out; + } + + temp = gf_strdup (uuid_utoa (snap->snap_id)); + if (temp == NULL) { + ret = -1; + goto out; + } + + ret = dict_set_dynstr (rsp_dict, key, temp); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not save " + "snap UUID"); + goto out; + } + + temp = NULL; + + ret = glusterd_get_single_snap_status (op_errstr, rsp_dict, keyprefix, + snap); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not get single snap status"); + goto out; + } + + ret = snprintf (key, sizeof (key), "%s.volcount", keyprefix); + if (ret < 0) { + goto out; + } + + ret = dict_set_int32 (rsp_dict, key, 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not save volcount"); + goto out; + } +out: + if (ret && temp) + GF_FREE (temp); + + return ret; +} + +int +glusterd_get_snap_status_of_volume (char **op_errstr, dict_t *rsp_dict, + char *volname, char *keyprefix) { + int ret = -1; + glusterd_volinfo_t *snap_volinfo = NULL; + glusterd_volinfo_t *temp_volinfo = NULL; + glusterd_volinfo_t *volinfo = NULL; + char key[PATH_MAX] = ""; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + int i = 0; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + + GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); + GF_ASSERT (volname); + GF_ASSERT (keyprefix); + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get volinfo of " + "volume %s", volname); + goto out; + } + + list_for_each_entry_safe (snap_volinfo, temp_volinfo, + &volinfo->snap_volumes, snapvol_list) { + ret = snprintf (key, sizeof (key), "status.snap%d", i); + if (ret < 0) { + goto out; + } + + ret = glusterd_get_each_snap_object_status (op_errstr, + rsp_dict, snap_volinfo->snapshot, key); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Function : " + "glusterd_get_single_snap_status failed"); + goto out; + } + i++; + } + + ret = dict_set_int32 (rsp_dict, "status.snapcount", i); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to save snapcount"); + ret = -1; + goto out; + } +out: + return ret; +} + +int +glusterd_get_all_snapshot_status (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + int32_t i = 0; + int ret = -1; + char key[PATH_MAX] = ""; + glusterd_conf_t *priv = NULL; + glusterd_snap_t *snap = NULL; + glusterd_snap_t *tmp_snap = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + + list_for_each_entry_safe (snap, tmp_snap, + &priv->snapshots, snap_list) { + ret = snprintf (key, sizeof (key), "status.snap%d", i); + if (ret < 0) { + goto out; + } + + ret = glusterd_get_each_snap_object_status (op_errstr, + rsp_dict, snap, key); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not get " + "the details of a snap object: %s", + snap->snapname); + goto out; + } + i++; + } + + ret = dict_set_int32 (rsp_dict, "status.snapcount", i); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not save snapcount"); + goto out; + } + + ret = 0; +out: + return ret; +} + + +int +glusterd_snapshot_status_commit (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + xlator_t *this = NULL; + int ret = -1; + glusterd_conf_t *conf = NULL; + char *get_buffer = NULL; + int32_t cmd = -1; + char *snapname = NULL; + glusterd_snap_t *snap = NULL; + char *volname = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (op_errstr); + + conf = this->private; + + GF_ASSERT (conf); + ret = dict_get_int32 (dict, "cmd", &cmd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to get status cmd type"); + goto out; + } + + ret = dict_set_int32 (rsp_dict, "cmd", cmd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not save status cmd in rsp dictionary"); + goto out; + } + switch (cmd) { + case GF_SNAP_STATUS_TYPE_ALL: + { + ret = glusterd_get_all_snapshot_status (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to " + "get snapshot status"); + goto out; + } + break; + } + case GF_SNAP_STATUS_TYPE_SNAP: + { + + ret = dict_get_str (dict, "snapname", &snapname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to " + "get snap name"); + goto out; + } + + snap = glusterd_find_snap_by_name (snapname); + if (!snap) { + ret = gf_asprintf (op_errstr, "Snap (%s) " + "not found", snapname); + if (ret < 0) { + goto out; + } + ret = -1; + gf_log (this->name, GF_LOG_ERROR, "Unable to " + "get snap volinfo"); + goto out; + } + ret = glusterd_get_each_snap_object_status (op_errstr, + rsp_dict, snap, "status.snap0"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to " + "get status of snap %s", get_buffer); + goto out; + } + break; + } + case GF_SNAP_STATUS_TYPE_VOL: + { + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to" + " get volume name"); + goto out; + } + + ret = glusterd_get_snap_status_of_volume (op_errstr, + rsp_dict, volname, "status.vol0"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Function :" + " glusterd_get_snap_status_of_volume " + "failed"); + goto out; + } + } + } + ret = 0; +out: + return ret; +} + +int32_t +glusterd_snapshot_create_postvalidate (dict_t *dict, int32_t op_ret, + char **op_errstr, dict_t *rsp_dict) +{ + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + int ret = -1; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + priv = this->private; + GF_ASSERT (priv); + + if (op_ret) { + ret = glusterd_do_snap_cleanup (dict, op_errstr, rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "cleanup operation " + "failed"); + goto out; + } + } else { + ret = glusterd_snapshot_update_snaps_post_validate (dict, + op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "create snapshot"); + goto out; + } + } + + ret = 0; +out: + return ret; +} + +int32_t +glusterd_snapshot (dict_t *dict, char **op_errstr, dict_t *rsp_dict) +{ + + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + int32_t snap_command = 0; + int ret = -1; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + priv = this->private; + GF_ASSERT (priv); + + ret = dict_get_int32 (dict, "type", &snap_command); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "unable to get the type of " + "the snapshot command"); + goto out; + } + + switch (snap_command) { + case (GF_SNAP_OPTION_TYPE_CREATE): + ret = glusterd_snapshot_create_commit (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "create snapshot"); + goto out; + } + break; + + case GF_SNAP_OPTION_TYPE_CONFIG: + ret = glusterd_snapshot_config_commit (dict, op_errstr, + rsp_dict); + break; + + case GF_SNAP_OPTION_TYPE_DELETE: + ret = glusterd_snapshot_remove_commit (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "delete snapshot"); + goto out; + } + break; + + case GF_SNAP_OPTION_TYPE_RESTORE: + ret = glusterd_snapshot_restore (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Failed to " + "restore snapshot"); + goto out; + } + + break; + + case GF_SNAP_OPTION_TYPE_STATUS: + ret = glusterd_snapshot_status_commit (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "show snapshot status"); + goto out; + } + break; + + + default: + gf_log (this->name, GF_LOG_WARNING, "invalid snap command"); + goto out; + break; + } + + ret = 0; + +out: + return ret; +} + +int +glusterd_snapshot_brickop (dict_t *dict, char **op_errstr, dict_t *rsp_dict) +{ + int ret = -1; + int64_t vol_count = 0; + int64_t count = 1; + char key[1024] = {0,}; + char *volname = NULL; + int32_t snap_command = 0; + xlator_t *this = NULL; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + ret = dict_get_int32 (dict, "type", &snap_command); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "unable to get the type of " + "the snapshot command"); + goto out; + } + + switch (snap_command) { + case GF_SNAP_OPTION_TYPE_CREATE: + ret = dict_get_int64 (dict, "volcount", &vol_count); + if (ret) + goto out; + while (count <= vol_count) { + snprintf (key, 1024, "volname%"PRId64, count); + ret = dict_get_str (dict, key, &volname); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to get volname"); + goto out; + } + ret = dict_set_str (dict, "volname", volname); + if (ret) + goto out; + + ret = gd_brick_op_phase (GD_OP_SNAP, NULL, dict, + op_errstr); + if (ret) + goto out; + volname = NULL; + count++; + } + + dict_del (dict, "volname"); + ret = 0; + break; + case GF_SNAP_OPTION_TYPE_DELETE: + break; + default: + break; + } + +out: + return ret; +} + +int +glusterd_snapshot_prevalidate (dict_t *dict, char **op_errstr, + dict_t *rsp_dict) +{ + int snap_command = 0; + xlator_t *this = NULL; + int ret = -1; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + ret = dict_get_int32 (dict, "type", &snap_command); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "unable to get the type of " + "the snapshot command"); + goto out; + } + + switch (snap_command) { + case (GF_SNAP_OPTION_TYPE_CREATE): + ret = glusterd_snapshot_create_prevalidate (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot create " + "pre-validation failed"); + goto out; + } + break; + + case (GF_SNAP_OPTION_TYPE_CONFIG): + ret = glusterd_snapshot_config_prevalidate (dict, op_errstr); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot config " + "pre-validation failed"); + goto out; + } + break; + + case GF_SNAP_OPTION_TYPE_RESTORE: + ret = glusterd_snapshot_restore_prevalidate (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot restore " + "validation failed"); + goto out; + } + break; + case GF_SNAP_OPTION_TYPE_DELETE: + ret = glusterd_snapshot_remove_prevalidate (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot remove " + "validation failed"); + goto out; + } + break; + + case GF_SNAP_OPTION_TYPE_STATUS: + ret = glusterd_snapshot_status_prevalidate (dict, op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot status " + "validation failed"); + goto out; + } + break; + + default: + gf_log (this->name, GF_LOG_WARNING, "invalid snap command"); + goto out; + } + + ret = 0; +out: + return ret; +} + +int +glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr, + dict_t *rsp_dict) +{ + int snap_command = 0; + xlator_t *this = NULL; + int ret = -1; + + this = THIS; + + GF_ASSERT (this); + GF_ASSERT (dict); + GF_ASSERT (rsp_dict); + + ret = dict_get_int32 (dict, "type", &snap_command); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "unable to get the type of " + "the snapshot command"); + goto out; + } + + switch (snap_command) { + case GF_SNAP_OPTION_TYPE_CREATE: + ret = glusterd_snapshot_create_postvalidate (dict, op_ret, + op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot create " + "post-validation failed"); + goto out; + } + break; + + case GF_SNAP_OPTION_TYPE_DELETE: + case GF_SNAP_OPTION_TYPE_RESTORE: + ret = glusterd_snapshot_update_snaps_post_validate (dict, + op_errstr, + rsp_dict); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "update missed snaps list"); + goto out; + } + break; + + default: + gf_log (this->name, GF_LOG_WARNING, "invalid snap command"); + goto out; + } + + ret = 0; +out: + return ret; +} + +int +glusterd_handle_snapshot_fn (rpcsvc_request_t *req) +{ + int32_t ret = 0; + dict_t *dict = NULL; + gf_cli_req cli_req = {{0},}; + glusterd_op_t cli_op = GD_OP_SNAP; + int type = 0; + glusterd_conf_t *conf = NULL; + char *host_uuid = NULL; + char err_str[2048] = {0,}; + xlator_t *this = NULL; + char *volname = NULL; + + GF_ASSERT (req); + + this = THIS; + GF_ASSERT (this); + conf = this->private; + GF_ASSERT (conf); + + ret = xdr_to_generic (req->msg[0], &cli_req, + (xdrproc_t)xdr_gf_cli_req); + if (ret < 0) { + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + if (cli_req.dict.dict_len > 0) { + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_unserialize (cli_req.dict.dict_val, + cli_req.dict.dict_len, + &dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "unserialize req-buffer to dictionary"); + snprintf (err_str, sizeof (err_str), "Unable to decode " + "the command"); + goto out; + } + + dict->extra_stdfree = cli_req.dict.dict_val; + + host_uuid = gf_strdup (uuid_utoa(MY_UUID)); + if (host_uuid == NULL) { + snprintf (err_str, sizeof (err_str), "Failed to get " + "the uuid of local glusterd"); + ret = -1; + goto out; + } + ret = dict_set_dynstr (dict, "host-uuid", host_uuid); + if (ret) { + GF_FREE (host_uuid); + goto out; + } + + + } else { + gf_log (this->name, GF_LOG_ERROR, "request dict length is %d", + cli_req.dict.dict_len); + goto out; + } + + ret = dict_get_int32 (dict, "type", &type); + if (ret < 0) { + snprintf (err_str, sizeof (err_str), "Command type not found"); + gf_log (this->name, GF_LOG_ERROR, "%s", err_str); + goto out; + } + + switch (type) { + case GF_SNAP_OPTION_TYPE_CREATE: + ret = glusterd_handle_snapshot_create (req, cli_op, dict, + err_str, sizeof (err_str)); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot create " + "failed: %s", err_str); + } + break; + case GF_SNAP_OPTION_TYPE_RESTORE: + ret = glusterd_handle_snapshot_restore (req, cli_op, dict, + err_str, sizeof (err_str)); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot restore " + "failed: %s", err_str); + } + + break; + case GF_SNAP_OPTION_TYPE_INFO: + ret = glusterd_handle_snapshot_info (req, cli_op, dict, + err_str, sizeof (err_str)); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot info " + "failed"); + } + break; + case GF_SNAP_OPTION_TYPE_LIST: + ret = glusterd_handle_snapshot_list (req, cli_op, dict, + err_str, sizeof (err_str)); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot list " + "failed"); + } + break; + case GF_SNAP_OPTION_TYPE_CONFIG: + /* TODO : Type of lock to be taken when we are setting + * limits system wide + */ + ret = dict_get_str (dict, "volname", &volname); + if (!volname) { + ret = dict_set_int32 (dict, "hold_vol_locks", + _gf_false); + if (ret) { + gf_log ("cli", GF_LOG_ERROR, + "Unable to set hold_vol_locks value " + "as _gf_false"); + goto out; + } + + } + ret = glusterd_mgmt_v3_initiate_all_phases (req, cli_op, dict); + break; + case GF_SNAP_OPTION_TYPE_DELETE: + ret = glusterd_handle_snapshot_remove (req, cli_op, dict, + err_str, + sizeof (err_str)); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot delete " + "failed: %s", err_str); + } + break; + case GF_SNAP_OPTION_TYPE_START: + case GF_SNAP_OPTION_TYPE_STOP: + case GF_SNAP_OPTION_TYPE_STATUS: + ret = glusterd_handle_snapshot_status (req, cli_op, dict, + err_str, + sizeof (err_str)); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Snapshot status " + "failed: %s", err_str); + } + break; + default: + gf_log (this->name, GF_LOG_ERROR, "Unkown snapshot request " + "type (%d)", type); + ret = -1; /* Failure */ + } + +out: + if (ret) { + if (err_str[0] == '\0') + snprintf (err_str, sizeof (err_str), + "Operation failed"); + ret = glusterd_op_send_cli_response (cli_op, ret, 0, req, + dict, err_str); + } + + return ret; +} + +int +glusterd_handle_snapshot (rpcsvc_request_t *req) +{ + return glusterd_big_locked_handler (req, glusterd_handle_snapshot_fn); +} + +static inline void +glusterd_free_snap_op (glusterd_snap_op_t *snap_op) +{ + if (snap_op) { + if (snap_op->brick_path) + GF_FREE (snap_op->brick_path); + + GF_FREE (snap_op); + } +} + +/* Look for duplicates and accordingly update the list */ +int32_t +glusterd_update_missed_snap_entry (glusterd_missed_snap_info *missed_snapinfo, + glusterd_snap_op_t *missed_snap_op) +{ + int32_t ret = -1; + glusterd_snap_op_t *snap_opinfo = NULL; + gf_boolean_t match = _gf_false; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(missed_snapinfo); + GF_ASSERT(missed_snap_op); + + list_for_each_entry (snap_opinfo, &missed_snapinfo->snap_ops, + snap_ops_list) { + if ((!strcmp (snap_opinfo->brick_path, + missed_snap_op->brick_path)) && + (snap_opinfo->op == missed_snap_op->op)) { + /* If two entries have conflicting status + * GD_MISSED_SNAP_DONE takes precedence + */ + if ((snap_opinfo->status == GD_MISSED_SNAP_PENDING) && + (missed_snap_op->status == GD_MISSED_SNAP_DONE)) { + snap_opinfo->status = GD_MISSED_SNAP_DONE; + gf_log (this->name, GF_LOG_INFO, + "Updating missed snap status " + "for %s:%d:%s:%d as DONE", + missed_snapinfo->node_snap_info, + snap_opinfo->brick_num, + snap_opinfo->brick_path, + snap_opinfo->op); + ret = 0; + glusterd_free_snap_op (missed_snap_op); + goto out; + } + match = _gf_true; + break; + } else if ((snap_opinfo->brick_num == + missed_snap_op->brick_num) && + (snap_opinfo->op == GF_SNAP_OPTION_TYPE_CREATE) && + (missed_snap_op->op == + GF_SNAP_OPTION_TYPE_DELETE)) { + /* Optimizing create and delete entries for the same + * brick and same node + */ + gf_log (this->name, GF_LOG_INFO, + "Updating missed snap status " + "for %s:%d:%s:%d as DONE", + missed_snapinfo->node_snap_info, + snap_opinfo->brick_num, + snap_opinfo->brick_path, + snap_opinfo->op); + snap_opinfo->status = GD_MISSED_SNAP_DONE; + ret = 0; + glusterd_free_snap_op (missed_snap_op); + goto out; + } + } + + if (match == _gf_true) { + gf_log (this->name, GF_LOG_INFO, + "Duplicate entry. Not updating"); + glusterd_free_snap_op (missed_snap_op); + } else { + list_add_tail (&missed_snap_op->snap_ops_list, + &missed_snapinfo->snap_ops); + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* Add new missed snap entry to the missed_snaps list. */ +int32_t +glusterd_store_missed_snaps_list (char *missed_info, int32_t brick_num, + char *brick_path, int32_t snap_op, + int32_t snap_status) +{ + int32_t ret = -1; + glusterd_missed_snap_info *missed_snapinfo = NULL; + glusterd_snap_op_t *missed_snap_op = NULL; + glusterd_conf_t *priv = NULL; + gf_boolean_t match = _gf_false; + gf_boolean_t free_missed_snap_info = _gf_false; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(missed_info); + GF_ASSERT(brick_path); + + priv = this->private; + GF_ASSERT (priv); + + /* Create the snap_op object consisting of the * + * snap id and the op */ + ret = glusterd_missed_snap_op_new (&missed_snap_op); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create new missed snap object."); + ret = -1; + goto out; + } + + missed_snap_op->brick_path = gf_strdup(brick_path); + if (!missed_snap_op->brick_path) { + ret = -1; + goto out; + } + missed_snap_op->brick_num = brick_num; + missed_snap_op->op = snap_op; + missed_snap_op->status = snap_status; + + /* Look for other entries for the same node and same snap */ + list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list, + missed_snaps) { + if (!strcmp (missed_snapinfo->node_snap_info, + missed_info)) { + /* Found missed snapshot info for * + * the same node and same snap */ + match = _gf_true; + break; + } + } + + if (match == _gf_false) { + /* First snap op missed for the brick */ + ret = glusterd_missed_snapinfo_new (&missed_snapinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create missed snapinfo"); + goto out; + } + free_missed_snap_info = _gf_true; + missed_snapinfo->node_snap_info = gf_strdup(missed_info); + if (!missed_snapinfo->node_snap_info) { + ret = -1; + goto out; + } + + list_add_tail (&missed_snap_op->snap_ops_list, + &missed_snapinfo->snap_ops); + list_add_tail (&missed_snapinfo->missed_snaps, + &priv->missed_snaps_list); + + ret = 0; + goto out; + } else { + ret = glusterd_update_missed_snap_entry (missed_snapinfo, + missed_snap_op); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to update existing missed snap entry."); + goto out; + } + } + +out: + if (ret) { + glusterd_free_snap_op (missed_snap_op); + + if (missed_snapinfo && + (free_missed_snap_info == _gf_true)) { + if (missed_snapinfo->node_snap_info) + GF_FREE (missed_snapinfo->node_snap_info); + + GF_FREE (missed_snapinfo); + } + } + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* Add missing snap entries to the in-memory conf->missed_snap_list */ +int32_t +glusterd_add_missed_snaps_to_list (dict_t *dict, int32_t missed_snap_count) +{ + char *buf = NULL; + char *tmp = NULL; + char *save_ptr = NULL; + char *nodeid = NULL; + char *snap_uuid = NULL; + char *brick_path = NULL; + char missed_info[PATH_MAX] = ""; + char name_buf[PATH_MAX] = ""; + int32_t i = -1; + int32_t ret = -1; + int32_t brick_num = -1; + int32_t snap_op = -1; + int32_t snap_status = -1; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(dict); + + priv = this->private; + GF_ASSERT (priv); + + /* We can update the missed_snaps_list without acquiring * + * any additional locks as big lock will be held. */ + for (i = 0; i < missed_snap_count; i++) { + snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d", + i); + ret = dict_get_str (dict, name_buf, &buf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to fetch %s", name_buf); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, "missed_snap_entry = %s", + buf); + + /* Need to make a duplicate string coz the same dictionary * + * is resent to the non-originator nodes */ + tmp = gf_strdup (buf); + if (!tmp) { + ret = -1; + goto out; + } + + /* Fetch the node-id, snap-id, brick_num, + * brick_path, snap_op and snap status + */ + nodeid = strtok_r (tmp, ":", &save_ptr); + snap_uuid = strtok_r (NULL, "=", &save_ptr); + brick_num = atoi(strtok_r (NULL, ":", &save_ptr)); + brick_path = strtok_r (NULL, ":", &save_ptr); + snap_op = atoi(strtok_r (NULL, ":", &save_ptr)); + snap_status = atoi(strtok_r (NULL, ":", &save_ptr)); + + if (!nodeid || !snap_uuid || !brick_path || + brick_num < 1 || snap_op < 1 || + snap_status < 1) { + gf_log (this->name, GF_LOG_ERROR, + "Invalid missed_snap_entry"); + ret = -1; + goto out; + } + + snprintf (missed_info, sizeof(missed_info), "%s:%s", + nodeid, snap_uuid); + + ret = glusterd_store_missed_snaps_list (missed_info, + brick_num, + brick_path, + snap_op, + snap_status); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to store missed snaps_list"); + goto out; + } + + GF_FREE (tmp); + tmp = NULL; + } + +out: + if (tmp) + GF_FREE (tmp); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index 9cfed8db9..1c2ec58e8 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2007-2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -35,6 +35,7 @@ #include "glusterd-op-sm.h" #include "glusterd-utils.h" #include "glusterd-hooks.h" +#include "store.h" #include "glusterd-store.h" #include "rpc-clnt.h" @@ -44,151 +45,8 @@ #include <inttypes.h> #include <dirent.h> -static int32_t -glusterd_store_mkdir (char *path) -{ - int32_t ret = -1; - - ret = mkdir (path, 0777); - - if ((-1 == ret) && (EEXIST != errno)) { - gf_log (THIS->name, GF_LOG_ERROR, "mkdir() failed on path %s," - "errno: %s", path, strerror (errno)); - } else { - ret = 0; - } - - return ret; -} - -int32_t -glusterd_store_handle_create_on_absence (glusterd_store_handle_t **shandle, - char *path) -{ - GF_ASSERT (shandle); - int32_t ret = 0; - - if (*shandle == NULL) { - ret = glusterd_store_handle_new (path, shandle); - - if (ret) { - gf_log (THIS->name, GF_LOG_ERROR, "Unable to create " - "store handle for path: %s", path); - } - } - return ret; -} - -int32_t -glusterd_store_mkstemp (glusterd_store_handle_t *shandle) -{ - int fd = -1; - char tmppath[PATH_MAX] = {0,}; - - GF_ASSERT (shandle); - GF_ASSERT (shandle->path); - - snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path); - fd = open (tmppath, O_RDWR | O_CREAT | O_TRUNC | O_SYNC, 0600); - if (fd <= 0) { - gf_log (THIS->name, GF_LOG_ERROR, "Failed to open %s, " - "error: %s", tmppath, strerror (errno)); - } - - return fd; -} - -int -glusterd_store_sync_direntry (char *path) -{ - int ret = -1; - int dirfd = -1; - char *dir = NULL; - char *pdir = NULL; - xlator_t *this = NULL; - - this = THIS; - - dir = gf_strdup (path); - if (!dir) - goto out; - - pdir = dirname (dir); - dirfd = open (pdir, O_RDONLY); - if (dirfd == -1) { - gf_log (this->name, GF_LOG_ERROR, "Failed to open directory " - "%s, due to %s", pdir, strerror (errno)); - goto out; - } - - ret = fsync (dirfd); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to fsync %s, due to " - "%s", pdir, strerror (errno)); - goto out; - } - - ret = 0; -out: - if (dirfd >= 0) { - ret = close (dirfd); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to close " - "%s, due to %s", pdir, strerror (errno)); - } - } - - if (dir) - GF_FREE (dir); - - return ret; -} - -int32_t -glusterd_store_rename_tmppath (glusterd_store_handle_t *shandle) -{ - int32_t ret = -1; - char tmppath[PATH_MAX] = {0,}; - - GF_ASSERT (shandle); - GF_ASSERT (shandle->path); - - snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path); - ret = rename (tmppath, shandle->path); - if (ret) { - gf_log (THIS->name, GF_LOG_ERROR, "Failed to rename %s to %s, " - "error: %s", tmppath, shandle->path, strerror (errno)); - goto out; - } - - ret = glusterd_store_sync_direntry (tmppath); -out: - return ret; -} - -int32_t -glusterd_store_unlink_tmppath (glusterd_store_handle_t *shandle) -{ - int32_t ret = -1; - char tmppath[PATH_MAX] = {0,}; - - GF_ASSERT (shandle); - GF_ASSERT (shandle->path); - - snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path); - ret = unlink (tmppath); - if (ret && (errno != ENOENT)) { - gf_log (THIS->name, GF_LOG_ERROR, "Failed to mv %s to %s, " - "error: %s", tmppath, shandle->path, strerror (errno)); - } else { - ret = 0; - } - - return ret; -} - -static void -glusterd_replace_slash_with_hipen (char *str) +void +glusterd_replace_slash_with_hyphen (char *str) { char *ptr = NULL; @@ -213,7 +71,7 @@ glusterd_store_create_brick_dir (glusterd_volinfo_t *volinfo) GF_ASSERT (priv); GLUSTERD_GET_BRICK_DIR (brickdirpath, volinfo, priv); - ret = glusterd_store_mkdir (brickdirpath); + ret = gf_store_mkdir (brickdirpath); return ret; } @@ -227,7 +85,7 @@ glusterd_store_key_vol_brick_set (glusterd_brickinfo_t *brickinfo, GF_ASSERT (len >= PATH_MAX); snprintf (key_vol_brick, len, "%s", brickinfo->path); - glusterd_replace_slash_with_hipen (key_vol_brick); + glusterd_replace_slash_with_hyphen (key_vol_brick); } static void @@ -326,7 +184,7 @@ glusterd_store_volinfo_brick_fname_write (int vol_fd, brick_count); glusterd_store_brickinfofname_set (brickinfo, brickfname, sizeof (brickfname)); - ret = glusterd_store_save_value (vol_fd, key, brickfname); + ret = gf_store_save_value (vol_fd, key, brickfname); if (ret) goto out; @@ -345,9 +203,9 @@ glusterd_store_create_brick_shandle_on_absence (glusterd_volinfo_t *volinfo, GF_ASSERT (brickinfo); glusterd_store_brickinfopath_set (volinfo, brickinfo, brickpath, - sizeof (brickpath)); - ret = glusterd_store_handle_create_on_absence (&brickinfo->shandle, - brickpath); + sizeof (brickpath)); + ret = gf_store_handle_create_on_absence (&brickinfo->shandle, + brickpath); return ret; } @@ -360,30 +218,48 @@ glusterd_store_brickinfo_write (int fd, glusterd_brickinfo_t *brickinfo) GF_ASSERT (brickinfo); GF_ASSERT (fd > 0); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_HOSTNAME, - brickinfo->hostname); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_HOSTNAME, + brickinfo->hostname); if (ret) goto out; - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PATH, - brickinfo->path); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PATH, + brickinfo->path); if (ret) goto out; snprintf (value, sizeof(value), "%d", brickinfo->port); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PORT, - value); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PORT, value); snprintf (value, sizeof(value), "%d", brickinfo->rdma_port); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT, - value); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT, + value); snprintf (value, sizeof(value), "%d", brickinfo->decommissioned); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED, - value); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED, + value); if (ret) goto out; + if (strlen(brickinfo->device_path) > 0) { + snprintf (value, sizeof(value), "%s", brickinfo->device_path); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH, + value); + if (ret) + goto out; + } + + snprintf (value, sizeof(value), "%d", brickinfo->snap_status); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS, + value); + if (ret) + goto out; + + if (!brickinfo->vg[0]) + goto out; + + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_VGNAME, + brickinfo->vg); out: gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); return ret; @@ -396,7 +272,7 @@ glusterd_store_perform_brick_store (glusterd_brickinfo_t *brickinfo) int32_t ret = -1; GF_ASSERT (brickinfo); - fd = glusterd_store_mkstemp (brickinfo->shandle); + fd = gf_store_mkstemp (brickinfo->shandle); if (fd <= 0) { ret = -1; goto out; @@ -408,7 +284,7 @@ glusterd_store_perform_brick_store (glusterd_brickinfo_t *brickinfo) out: if (ret && (fd > 0)) - glusterd_store_unlink_tmppath (brickinfo->shandle); + gf_store_unlink_tmppath (brickinfo->shandle); if (fd > 0) close (fd); gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -440,6 +316,7 @@ glusterd_store_brickinfo (glusterd_volinfo_t *volinfo, goto out; ret = glusterd_store_perform_brick_store (brickinfo); + out: gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret); return ret; @@ -490,7 +367,7 @@ glusterd_store_delete_brick (glusterd_brickinfo_t *brickinfo, char *delete_path) out: if (brickinfo->shandle) { - glusterd_store_handle_destroy (brickinfo->shandle); + gf_store_handle_destroy (brickinfo->shandle); brickinfo->shandle = NULL; } gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret); @@ -554,13 +431,13 @@ static int _storeslaves (dict_t *this, char *key, data_t *value, void *data) { int32_t ret = 0; - glusterd_store_handle_t *shandle = NULL; - xlator_t *xl = NULL; + gf_store_handle_t *shandle = NULL; + xlator_t *xl = NULL; xl = THIS; GF_ASSERT (xl); - shandle = (glusterd_store_handle_t*)data; + shandle = (gf_store_handle_t*)data; GF_ASSERT (shandle); GF_ASSERT (shandle->fd > 0); @@ -579,7 +456,7 @@ _storeslaves (dict_t *this, char *key, data_t *value, void *data) gf_log (xl->name, GF_LOG_DEBUG, "Storing in volinfo:key= %s, val=%s", key, value->data); - ret = glusterd_store_save_value (shandle->fd, key, (char*)value->data); + ret = gf_store_save_value (shandle->fd, key, (char*)value->data); if (ret) { gf_log (xl->name, GF_LOG_ERROR, "Unable to write into store" " handle for path: %s", shandle->path); @@ -593,13 +470,13 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data) { int32_t ret = 0; int32_t exists = 0; - glusterd_store_handle_t *shandle = NULL; - xlator_t *xl = NULL; + gf_store_handle_t *shandle = NULL; + xlator_t *xl = NULL; xl = THIS; GF_ASSERT (xl); - shandle = (glusterd_store_handle_t*)data; + shandle = (gf_store_handle_t*)data; GF_ASSERT (shandle); GF_ASSERT (shandle->fd > 0); @@ -632,7 +509,7 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data) return 0; } - ret = glusterd_store_save_value (shandle->fd, key, (char*)value->data); + ret = gf_store_save_value (shandle->fd, key, (char*)value->data); if (ret) { gf_log (xl->name, GF_LOG_ERROR, "Unable to write into store" " handle for path: %s", shandle->path); @@ -644,90 +521,118 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data) int32_t glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo) { - char *str = NULL; + char *str = NULL; + char buf[PATH_MAX] = {0,}; + int32_t ret = -1; GF_ASSERT (fd > 0); GF_ASSERT (volinfo); - char buf[PATH_MAX] = {0,}; - int32_t ret = -1; - snprintf (buf, sizeof (buf), "%d", volinfo->type); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TYPE, buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TYPE, buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", volinfo->brick_count); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_COUNT, buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_COUNT, buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", volinfo->status); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STATUS, buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STATUS, buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", volinfo->sub_count); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_SUB_COUNT, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_SUB_COUNT, buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", volinfo->stripe_count); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STRIPE_CNT, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STRIPE_CNT, buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", volinfo->replica_count); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT, + buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", volinfo->version); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION, buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", volinfo->transport_type); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TRANSPORT, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TRANSPORT, buf); if (ret) goto out; - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_ID, - uuid_utoa (volinfo->volume_id)); + snprintf (buf, sizeof (buf), "%s", volinfo->parent_volname); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PARENT_VOLNAME, buf); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to store " + GLUSTERD_STORE_KEY_PARENT_VOLNAME); + goto out; + } + + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_ID, + uuid_utoa (volinfo->volume_id)); if (ret) goto out; str = glusterd_auth_get_username (volinfo); if (str) { - ret = glusterd_store_save_value (fd, - GLUSTERD_STORE_KEY_USERNAME, - str); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_USERNAME, + str); if (ret) goto out; } str = glusterd_auth_get_password (volinfo); if (str) { - ret = glusterd_store_save_value (fd, - GLUSTERD_STORE_KEY_PASSWORD, - str); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PASSWORD, + str); if (ret) goto out; } - if (volinfo->backend == GD_VOL_BK_BD) { - snprintf (buf, sizeof (buf), "%d", volinfo->backend); - ret = glusterd_store_save_value (fd, - GLUSTERD_STORE_KEY_VOL_BACKEND, buf); + snprintf (buf, sizeof (buf), "%d", volinfo->op_version); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_OP_VERSION, buf); + if (ret) + goto out; + + snprintf (buf, sizeof (buf), "%d", volinfo->client_op_version); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION, + buf); + if (ret) + goto out; + if (volinfo->caps) { + snprintf (buf, sizeof (buf), "%d", volinfo->caps); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_CAPS, + buf); if (ret) goto out; } + snprintf (buf, sizeof (buf), "%d", volinfo->is_volume_restored); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_IS_RESTORED, buf); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to write is_volume_restored"); + goto out; + } + + snprintf (buf, sizeof (buf), "%"PRIu64, volinfo->snap_max_hard_limit); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT, + buf); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to write snap-max-hard-limit"); + goto out; + } + out: if (ret) gf_log (THIS->name, GF_LOG_ERROR, "Unable to write volume " @@ -745,8 +650,7 @@ glusterd_store_voldirpath_set (glusterd_volinfo_t *volinfo, char *voldirpath, priv = THIS->private; GF_ASSERT (priv); - snprintf (voldirpath, len, "%s/%s/%s", priv->workdir, - GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname); + GLUSTERD_GET_VOLUME_DIR (voldirpath, volinfo, priv); } static int32_t @@ -759,16 +663,38 @@ glusterd_store_create_volume_dir (glusterd_volinfo_t *volinfo) glusterd_store_voldirpath_set (volinfo, voldirpath, sizeof (voldirpath)); - ret = glusterd_store_mkdir (voldirpath); + ret = gf_store_mkdir (voldirpath); + gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret); return ret; } +static int32_t +glusterd_store_create_snap_dir (glusterd_snap_t *snap) +{ + int32_t ret = -1; + char snapdirpath[PATH_MAX] = {0,}; + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + GF_ASSERT (priv); + GF_ASSERT (snap); + + GLUSTERD_GET_SNAP_DIR (snapdirpath, snap, priv); + + ret = mkdir_p (snapdirpath, 0755, _gf_true); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snaps dir " + "%s", snapdirpath); + } + return ret; +} + int32_t glusterd_store_volinfo_write (int fd, glusterd_volinfo_t *volinfo) { int32_t ret = -1; - glusterd_store_handle_t *shandle = NULL; + gf_store_handle_t *shandle = NULL; GF_ASSERT (fd > 0); GF_ASSERT (volinfo); GF_ASSERT (volinfo->shandle); @@ -788,6 +714,49 @@ out: return ret; } +int32_t +glusterd_store_snapinfo_write (glusterd_snap_t *snap) +{ + int32_t ret = -1; + int fd = 0; + char buf[PATH_MAX] = ""; + + GF_ASSERT (snap); + + fd = gf_store_mkstemp (snap->shandle); + if (fd <= 0) + goto out; + + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_ID, + uuid_utoa (snap->snap_id)); + if (ret) + goto out; + + snprintf (buf, sizeof (buf), "%d", snap->snap_status); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_STATUS, buf); + if (ret) + goto out; + + snprintf (buf, sizeof (buf), "%d", snap->snap_restored); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_RESTORED, buf); + if (ret) + goto out; + + if (snap->description) { + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_DESC, + snap->description); + if (ret) + goto out; + } + + snprintf (buf, sizeof (buf), "%ld", snap->time_stamp); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_TIMESTAMP, buf); + +out: + gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + static void glusterd_store_rbstatepath_set (glusterd_volinfo_t *volinfo, char *rbstatepath, size_t len) @@ -832,6 +801,37 @@ glusterd_store_node_state_path_set (glusterd_volinfo_t *volinfo, GLUSTERD_NODE_STATE_FILE); } +static void +glusterd_store_missed_snaps_list_path_set (char *missed_snaps_list, + size_t len) +{ + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + GF_ASSERT (priv); + GF_ASSERT (missed_snaps_list); + GF_ASSERT (len <= PATH_MAX); + + snprintf (missed_snaps_list, len, "%s/snaps/" + GLUSTERD_MISSED_SNAPS_LIST_FILE, priv->workdir); +} + +static void +glusterd_store_snapfpath_set (glusterd_snap_t *snap, char *snap_fpath, + size_t len) +{ + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + GF_ASSERT (priv); + GF_ASSERT (snap); + GF_ASSERT (snap_fpath); + GF_ASSERT (len <= PATH_MAX); + + snprintf (snap_fpath, len, "%s/snaps/%s/%s", priv->workdir, + snap->snapname, GLUSTERD_SNAP_INFO_FILE); +} + int32_t glusterd_store_create_rbstate_shandle_on_absence (glusterd_volinfo_t *volinfo) { @@ -841,8 +841,8 @@ glusterd_store_create_rbstate_shandle_on_absence (glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo); glusterd_store_rbstatepath_set (volinfo, rbstatepath, sizeof (rbstatepath)); - ret = glusterd_store_handle_create_on_absence (&volinfo->rb_shandle, - rbstatepath); + ret = gf_store_handle_create_on_absence (&volinfo->rb_shandle, + rbstatepath); return ret; } @@ -855,8 +855,7 @@ glusterd_store_create_vol_shandle_on_absence (glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo); glusterd_store_volfpath_set (volinfo, volfpath, sizeof (volfpath)); - ret = glusterd_store_handle_create_on_absence (&volinfo->shandle, - volfpath); + ret = gf_store_handle_create_on_absence (&volinfo->shandle, volfpath); return ret; } @@ -871,12 +870,48 @@ glusterd_store_create_nodestate_sh_on_absence (glusterd_volinfo_t *volinfo) glusterd_store_node_state_path_set (volinfo, node_state_path, sizeof (node_state_path)); ret = - glusterd_store_handle_create_on_absence (&volinfo->node_state_shandle, - node_state_path); + gf_store_handle_create_on_absence (&volinfo->node_state_shandle, + node_state_path); return ret; } +static int32_t +glusterd_store_create_missed_snaps_list_shandle_on_absence () +{ + char missed_snaps_list[PATH_MAX] = ""; + int32_t ret = -1; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + + priv = this->private; + GF_ASSERT (priv); + + glusterd_store_missed_snaps_list_path_set (missed_snaps_list, + sizeof(missed_snaps_list)); + + ret = gf_store_handle_create_on_absence + (&priv->missed_snaps_list_shandle, + missed_snaps_list); + return ret; +} + +int32_t +glusterd_store_create_snap_shandle_on_absence (glusterd_snap_t *snap) +{ + char snapfpath[PATH_MAX] = {0}; + int32_t ret = 0; + + GF_ASSERT (snap); + + glusterd_store_snapfpath_set (snap, snapfpath, sizeof (snapfpath)); + ret = gf_store_handle_create_on_absence (&snap->shandle, snapfpath); + return ret; +} + int32_t glusterd_store_brickinfos (glusterd_volinfo_t *volinfo, int vol_fd) { @@ -909,8 +944,7 @@ glusterd_store_rbstate_write (int fd, glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo); snprintf (buf, sizeof (buf), "%d", volinfo->rep_brick.rb_status); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_STATUS, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_STATUS, buf); if (ret) goto out; @@ -919,16 +953,16 @@ glusterd_store_rbstate_write (int fd, glusterd_volinfo_t *volinfo) snprintf (buf, sizeof (buf), "%s:%s", volinfo->rep_brick.src_brick->hostname, volinfo->rep_brick.src_brick->path); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_SRC_BRICK, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_SRC_BRICK, + buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%s:%s", volinfo->rep_brick.dst_brick->hostname, volinfo->rep_brick.dst_brick->path); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_BRICK, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_BRICK, + buf); if (ret) goto out; @@ -944,13 +978,12 @@ glusterd_store_rbstate_write (int fd, glusterd_volinfo_t *volinfo) } snprintf (buf, sizeof (buf), "%d", port); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_PORT, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_PORT, + buf); if (ret) goto out; uuid_unparse (volinfo->rep_brick.rb_id, buf); - ret = glusterd_store_save_value (fd, GF_REPLACE_BRICK_TID_KEY, - buf); + ret = gf_store_save_value (fd, GF_REPLACE_BRICK_TID_KEY, buf); } ret = 0; @@ -966,7 +999,7 @@ glusterd_store_perform_rbstate_store (glusterd_volinfo_t *volinfo) int32_t ret = -1; GF_ASSERT (volinfo); - fd = glusterd_store_mkstemp (volinfo->rb_shandle); + fd = gf_store_mkstemp (volinfo->rb_shandle); if (fd <= 0) { ret = -1; goto out; @@ -976,13 +1009,13 @@ glusterd_store_perform_rbstate_store (glusterd_volinfo_t *volinfo) if (ret) goto out; - ret = glusterd_store_rename_tmppath (volinfo->rb_shandle); + ret = gf_store_rename_tmppath (volinfo->rb_shandle); if (ret) goto out; out: if (ret && (fd > 0)) - glusterd_store_unlink_tmppath (volinfo->rb_shandle); + gf_store_unlink_tmppath (volinfo->rb_shandle); if (fd > 0) close (fd); gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -1004,22 +1037,18 @@ glusterd_store_node_state_write (int fd, glusterd_volinfo_t *volinfo) } snprintf (buf, sizeof (buf), "%d", volinfo->rebal.defrag_cmd); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DEFRAG, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DEFRAG, buf); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", volinfo->rebal.op); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_DEFRAG_OP, - buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_DEFRAG_OP, buf); if (ret) goto out; if (volinfo->rebal.defrag_cmd) { uuid_unparse (volinfo->rebal.rebalance_id, buf); - ret = glusterd_store_save_value (fd, - GF_REBALANCE_TID_KEY, - buf); + ret = gf_store_save_value (fd, GF_REBALANCE_TID_KEY, buf); } out: gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -1033,7 +1062,7 @@ glusterd_store_perform_node_state_store (glusterd_volinfo_t *volinfo) int32_t ret = -1; GF_ASSERT (volinfo); - fd = glusterd_store_mkstemp (volinfo->node_state_shandle); + fd = gf_store_mkstemp (volinfo->node_state_shandle); if (fd <= 0) { ret = -1; goto out; @@ -1043,13 +1072,13 @@ glusterd_store_perform_node_state_store (glusterd_volinfo_t *volinfo) if (ret) goto out; - ret = glusterd_store_rename_tmppath (volinfo->node_state_shandle); + ret = gf_store_rename_tmppath (volinfo->node_state_shandle); if (ret) goto out; out: if (ret && (fd > 0)) - glusterd_store_unlink_tmppath (volinfo->node_state_shandle); + gf_store_unlink_tmppath (volinfo->node_state_shandle); if (fd > 0) close (fd); gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -1063,7 +1092,7 @@ glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo) int32_t ret = -1; GF_ASSERT (volinfo); - fd = glusterd_store_mkstemp (volinfo->shandle); + fd = gf_store_mkstemp (volinfo->shandle); if (fd <= 0) { ret = -1; goto out; @@ -1079,7 +1108,7 @@ glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo) out: if (ret && (fd > 0)) - glusterd_store_unlink_tmppath (volinfo->shandle); + gf_store_unlink_tmppath (volinfo->shandle); if (fd > 0) close (fd); gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); @@ -1112,7 +1141,7 @@ glusterd_store_bricks_cleanup_tmp (glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo); list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { - glusterd_store_unlink_tmppath (brickinfo->shandle); + gf_store_unlink_tmppath (brickinfo->shandle); } } @@ -1123,11 +1152,19 @@ glusterd_store_volume_cleanup_tmp (glusterd_volinfo_t *volinfo) glusterd_store_bricks_cleanup_tmp (volinfo); - glusterd_store_unlink_tmppath (volinfo->shandle); + gf_store_unlink_tmppath (volinfo->shandle); + + gf_store_unlink_tmppath (volinfo->rb_shandle); + + gf_store_unlink_tmppath (volinfo->node_state_shandle); +} - glusterd_store_unlink_tmppath (volinfo->rb_shandle); +void +glusterd_store_snap_cleanup_tmp (glusterd_snap_t *snap) +{ + GF_ASSERT (snap); - glusterd_store_unlink_tmppath (volinfo->node_state_shandle); + gf_store_unlink_tmppath (snap->shandle); } int32_t @@ -1139,7 +1176,7 @@ glusterd_store_brickinfos_atomic_update (glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo); list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { - ret = glusterd_store_rename_tmppath (brickinfo->shandle); + ret = gf_store_rename_tmppath (brickinfo->shandle); if (ret) goto out; } @@ -1153,7 +1190,7 @@ glusterd_store_volinfo_atomic_update (glusterd_volinfo_t *volinfo) int ret = -1; GF_ASSERT (volinfo); - ret = glusterd_store_rename_tmppath (volinfo->shandle); + ret = gf_store_rename_tmppath (volinfo->shandle); if (ret) goto out; @@ -1181,6 +1218,60 @@ out: } int32_t +glusterd_store_snap_atomic_update (glusterd_snap_t *snap) +{ + int ret = -1; + GF_ASSERT (snap); + + ret = gf_store_rename_tmppath (snap->shandle); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, "Couldn't rename " + "temporary file(s): Reason %s", strerror (errno)); + + return ret; +} + +int32_t +glusterd_store_snap (glusterd_snap_t *snap) +{ + int32_t ret = -1; + + GF_ASSERT (snap); + + ret = glusterd_store_create_snap_dir (snap); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snap dir"); + goto out; + } + + ret = glusterd_store_create_snap_shandle_on_absence (snap); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snap info " + "file"); + goto out; + } + + ret = glusterd_store_snapinfo_write (snap); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to write snap info"); + goto out; + } + + ret = glusterd_store_snap_atomic_update (snap); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR,"Failed to do automic update"); + goto out; + } + +out: + if (ret) + glusterd_store_snap_cleanup_tmp (snap); + + gf_log (THIS->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int32_t glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t ac) { int32_t ret = -1; @@ -1237,7 +1328,6 @@ out: return ret; } - int32_t glusterd_store_delete_volume (glusterd_volinfo_t *volinfo) { @@ -1264,8 +1354,8 @@ glusterd_store_delete_volume (glusterd_volinfo_t *volinfo) GLUSTERD_GET_VOLUME_DIR (pathname, volinfo, priv); snprintf (delete_path, sizeof (delete_path), - "%s/"GLUSTERD_TRASH"/%s.deleted", priv->workdir, - uuid_utoa (volinfo->volume_id)); + "%s/"GLUSTERD_TRASH"/%s.deleted", priv->workdir, + uuid_utoa (volinfo->volume_id)); snprintf (trashdir, sizeof (trashdir), "%s/"GLUSTERD_TRASH, priv->workdir); @@ -1348,7 +1438,7 @@ stat_failed: out: if (volinfo->shandle) { - glusterd_store_handle_destroy (volinfo->shandle); + gf_store_handle_destroy (volinfo->shandle); volinfo->shandle = NULL; } ret = (rename_fail == _gf_true) ? -1: 0; @@ -1357,269 +1447,127 @@ out: return ret; } - -int -glusterd_store_read_and_tokenize (FILE *file, char *str, - char **iter_key, char **iter_val, - glusterd_store_op_errno_t *store_errno) -{ - int32_t ret = -1; - char *savetok = NULL; - - GF_ASSERT (file); - GF_ASSERT (str); - GF_ASSERT (iter_key); - GF_ASSERT (iter_val); - GF_ASSERT (store_errno); - - ret = fscanf (file, "%s", str); - if (ret <= 0 || feof (file)) { - ret = -1; - *store_errno = GD_STORE_EOF; - goto out; - } - - *iter_key = strtok_r (str, "=", &savetok); - if (*iter_key == NULL) { - ret = -1; - *store_errno = GD_STORE_KEY_NULL; - goto out; - } - - *iter_val = strtok_r (NULL, "=", &savetok); - if (*iter_key == NULL) { - ret = -1; - *store_errno = GD_STORE_VALUE_NULL; - goto out; - } - - *store_errno = GD_STORE_SUCCESS; - ret = 0; -out: - return ret; -} - +/*TODO: cleanup the duplicate code and implement a generic function for + * deleting snap/volume depending on the parameter flag */ int32_t -glusterd_store_retrieve_value (glusterd_store_handle_t *handle, - char *key, char **value) +glusterd_store_delete_snap (glusterd_snap_t *snap) { - int32_t ret = -1; - char *scan_str = NULL; - char *iter_key = NULL; - char *iter_val = NULL; - char *free_str = NULL; - struct stat st = {0,}; - glusterd_store_op_errno_t store_errno = GD_STORE_SUCCESS; + char pathname[PATH_MAX] = {0,}; + int32_t ret = 0; + glusterd_conf_t *priv = NULL; + DIR *dir = NULL; + struct dirent *entry = NULL; + char path[PATH_MAX] = {0,}; + char delete_path[PATH_MAX] = {0,}; + char trashdir[PATH_MAX] = {0,}; + struct stat st = {0, }; + xlator_t *this = NULL; + gf_boolean_t rename_fail = _gf_false; - GF_ASSERT (handle); + this = THIS; + priv = this->private; + GF_ASSERT (priv); - handle->fd = open (handle->path, O_RDWR); + GF_ASSERT (snap); + GLUSTERD_GET_SNAP_DIR (pathname, snap, priv); - if (handle->fd == -1) { - gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %s", - handle->path, strerror (errno)); - goto out; - } - if (!handle->read) - handle->read = fdopen (handle->fd, "r"); + snprintf (delete_path, sizeof (delete_path), + "%s/"GLUSTERD_TRASH"/snap-%s.deleted", priv->workdir, + uuid_utoa (snap->snap_id)); - if (!handle->read) { - gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %s", - handle->path, strerror (errno)); - goto out; - } + snprintf (trashdir, sizeof (trashdir), "%s/"GLUSTERD_TRASH, + priv->workdir); - ret = fstat (handle->fd, &st); - if (ret < 0) { - gf_log ("glusterd", GF_LOG_WARNING, - "stat on file failed"); + ret = mkdir (trashdir, 0777); + if (ret && errno != EEXIST) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create trash " + "directory, reason : %s", strerror (errno)); ret = -1; - store_errno = GD_STORE_STAT_FAILED; goto out; } - scan_str = GF_CALLOC (1, st.st_size, - gf_gld_mt_char); - if (scan_str == NULL) { - ret = -1; - store_errno = GD_STORE_ENOMEM; + ret = rename (pathname, delete_path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to rename snap " + "directory %s to %s", snap->snapname, delete_path); + rename_fail = _gf_true; goto out; } - free_str = scan_str; + dir = opendir (delete_path); + if (!dir) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to open directory %s." + " Reason : %s", delete_path, strerror (errno)); + ret = 0; + goto out; + } - do { - ret = glusterd_store_read_and_tokenize (handle->read, scan_str, - &iter_key, &iter_val, - &store_errno); - if (ret < 0) { - goto out; + glusterd_for_each_entry (entry, dir); + while (entry) { + snprintf (path, PATH_MAX, "%s/%s", delete_path, entry->d_name); + ret = stat (path, &st); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to stat " + "entry %s : %s", path, strerror (errno)); + goto stat_failed; } - gf_log ("", GF_LOG_DEBUG, "key %s read", iter_key); + if (S_ISDIR (st.st_mode)) + ret = rmdir (path); + else + ret = unlink (path); - if (!strcmp (key, iter_key)) { - gf_log ("", GF_LOG_DEBUG, "key %s found", key); - ret = 0; - if (iter_val) - *value = gf_strdup (iter_val); - goto out; + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, " Failed to remove " + "%s. Reason : %s", path, strerror (errno)); } - } while (1); -out: - if (handle->fd > 0) { - close (handle->fd); - handle->read = NULL; - } - - GF_FREE (free_str); - return ret; -} - -int32_t -glusterd_store_save_value (int fd, char *key, char *value) -{ - int32_t ret = -1; - FILE *fp = NULL; - xlator_t *this = NULL; - - this = THIS; - GF_ASSERT (this); - GF_ASSERT (fd > 0); - GF_ASSERT (key); - GF_ASSERT (value); - - fp = fdopen (fd, "a+"); - if (fp == NULL) { - gf_log (this->name, GF_LOG_WARNING, "fdopen failed."); - ret = -1; - goto out; + gf_log (this->name, GF_LOG_DEBUG, "%s %s", + ret ? "Failed to remove":"Removed", + entry->d_name); +stat_failed: + memset (path, 0, sizeof(path)); + glusterd_for_each_entry (entry, dir); } - ret = fprintf (fp, "%s=%s\n", key, value); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "Unable to store key: %s," - "value: %s, error: %s", key, value, - strerror (errno)); - ret = -1; - goto out; + ret = closedir (dir); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to close dir %s. " + "Reason : %s",delete_path, strerror (errno)); } - ret = fflush (fp); - if (feof (fp)) { - gf_log (this->name, GF_LOG_WARNING, - "fflush failed, error: %s", - strerror (errno)); - ret = -1; - goto out; + ret = rmdir (delete_path); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s,err: %s", + delete_path, strerror (errno)); } - - ret = 0; -out: - - gf_log (this->name, GF_LOG_DEBUG, "returning: %d", ret); - return ret; -} - -int32_t -glusterd_store_handle_new (char *path, glusterd_store_handle_t **handle) -{ - int32_t ret = -1; - glusterd_store_handle_t *shandle = NULL; - int fd = -1; - char *spath = NULL; - xlator_t *this = NULL; - - this = THIS; - GF_ASSERT (this); - - shandle = GF_CALLOC (1, sizeof (*shandle), gf_gld_mt_store_handle_t); - if (!shandle) - goto out; - - spath = gf_strdup (path); - - if (!spath) - goto out; - - fd = open (path, O_RDWR | O_CREAT | O_APPEND, 0600); - if (fd <= 0) { - gf_log (this->name, GF_LOG_ERROR, "Failed to open file: %s, " - "error: %s", path, strerror (errno)); - goto out; + ret = rmdir (trashdir); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s, Reason:" + " %s", trashdir, strerror (errno)); } - ret = glusterd_store_sync_direntry (spath); - if (ret) - goto out; - - shandle->path = spath; - *handle = shandle; - - ret = 0; out: - if (fd > 0) - close (fd); - - if (ret == -1) { - GF_FREE (spath); - GF_FREE (shandle); + if (snap->shandle) { + gf_store_handle_destroy (snap->shandle); + snap->shandle = NULL; } + ret = (rename_fail == _gf_true) ? -1: 0; gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret); return ret; } int -glusterd_store_handle_retrieve (char *path, glusterd_store_handle_t **handle) -{ - int32_t ret = -1; - struct stat statbuf = {0}; - - ret = stat (path, &statbuf); - if (ret) { - gf_log ("glusterd", GF_LOG_ERROR, "Unable to retrieve store " - "handle for %s, error: %s", path, strerror (errno)); - goto out; - } - ret = glusterd_store_handle_new (path, handle); -out: - gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); - return ret; -} - -int32_t -glusterd_store_handle_destroy (glusterd_store_handle_t *handle) -{ - int32_t ret = -1; - - if (!handle) { - ret = 0; - goto out; - } - - GF_FREE (handle->path); - - GF_FREE (handle); - - ret = 0; - -out: - gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); - - return ret; -} - -int glusterd_store_global_info (xlator_t *this) { int ret = -1; glusterd_conf_t *conf = NULL; char op_version_str[15] = {0,}; char path[PATH_MAX] = {0,}; - glusterd_store_handle_t *handle = NULL; + gf_store_handle_t *handle = NULL; char *uuid_str = NULL; + char buf[256] = {0, }; conf = this->private; @@ -1630,7 +1578,7 @@ glusterd_store_global_info (xlator_t *this) if (!conf->handle) { snprintf (path, PATH_MAX, "%s/%s", conf->workdir, GLUSTERD_INFO_FILE); - ret = glusterd_store_handle_new (path, &handle); + ret = gf_store_handle_new (path, &handle); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Unable to get store handle"); @@ -1649,14 +1597,14 @@ glusterd_store_global_info (xlator_t *this) goto out; } - handle->fd = glusterd_store_mkstemp (handle); + handle->fd = gf_store_mkstemp (handle); if (handle->fd <= 0) { ret = -1; goto out; } - ret = glusterd_store_save_value (handle->fd, GLUSTERD_STORE_UUID_KEY, - uuid_str); + ret = gf_store_save_value (handle->fd, GLUSTERD_STORE_UUID_KEY, + uuid_str); if (ret) { gf_log (this->name, GF_LOG_CRITICAL, "Storing uuid failed ret = %d", ret); @@ -1664,18 +1612,36 @@ glusterd_store_global_info (xlator_t *this) } snprintf (op_version_str, 15, "%d", conf->op_version); - ret = glusterd_store_save_value (handle->fd, GD_OP_VERSION_KEY, - op_version_str); + ret = gf_store_save_value (handle->fd, GD_OP_VERSION_KEY, + op_version_str); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Storing op-version failed ret = %d", ret); goto out; } - ret = glusterd_store_rename_tmppath (handle); + snprintf (buf, sizeof (buf), "%"PRIu64, conf->snap_max_hard_limit); + ret = gf_store_save_value (handle->fd, + GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT, buf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Storing snap-max-hard-limit failed ret = %d", ret); + goto out; + } + + snprintf (buf, sizeof (buf), "%"PRIu64, conf->snap_max_soft_limit); + ret = gf_store_save_value (handle->fd, + GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT, buf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Storing snap-max-soft-limit failed ret = %d", ret); + goto out; + } + + ret = gf_store_rename_tmppath (handle); out: if (ret && (handle->fd > 0)) - glusterd_store_unlink_tmppath (handle); + gf_store_unlink_tmppath (handle); if (handle->fd > 0) { close (handle->fd); @@ -1701,17 +1667,17 @@ glusterd_retrieve_op_version (xlator_t *this, int *op_version) int tmp_version = 0; char *tmp = NULL; char path[PATH_MAX] = {0,}; - glusterd_store_handle_t *handle = NULL; + gf_store_handle_t *handle = NULL; priv = this->private; if (!priv->handle) { snprintf (path, PATH_MAX, "%s/%s", priv->workdir, GLUSTERD_INFO_FILE); - ret = glusterd_store_handle_retrieve (path, &handle); + ret = gf_store_handle_retrieve (path, &handle); if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to get store " + gf_log ("", GF_LOG_DEBUG, "Unable to get store " "handle!"); goto out; } @@ -1719,9 +1685,8 @@ glusterd_retrieve_op_version (xlator_t *this, int *op_version) priv->handle = handle; } - ret = glusterd_store_retrieve_value (priv->handle, - GD_OP_VERSION_KEY, - &op_version_str); + ret = gf_store_retrieve_value (priv->handle, GD_OP_VERSION_KEY, + &op_version_str); if (ret) { gf_log (this->name, GF_LOG_DEBUG, "No previous op_version present"); @@ -1744,21 +1709,100 @@ out: return ret; } +int +glusterd_retrieve_sys_snap_max_limit (xlator_t *this, uint64_t *limit, + char *key) +{ + char *limit_str = NULL; + glusterd_conf_t *priv = NULL; + int ret = -1; + uint64_t tmp_limit = 0; + char *tmp = NULL; + char path[PATH_MAX] = {0,}; + gf_store_handle_t *handle = NULL; + + GF_ASSERT (this); + priv = this->private; + + GF_ASSERT (priv); + GF_ASSERT (limit); + GF_ASSERT (key); + + if (!priv->handle) { + snprintf (path, PATH_MAX, "%s/%s", priv->workdir, + GLUSTERD_INFO_FILE); + ret = gf_store_handle_retrieve (path, &handle); + + if (ret) { + gf_log ("", GF_LOG_DEBUG, "Unable to get store " + "handle!"); + goto out; + } + + priv->handle = handle; + } + + ret = gf_store_retrieve_value (priv->handle, + key, + &limit_str); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "No previous %s present", key); + goto out; + } + + tmp_limit = strtoul (limit_str, &tmp, 10); + if ((tmp_limit <= 0) || (tmp && strlen (tmp) > 1)) { + gf_log (this->name, GF_LOG_WARNING, "invalid version number"); + goto out; + } + + *limit = tmp_limit; + + ret = 0; +out: + if (limit_str) + GF_FREE (limit_str); + + return ret; +} static int glusterd_restore_op_version (xlator_t *this) { - glusterd_conf_t *conf = NULL; - int ret = 0; - int op_version = 0; + glusterd_conf_t *conf = NULL; + int ret = 0; + int op_version = 0; conf = this->private; + ret = glusterd_retrieve_sys_snap_max_limit (this, + &conf->snap_max_hard_limit, + GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to retrieve system snap-max-hard-limit, " + "setting it to default value(%d)", + GLUSTERD_SNAPS_MAX_HARD_LIMIT); + conf->snap_max_hard_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT; + } + + ret = glusterd_retrieve_sys_snap_max_limit (this, + &conf->snap_max_soft_limit, + GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to retrieve system snap-max-soft-limit, " + "setting it to default value(%d)", + GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT); + conf->snap_max_soft_limit = GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT; + } + ret = glusterd_retrieve_op_version (this, &op_version); if (!ret) { if ((op_version < GD_OP_VERSION_MIN) || (op_version > GD_OP_VERSION_MAX)) { gf_log (this->name, GF_LOG_ERROR, - "wrong op-version (%d) retreived", op_version); + "wrong op-version (%d) retrieved", op_version); ret = -1; goto out; } @@ -1801,7 +1845,7 @@ glusterd_retrieve_uuid () { char *uuid_str = NULL; int32_t ret = -1; - glusterd_store_handle_t *handle = NULL; + gf_store_handle_t *handle = NULL; glusterd_conf_t *priv = NULL; char path[PATH_MAX] = {0,}; @@ -1810,10 +1854,10 @@ glusterd_retrieve_uuid () if (!priv->handle) { snprintf (path, PATH_MAX, "%s/%s", priv->workdir, GLUSTERD_INFO_FILE); - ret = glusterd_store_handle_retrieve (path, &handle); + ret = gf_store_handle_retrieve (path, &handle); if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to get store " + gf_log ("", GF_LOG_DEBUG, "Unable to get store" "handle!"); goto out; } @@ -1821,12 +1865,11 @@ glusterd_retrieve_uuid () priv->handle = handle; } - ret = glusterd_store_retrieve_value (priv->handle, - GLUSTERD_STORE_UUID_KEY, - &uuid_str); + ret = gf_store_retrieve_value (priv->handle, GLUSTERD_STORE_UUID_KEY, + &uuid_str); if (ret) { - gf_log ("", GF_LOG_INFO, "No previous uuid is present"); + gf_log ("", GF_LOG_DEBUG, "No previous uuid is present"); goto out; } @@ -1838,247 +1881,13 @@ out: return ret; } -int32_t -glusterd_store_iter_new (glusterd_store_handle_t *shandle, - glusterd_store_iter_t **iter) -{ - int32_t ret = -1; - glusterd_store_iter_t *tmp_iter = NULL; - int fd = -1; - - GF_ASSERT (shandle); - GF_ASSERT (iter); - - tmp_iter = GF_CALLOC (1, sizeof (*tmp_iter), - gf_gld_mt_store_iter_t); - - if (!tmp_iter) { - gf_log ("", GF_LOG_ERROR, "Out of Memory"); - goto out; - } - - fd = open (shandle->path, O_RDWR); - - if (fd < 0) { - gf_log ("", GF_LOG_ERROR, "Unable to open %s, errno: %d", - shandle->path, errno); - goto out; - } - - tmp_iter->fd = fd; - - tmp_iter->file = fdopen (tmp_iter->fd, "r"); - - if (!tmp_iter->file) { - gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %d", - shandle->path, errno); - goto out; - } - - strncpy (tmp_iter->filepath, shandle->path, sizeof (tmp_iter->filepath)); - tmp_iter->filepath[sizeof (tmp_iter->filepath) - 1] = 0; - *iter = tmp_iter; - ret = 0; - -out: - gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); - return ret; -} - -int32_t -glusterd_store_validate_key_value (char *storepath, char *key, char*val, - glusterd_store_op_errno_t *op_errno) -{ - int ret = 0; - - GF_ASSERT (op_errno); - GF_ASSERT (storepath); - - if ((key == NULL) && (val == NULL)) { - ret = -1; - gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be " - "corrupted, Invalid key and value (null) in %s", - storepath); - *op_errno = GD_STORE_KEY_VALUE_NULL; - } else if (key == NULL) { - ret = -1; - gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be " - "corrupted, Invalid key (null) in %s", storepath); - *op_errno = GD_STORE_KEY_NULL; - } else if (val == NULL) { - ret = -1; - gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be " - "corrupted, Invalid value (null) for key %s in %s", - key, storepath); - *op_errno = GD_STORE_VALUE_NULL; - } else { - ret = 0; - *op_errno = GD_STORE_SUCCESS; - } - - return ret; -} - -int32_t -glusterd_store_iter_get_next (glusterd_store_iter_t *iter, - char **key, char **value, - glusterd_store_op_errno_t *op_errno) -{ - int32_t ret = -1; - char *scan_str = NULL; - char *free_str = NULL; - char *iter_key = NULL; - char *iter_val = NULL; - struct stat st = {0,}; - glusterd_store_op_errno_t store_errno = GD_STORE_SUCCESS; - - GF_ASSERT (iter); - GF_ASSERT (iter->file); - GF_ASSERT (key); - GF_ASSERT (value); - - ret = fstat (iter->fd, &st); - if (ret < 0) { - gf_log ("glusterd", GF_LOG_WARNING, - "stat on file failed"); - ret = -1; - store_errno = GD_STORE_STAT_FAILED; - goto out; - } - - scan_str = GF_CALLOC (1, st.st_size, - gf_gld_mt_char); - if (scan_str == NULL) { - ret = -1; - store_errno = GD_STORE_ENOMEM; - goto out; - } - - *key = NULL; - *value = NULL; - - free_str = scan_str; - - ret = glusterd_store_read_and_tokenize (iter->file, scan_str, - &iter_key, &iter_val, - &store_errno); - if (ret < 0) { - goto out; - } - - - ret = glusterd_store_validate_key_value (iter->filepath, iter_key, - iter_val, &store_errno); - if (ret) - goto out; - - *value = gf_strdup (iter_val); - - *key = gf_strdup (iter_key); - if (!iter_key || !iter_val) { - ret = -1; - store_errno = GD_STORE_ENOMEM; - goto out; - } - - ret = 0; - -out: - if (ret) { - if (*key) { - GF_FREE (*key); - *key = NULL; - } - if (*value) { - GF_FREE (*value); - *value = NULL; - } - } - GF_FREE (free_str); - if (op_errno) - *op_errno = store_errno; - - gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); - return ret; -} - -int32_t -glusterd_store_iter_get_matching (glusterd_store_iter_t *iter, - char *key, char **value) -{ - int32_t ret = -1; - char *tmp_key = NULL; - char *tmp_value = NULL; - - ret = glusterd_store_iter_get_next (iter, &tmp_key, &tmp_value, - NULL); - while (!ret) { - if (!strncmp (key, tmp_key, strlen (key))){ - *value = tmp_value; - GF_FREE (tmp_key); - goto out; - } - GF_FREE (tmp_key); - GF_FREE (tmp_value); - ret = glusterd_store_iter_get_next (iter, &tmp_key, - &tmp_value, NULL); - } -out: - return ret; -} - -int32_t -glusterd_store_iter_destroy (glusterd_store_iter_t *iter) -{ - int32_t ret = -1; - - if (!iter) - return 0; - - if (iter->file) - ret = fclose (iter->file); - else - ret = 0; - - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to close fd: %d, ret: %d, " - "errno: %d" ,iter->fd, ret, errno); - } - - GF_FREE (iter); - - return ret; -} - -char* -glusterd_store_strerror (glusterd_store_op_errno_t op_errno) -{ - switch (op_errno) { - case GD_STORE_SUCCESS: - return "Success"; - case GD_STORE_KEY_NULL: - return "Invalid Key"; - case GD_STORE_VALUE_NULL: - return "Invalid Value"; - case GD_STORE_KEY_VALUE_NULL: - return "Invalid Key and Value"; - case GD_STORE_EOF: - return "No data"; - case GD_STORE_ENOMEM: - return "No memory"; - default: - return "Invalid errno"; - } - return "Invalid errno"; -} int32_t glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) { - int32_t ret = 0; glusterd_brickinfo_t *brickinfo = NULL; - glusterd_store_iter_t *iter = NULL; + gf_store_iter_t *iter = NULL; char *key = NULL; char *value = NULL; char brickdir[PATH_MAX] = {0,}; @@ -2086,20 +1895,19 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) glusterd_conf_t *priv = NULL; int32_t brick_count = 0; char tmpkey[4096] = {0,}; - glusterd_store_iter_t *tmpiter = NULL; + gf_store_iter_t *tmpiter = NULL; char *tmpvalue = NULL; - struct pmap_registry *pmap = NULL; - glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS; + struct pmap_registry *pmap = NULL; + gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; GF_ASSERT (volinfo); GF_ASSERT (volinfo->volname); priv = THIS->private; - GLUSTERD_GET_BRICK_DIR (brickdir, volinfo, priv) - - ret = glusterd_store_iter_new (volinfo->shandle, &tmpiter); + GLUSTERD_GET_BRICK_DIR (brickdir, volinfo, priv); + ret = gf_store_iter_new (volinfo->shandle, &tmpiter); if (ret) goto out; @@ -2110,30 +1918,28 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) goto out; snprintf (tmpkey, sizeof (tmpkey), "%s-%d", GLUSTERD_STORE_KEY_VOL_BRICK,brick_count); - ret = glusterd_store_iter_get_matching (tmpiter, tmpkey, - &tmpvalue); + ret = gf_store_iter_get_matching (tmpiter, tmpkey, &tmpvalue); snprintf (path, sizeof (path), "%s/%s", brickdir, tmpvalue); GF_FREE (tmpvalue); tmpvalue = NULL; - ret = glusterd_store_handle_retrieve (path, &brickinfo->shandle); + ret = gf_store_handle_retrieve (path, &brickinfo->shandle); if (ret) goto out; - ret = glusterd_store_iter_new (brickinfo->shandle, &iter); + ret = gf_store_iter_new (brickinfo->shandle, &iter); if (ret) goto out; - ret = glusterd_store_iter_get_next (iter, &key, &value, - &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); if (ret) { gf_log ("glusterd", GF_LOG_ERROR, "Unable to iterate " "the store for brick: %s, reason: %s", path, - glusterd_store_strerror (op_errno)); + gf_store_strerror (op_errno)); goto out; } while (!ret) { @@ -2148,7 +1954,7 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) strlen (GLUSTERD_STORE_KEY_BRICK_PORT))) { gf_string2int (value, &brickinfo->port); - if (brickinfo->port < GF_IANA_PRIV_PORTS_START){ + if (brickinfo->port < priv->base_port) { /* This is required to adhere to the IANA standards */ brickinfo->port = 0; @@ -2164,8 +1970,7 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) strlen (GLUSTERD_STORE_KEY_BRICK_RDMA_PORT))) { gf_string2int (value, &brickinfo->rdma_port); - if (brickinfo->rdma_port < - GF_IANA_PRIV_PORTS_START){ + if (brickinfo->rdma_port < priv->base_port) { /* This is required to adhere to the IANA standards */ brickinfo->rdma_port = 0; @@ -2182,6 +1987,18 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED, strlen (GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) { gf_string2int (value, &brickinfo->decommissioned); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH, + strlen (GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH))) { + strncpy (brickinfo->device_path, value, + sizeof (brickinfo->device_path)); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS, + strlen (GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS))) { + gf_string2int (value, &brickinfo->snap_status); + } else if (!strncmp (key, + GLUSTERD_STORE_KEY_BRICK_VGNAME, + strlen (GLUSTERD_STORE_KEY_BRICK_VGNAME))) { + strncpy (brickinfo->vg, value, + sizeof (brickinfo->vg)); } else { gf_log ("", GF_LOG_ERROR, "Unknown key: %s", key); @@ -2192,13 +2009,13 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) key = NULL; value = NULL; - ret = glusterd_store_iter_get_next (iter, &key, &value, - &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, + &op_errno); } if (op_errno != GD_STORE_EOF) goto out; - ret = glusterd_store_iter_destroy (iter); + ret = gf_store_iter_destroy (iter); if (ret) goto out; @@ -2207,7 +2024,7 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) brick_count++; } - ret = glusterd_store_iter_destroy (tmpiter); + ret = gf_store_iter_destroy (tmpiter); if (ret) goto out; out: @@ -2218,42 +2035,39 @@ out: int32_t -glusterd_store_retrieve_rbstate (char *volname) +glusterd_store_retrieve_rbstate (glusterd_volinfo_t *volinfo) { int32_t ret = -1; - glusterd_volinfo_t *volinfo = NULL; - glusterd_store_iter_t *iter = NULL; + gf_store_iter_t *iter = NULL; char *key = NULL; char *value = NULL; char volpath[PATH_MAX] = {0,}; glusterd_conf_t *priv = NULL; char path[PATH_MAX] = {0,}; - glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS; + gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; + xlator_t *this = NULL; - priv = THIS->private; - - ret = glusterd_volinfo_find (volname, &volinfo); - if (ret) { - gf_log (THIS->name, GF_LOG_ERROR, "Couldn't get" - "volinfo for %s.", volname); - goto out; - } + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (volinfo); GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv); snprintf (path, sizeof (path), "%s/%s", volpath, GLUSTERD_VOLUME_RBSTATE_FILE); - ret = glusterd_store_handle_retrieve (path, &volinfo->rb_shandle); + ret = gf_store_handle_retrieve (path, &volinfo->rb_shandle); if (ret) goto out; - ret = glusterd_store_iter_new (volinfo->rb_shandle, &iter); + ret = gf_store_iter_new (volinfo->rb_shandle, &iter); if (ret) goto out; - ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); if (ret) goto out; @@ -2302,61 +2116,56 @@ glusterd_store_retrieve_rbstate (char *volname) key = NULL; value = NULL; - ret = glusterd_store_iter_get_next (iter, &key, &value, - &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); } if (op_errno != GD_STORE_EOF) goto out; - ret = glusterd_store_iter_destroy (iter); + ret = gf_store_iter_destroy (iter); if (ret) goto out; out: - gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); + gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret); return ret; } int32_t -glusterd_store_retrieve_node_state (char *volname) +glusterd_store_retrieve_node_state (glusterd_volinfo_t *volinfo) { int32_t ret = -1; - glusterd_volinfo_t *volinfo = NULL; - glusterd_store_iter_t *iter = NULL; + gf_store_iter_t *iter = NULL; char *key = NULL; char *value = NULL; char volpath[PATH_MAX] = {0,}; glusterd_conf_t *priv = NULL; char path[PATH_MAX] = {0,}; - glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS; + gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; + xlator_t *this = NULL; - priv = THIS->private; - - ret = glusterd_volinfo_find (volname, &volinfo); - if (ret) { - gf_log (THIS->name, GF_LOG_ERROR, "Couldn't get" - "volinfo for %s.", volname); - goto out; - } + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (volinfo); GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv); snprintf (path, sizeof (path), "%s/%s", volpath, GLUSTERD_NODE_STATE_FILE); - ret = glusterd_store_handle_retrieve (path, - &volinfo->node_state_shandle); + ret = gf_store_handle_retrieve (path, &volinfo->node_state_shandle); if (ret) goto out; - ret = glusterd_store_iter_new (volinfo->node_state_shandle, &iter); + ret = gf_store_iter_new (volinfo->node_state_shandle, &iter); if (ret) goto out; - ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); if (ret) goto out; @@ -2381,66 +2190,70 @@ glusterd_store_retrieve_node_state (char *volname) key = NULL; value = NULL; - ret = glusterd_store_iter_get_next (iter, &key, &value, - &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); } if (op_errno != GD_STORE_EOF) goto out; - ret = glusterd_store_iter_destroy (iter); + ret = gf_store_iter_destroy (iter); if (ret) goto out; out: - gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); + gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret); return ret; } -int32_t -glusterd_store_retrieve_volume (char *volname) -{ - int32_t ret = -1; - glusterd_volinfo_t *volinfo = NULL; - glusterd_store_iter_t *iter = NULL; - char *key = NULL; - char *value = NULL; - char volpath[PATH_MAX] = {0,}; - glusterd_conf_t *priv = NULL; - char path[PATH_MAX] = {0,}; - int exists = 0; - glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS; - - ret = glusterd_volinfo_new (&volinfo); - if (ret) - goto out; +int +glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo) +{ + int ret = -1; + int exists = 0; + char *key = NULL; + char *value = NULL; + char volpath[PATH_MAX] = {0,}; + char path[PATH_MAX] = {0,}; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; + gf_store_iter_t *iter = NULL; + gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; - strncpy (volinfo->volname, volname, GLUSTERD_MAX_VOLUME_NAME); + this = THIS; + GF_ASSERT (this); + conf = THIS->private; + GF_ASSERT (volinfo); - priv = THIS->private; + GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, conf); - GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv); snprintf (path, sizeof (path), "%s/%s", volpath, GLUSTERD_VOLUME_INFO_FILE); - ret = glusterd_store_handle_retrieve (path, &volinfo->shandle); - - if (ret) + ret = gf_store_handle_retrieve (path, &volinfo->shandle); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "volinfo handle is NULL"); goto out; + } - ret = glusterd_store_iter_new (volinfo->shandle, &iter); - - if (ret) + ret = gf_store_iter_new (volinfo->shandle, &iter); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get new store " + "iter"); goto out; + } - ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno); - if (ret) + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get next store " + "iter"); goto out; + } while (!ret) { + gf_log ("", GF_LOG_DEBUG, "key = %s value = %s", key, value); if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_TYPE, strlen (GLUSTERD_STORE_KEY_VOL_TYPE))) { volinfo->type = atoi (value); @@ -2500,9 +2313,24 @@ glusterd_store_retrieve_volume (char *volname) gf_log ("", GF_LOG_DEBUG, "Parsed as "GEOREP" " " slave:key=%s,value:%s", key, value); - } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_BACKEND, - strlen (GLUSTERD_STORE_KEY_VOL_BACKEND))) { - volinfo->backend = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_OP_VERSION, + strlen (GLUSTERD_STORE_KEY_VOL_OP_VERSION))) { + volinfo->op_version = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION, + strlen (GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION))) { + volinfo->client_op_version = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CAPS, + strlen (GLUSTERD_STORE_KEY_VOL_CAPS))) { + volinfo->caps = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT, + strlen (GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT))) { + volinfo->snap_max_hard_limit = (uint64_t) atoll (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_IS_RESTORED, + strlen (GLUSTERD_STORE_KEY_VOL_IS_RESTORED))) { + volinfo->is_volume_restored = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_PARENT_VOLNAME, + strlen (GLUSTERD_STORE_KEY_PARENT_VOLNAME))) { + strncpy (volinfo->parent_volname, value, sizeof(volinfo->parent_volname) - 1); } else { if (is_key_glusterd_hooks_friendly (key)) { @@ -2542,8 +2370,7 @@ glusterd_store_retrieve_volume (char *volname) key = NULL; value = NULL; - ret = glusterd_store_iter_get_next (iter, &key, &value, - &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); } /* backward compatibility */ @@ -2582,16 +2409,59 @@ glusterd_store_retrieve_volume (char *volname) volinfo->subvol_count = (volinfo->brick_count / volinfo->dist_leaf_count); + /* Only calculate volume op-versions if they are not found */ + if (!volinfo->op_version && !volinfo->client_op_version) + gd_update_volume_op_versions (volinfo); } if (op_errno != GD_STORE_EOF) goto out; - ret = glusterd_store_iter_destroy (iter); + ret = gf_store_iter_destroy (iter); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to destroy store " + "iter"); + goto out; + } + ret = 0; +out: + return ret; +} + +glusterd_volinfo_t* +glusterd_store_retrieve_volume (char *volname, glusterd_snap_t *snap) +{ + int32_t ret = -1; + glusterd_volinfo_t *volinfo = NULL; + glusterd_volinfo_t *origin_volinfo = NULL; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (volname); + + ret = glusterd_volinfo_new (&volinfo); if (ret) goto out; + priv = THIS->private; + + strncpy (volinfo->volname, volname, GLUSTERD_MAX_VOLUME_NAME); + volinfo->snapshot = snap; + if (snap) + volinfo->is_snap_volume = _gf_true; + + ret = glusterd_store_update_volinfo (volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to update volinfo " + "for %s volume", volname); + goto out; + } + ret = glusterd_store_retrieve_bricks (volinfo); if (ret) goto out; @@ -2600,14 +2470,28 @@ glusterd_store_retrieve_volume (char *volname) if (ret) goto out; - gd_update_volume_op_versions (volinfo); - - list_add_tail (&volinfo->vol_list, &priv->volumes); - + if (!snap) { + list_add_tail (&volinfo->vol_list, &priv->volumes); + } else { + ret = glusterd_volinfo_find (volinfo->parent_volname, + &origin_volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Parent volinfo " + "not found for %s volume", volname); + goto out; + } + glusterd_list_add_snapvol (origin_volinfo, volinfo); + } out: - gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); + if (ret) { + if (volinfo) + glusterd_volinfo_delete (volinfo); + volinfo = NULL; + } - return ret; + gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret); + + return volinfo; } inline void @@ -2619,16 +2503,16 @@ glusterd_store_set_options_path (glusterd_conf_t *conf, char *path, size_t len) int _store_global_opts (dict_t *this, char *key, data_t *value, void *data) { - glusterd_store_handle_t *shandle = data; + gf_store_handle_t *shandle = data; - glusterd_store_save_value (shandle->fd, key, (char*)value->data); + gf_store_save_value (shandle->fd, key, (char*)value->data); return 0; } int32_t glusterd_store_options (xlator_t *this, dict_t *opts) { - glusterd_store_handle_t *shandle = NULL; + gf_store_handle_t *shandle = NULL; glusterd_conf_t *conf = NULL; char path[PATH_MAX] = {0}; int fd = -1; @@ -2637,11 +2521,11 @@ glusterd_store_options (xlator_t *this, dict_t *opts) conf = this->private; glusterd_store_set_options_path (conf, path, sizeof (path)); - ret = glusterd_store_handle_new (path, &shandle); + ret = gf_store_handle_new (path, &shandle); if (ret) goto out; - fd = glusterd_store_mkstemp (shandle); + fd = gf_store_mkstemp (shandle); if (fd <= 0) { ret = -1; goto out; @@ -2650,11 +2534,11 @@ glusterd_store_options (xlator_t *this, dict_t *opts) shandle->fd = fd; dict_foreach (opts, _store_global_opts, shandle); shandle->fd = 0; - ret = glusterd_store_rename_tmppath (shandle); + ret = gf_store_rename_tmppath (shandle); if (ret) goto out; out: - glusterd_store_handle_destroy (shandle); + gf_store_handle_destroy (shandle); if (fd >=0 ) close (fd); return ret; @@ -2665,25 +2549,25 @@ glusterd_store_retrieve_options (xlator_t *this) { char path[PATH_MAX] = {0}; glusterd_conf_t *conf = NULL; - glusterd_store_handle_t *shandle = NULL; - glusterd_store_iter_t *iter = NULL; + gf_store_handle_t *shandle = NULL; + gf_store_iter_t *iter = NULL; char *key = NULL; char *value = NULL; - glusterd_store_op_errno_t op_errno = 0; + gf_store_op_errno_t op_errno = 0; int ret = -1; conf = this->private; glusterd_store_set_options_path (conf, path, sizeof (path)); - ret = glusterd_store_handle_retrieve (path, &shandle); + ret = gf_store_handle_retrieve (path, &shandle); if (ret) goto out; - ret = glusterd_store_iter_new (shandle, &iter); + ret = gf_store_iter_new (shandle, &iter); if (ret) goto out; - ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); while (!ret) { ret = dict_set_dynstr (conf->opts, key, value); if (ret) { @@ -2695,22 +2579,21 @@ glusterd_store_retrieve_options (xlator_t *this) key = NULL; value = NULL; - ret = glusterd_store_iter_get_next (iter, &key, &value, - &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); } if (op_errno != GD_STORE_EOF) goto out; ret = 0; out: - glusterd_store_iter_destroy (iter); - glusterd_store_handle_destroy (shandle); + gf_store_iter_destroy (iter); + gf_store_handle_destroy (shandle); return ret; } int32_t -glusterd_store_retrieve_volumes (xlator_t *this) +glusterd_store_retrieve_volumes (xlator_t *this, glusterd_snap_t *snap) { - int32_t ret = 0; + int32_t ret = -1; char path[PATH_MAX] = {0,}; glusterd_conf_t *priv = NULL; DIR *dir = NULL; @@ -2722,51 +2605,414 @@ glusterd_store_retrieve_volumes (xlator_t *this) GF_ASSERT (priv); - snprintf (path, PATH_MAX, "%s/%s", priv->workdir, - GLUSTERD_VOLUME_DIR_PREFIX); + if (snap) + snprintf (path, PATH_MAX, "%s/snaps/%s", priv->workdir, + snap->snapname); + else + snprintf (path, PATH_MAX, "%s/%s", priv->workdir, + GLUSTERD_VOLUME_DIR_PREFIX); dir = opendir (path); if (!dir) { gf_log ("", GF_LOG_ERROR, "Unable to open dir %s", path); - ret = -1; goto out; } glusterd_for_each_entry (entry, dir); while (entry) { - ret = glusterd_store_retrieve_volume (entry->d_name); - if (ret) { + if ( entry->d_type != DT_DIR ) + goto next; + + volinfo = glusterd_store_retrieve_volume (entry->d_name, snap); + if (!volinfo) { gf_log ("", GF_LOG_ERROR, "Unable to restore " "volume: %s", entry->d_name); + ret = -1; goto out; } - ret = glusterd_store_retrieve_rbstate (entry->d_name); + ret = glusterd_store_retrieve_rbstate (volinfo); if (ret) { /* Backward compatibility */ gf_log ("", GF_LOG_INFO, "Creating a new rbstate " "for volume: %s.", entry->d_name); - ret = glusterd_volinfo_find (entry->d_name, &volinfo); ret = glusterd_store_create_rbstate_shandle_on_absence (volinfo); ret = glusterd_store_perform_rbstate_store (volinfo); } - ret = glusterd_store_retrieve_node_state (entry->d_name); + ret = glusterd_store_retrieve_node_state (volinfo); if (ret) { /* Backward compatibility */ gf_log ("", GF_LOG_INFO, "Creating a new node_state " "for volume: %s.", entry->d_name); - ret = glusterd_volinfo_find (entry->d_name, &volinfo); - ret = glusterd_store_create_nodestate_sh_on_absence (volinfo); ret = glusterd_store_perform_node_state_store (volinfo); } + +next: + glusterd_for_each_entry (entry, dir); + } + + ret = 0; +out: + if (dir) + closedir (dir); + gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); + + return ret; +} + +int32_t +glusterd_resolve_snap_bricks (xlator_t *this, glusterd_snap_t *snap) +{ + int32_t ret = -1; + glusterd_volinfo_t *volinfo = NULL; + glusterd_brickinfo_t *brickinfo = NULL; + + GF_ASSERT (this); + GF_VALIDATE_OR_GOTO (this->name, snap, out); + + list_for_each_entry (volinfo, &snap->volumes, vol_list) { + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + ret = glusterd_resolve_brick (brickinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "resolve brick failed in restore"); + goto out; + } + } + } + + ret = 0; + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret); + + return ret; +} + +int +glusterd_store_update_snap (glusterd_snap_t *snap) +{ + int ret = -1; + char *key = NULL; + char *value = NULL; + char snappath[PATH_MAX] = {0,}; + char path[PATH_MAX] = {0,}; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; + gf_store_iter_t *iter = NULL; + gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; + + this = THIS; + conf = this->private; + GF_ASSERT (snap); + + GLUSTERD_GET_SNAP_DIR (snappath, snap, conf); + + snprintf (path, sizeof (path), "%s/%s", snappath, + GLUSTERD_SNAP_INFO_FILE); + + ret = gf_store_handle_retrieve (path, &snap->shandle); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "snap handle is NULL"); + goto out; + } + + ret = gf_store_iter_new (snap->shandle, &iter); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get new store " + "iter"); + goto out; + } + + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get next store " + "iter"); + goto out; + } + + while (!ret) { + gf_log (this->name, GF_LOG_DEBUG, "key = %s value = %s", + key, value); + + if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_ID, + strlen (GLUSTERD_STORE_KEY_SNAP_ID))) { + ret = uuid_parse (value, snap->snap_id); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Failed to parse uuid"); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_RESTORED, + strlen (GLUSTERD_STORE_KEY_SNAP_RESTORED))) { + snap->snap_restored = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_STATUS, + strlen (GLUSTERD_STORE_KEY_SNAP_STATUS))) { + snap->snap_status = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_DESC, + strlen (GLUSTERD_STORE_KEY_SNAP_DESC))) { + snap->description = gf_strdup (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_TIMESTAMP, + strlen (GLUSTERD_STORE_KEY_SNAP_TIMESTAMP))) { + snap->time_stamp = atoi (value); + } + + GF_FREE (key); + GF_FREE (value); + key = NULL; + value = NULL; + + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); + } + + if (op_errno != GD_STORE_EOF) + goto out; + + ret = gf_store_iter_destroy (iter); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to destroy store " + "iter"); + } + +out: + return ret; +} + +int32_t +glusterd_store_retrieve_snap (char *snapname) +{ + int32_t ret = -1; + dict_t *dict = NULL; + glusterd_snap_t *snap = NULL; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (snapname); + + dict = dict_new(); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create dict"); + ret = -1; + goto out; + } + + snap = glusterd_new_snap_object (); + if (!snap) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create " + " snap object"); + goto out; + } + + strncpy (snap->snapname, snapname, strlen(snapname)); + ret = glusterd_store_update_snap (snap); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to update snapshot " + "for %s snap", snapname); + goto out; + } + + ret = glusterd_store_retrieve_volumes (this, snap); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to retrieve " + "snap volumes for snap %s", snapname); + goto out; + } + + /* Unlike bricks of normal volumes which are resolved at the end of + the glusterd restore, the bricks belonging to the snap volumes of + each snap should be resolved as part of snapshot restore itself. + Because if the snapshot has to be removed, then resolving bricks + helps glusterd in understanding what all bricks have its own uuid + and killing those bricks. + */ + ret = glusterd_resolve_snap_bricks (this, snap); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "resolving the snap bricks" + " failed (snap: %s)", snap?snap->snapname:""); + + /* When the snapshot command from cli is received, the on disk and + in memory structures for the snapshot are created (with the status) + being marked as GD_SNAP_STATUS_INIT. Once the backend snapshot is + taken, the status is changed to GD_SNAP_STATUS_IN_USE. If glusterd + dies after taking the backend snapshot, but before updating the + status, then when glusterd comes up, it should treat that snapshot + as a failed snapshot and clean it up. + */ + if (snap->snap_status != GD_SNAP_STATUS_IN_USE) { + ret = glusterd_snap_remove (dict, snap, _gf_true, _gf_true); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "failed to remove" + " the snapshot %s", snap->snapname); + goto out; + } + + /* TODO: list_add_order can do 'N-square' comparisions and + is not efficient. Find a better solution to store the snap + in order */ + list_add_order (&snap->snap_list, &priv->snapshots, + glusterd_compare_snap_time); + +out: + if (dict) + dict_unref (dict); + + gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret); + return ret; +} + +/* Read the missed_snap_list and update the in-memory structs */ +int32_t +glusterd_store_retrieve_missed_snaps_list (xlator_t *this) +{ + char buf[PATH_MAX] = ""; + char path[PATH_MAX] = ""; + char *missed_node_info = NULL; + char *brick_path = NULL; + char *value = NULL; + char *save_ptr = NULL; + FILE *fp = NULL; + int32_t brick_num = -1; + int32_t snap_op = -1; + int32_t snap_status = -1; + int32_t ret = -1; + glusterd_conf_t *priv = NULL; + gf_store_op_errno_t store_errno = GD_STORE_SUCCESS; + + GF_ASSERT (this); + priv = this->private; + GF_ASSERT (priv); + + /* Get the path of the missed_snap_list */ + glusterd_store_missed_snaps_list_path_set (path, sizeof(path)); + + fp = fopen (path, "r"); + if (!fp) { + /* If errno is ENOENT then there are no missed snaps yet */ + if (errno != ENOENT) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "Failed to open %s. " + "Error: %s", path, strerror(errno)); + } else { + gf_log (this->name, GF_LOG_INFO, + "No missed snaps list."); + ret = 0; + } + goto out; + } + + do { + ret = gf_store_read_and_tokenize (fp, buf, + &missed_node_info, &value, + &store_errno); + if (ret) { + if (store_errno == GD_STORE_EOF) { + gf_log (this->name, + GF_LOG_DEBUG, + "EOF for missed_snap_list"); + ret = 0; + break; + } + gf_log (this->name, GF_LOG_ERROR, + "Failed to fetch data from " + "missed_snaps_list. Error: %s", + gf_store_strerror (store_errno)); + goto out; + } + + /* Fetch the brick_num, brick_path, snap_op and snap status */ + brick_num = atoi(strtok_r (value, ":", &save_ptr)); + brick_path = strtok_r (NULL, ":", &save_ptr); + snap_op = atoi(strtok_r (NULL, ":", &save_ptr)); + snap_status = atoi(strtok_r (NULL, ":", &save_ptr)); + + if (!missed_node_info || !brick_path || + brick_num < 1 || snap_op < 1 || + snap_status < 1) { + gf_log (this->name, GF_LOG_ERROR, + "Invalid missed_snap_entry"); + ret = -1; + goto out; + } + + ret = glusterd_store_missed_snaps_list (missed_node_info, + brick_num, + brick_path, + snap_op, + snap_status); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to store missed snaps_list"); + goto out; + } + + } while (store_errno == GD_STORE_SUCCESS); + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret); + return ret; +} + +int32_t +glusterd_store_retrieve_snaps (xlator_t *this) +{ + int32_t ret = 0; + char path[PATH_MAX] = {0,}; + glusterd_conf_t *priv = NULL; + DIR *dir = NULL; + struct dirent *entry = NULL; + + GF_ASSERT (this); + priv = this->private; + + GF_ASSERT (priv); + + snprintf (path, PATH_MAX, "%s/snaps", priv->workdir); + + dir = opendir (path); + + if (!dir) { + /* If snaps dir doesn't exists ignore the error for + backward compatibility */ + if (errno != ENOENT) { + ret = -1; + gf_log ("", GF_LOG_ERROR, "Unable to open dir %s", path); + } + goto out; + } + + glusterd_for_each_entry (entry, dir); + + while (entry) { + if (entry->d_type == DT_DIR) { + ret = glusterd_store_retrieve_snap (entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to restore snapshot: %s", + entry->d_name); + goto out; + } + } + glusterd_for_each_entry (entry, dir); } + /* Retrieve missed_snaps_list */ + ret = glusterd_store_retrieve_missed_snaps_list (this); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Failed to retrieve missed_snaps_list"); + goto out; + } + out: if (dir) closedir (dir); @@ -2775,6 +3021,126 @@ out: return ret; } +/* Writes all the contents of conf->missed_snap_list */ +int32_t +glusterd_store_write_missed_snapinfo (int32_t fd) +{ + char value[PATH_MAX] = ""; + int32_t ret = -1; + glusterd_conf_t *priv = NULL; + glusterd_missed_snap_info *missed_snapinfo = NULL; + glusterd_snap_op_t *snap_opinfo = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + + priv = this->private; + GF_ASSERT (priv); + + /* Write the missed_snap_entry */ + list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list, + missed_snaps) { + list_for_each_entry (snap_opinfo, + &missed_snapinfo->snap_ops, + snap_ops_list) { + snprintf (value, sizeof(value), "%d:%s:%d:%d", + snap_opinfo->brick_num, + snap_opinfo->brick_path, + snap_opinfo->op, snap_opinfo->status); + ret = gf_store_save_value + (fd, + missed_snapinfo->node_snap_info, + value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to write missed snapinfo"); + goto out; + } + } + } + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +/* Adds the missed snap entries to the in-memory conf->missed_snap_list * + * and writes them to disk */ +int32_t +glusterd_store_update_missed_snaps (dict_t *dict, int32_t missed_snap_count) +{ + int32_t fd = -1; + int32_t ret = -1; + glusterd_conf_t *priv = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT(this); + GF_ASSERT(dict); + + priv = this->private; + GF_ASSERT (priv); + + if (missed_snap_count < 1) { + gf_log (this->name, GF_LOG_DEBUG, "No missed snaps"); + ret = 0; + goto out; + } + + ret = glusterd_store_create_missed_snaps_list_shandle_on_absence (); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Unable to obtain " + "missed_snaps_list store handle."); + goto out; + } + + fd = gf_store_mkstemp (priv->missed_snaps_list_shandle); + if (fd <= 0) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to create tmp file"); + ret = -1; + goto out; + } + + ret = glusterd_add_missed_snaps_to_list (dict, missed_snap_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add missed snaps to list"); + goto out; + } + + ret = glusterd_store_write_missed_snapinfo (fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to write missed snaps to disk"); + goto out; + } + + ret = gf_store_rename_tmppath (priv->missed_snaps_list_shandle); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to rename the tmp file"); + goto out; + } +out: + if (ret && (fd > 0)) { + ret = gf_store_unlink_tmppath (priv->missed_snaps_list_shandle); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to unlink the tmp file"); + } + ret = -1; + } + + if (fd > 0) + close (fd); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + int32_t glusterd_store_delete_peerinfo (glusterd_peerinfo_t *peerinfo) { @@ -2823,7 +3189,7 @@ glusterd_store_delete_peerinfo (glusterd_peerinfo_t *peerinfo) out: if (peerinfo->shandle) { - glusterd_store_handle_destroy (peerinfo->shandle); + gf_store_handle_destroy (peerinfo->shandle); peerinfo->shandle = NULL; } gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); @@ -2849,7 +3215,7 @@ glusterd_store_create_peer_dir () char path[PATH_MAX]; glusterd_store_peerinfo_dirpath_set (path, sizeof (path)); - ret = glusterd_store_mkdir (path); + ret = gf_store_mkdir (path); gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret); return ret; @@ -2893,8 +3259,8 @@ glusterd_store_peerinfo_hostname_shandle_create (glusterd_peerinfo_t *peerinfo) glusterd_store_hostname_peerpath_set (peerinfo, peerfpath, sizeof (peerfpath)); - ret = glusterd_store_handle_create_on_absence (&peerinfo->shandle, - peerfpath); + ret = gf_store_handle_create_on_absence (&peerinfo->shandle, + peerfpath); return ret; } @@ -2906,8 +3272,8 @@ glusterd_store_peerinfo_uuid_shandle_create (glusterd_peerinfo_t *peerinfo) glusterd_store_uuid_peerpath_set (peerinfo, peerfpath, sizeof (peerfpath)); - ret = glusterd_store_handle_create_on_absence (&peerinfo->shandle, - peerfpath); + ret = gf_store_handle_create_on_absence (&peerinfo->shandle, + peerfpath); return ret; } @@ -2923,7 +3289,7 @@ glusterd_peerinfo_hostname_shandle_check_destroy (glusterd_peerinfo_t *peerinfo) ret = stat (peerfpath, &stbuf); if (!ret) { if (peerinfo->shandle) - glusterd_store_handle_destroy (peerinfo->shandle); + gf_store_handle_destroy (peerinfo->shandle); peerinfo->shandle = NULL; ret = unlink (peerfpath); } @@ -2952,18 +3318,18 @@ glusterd_store_peer_write (int fd, glusterd_peerinfo_t *peerinfo) char buf[50] = {0}; int32_t ret = 0; - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_UUID, - uuid_utoa (peerinfo->uuid)); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_UUID, + uuid_utoa (peerinfo->uuid)); if (ret) goto out; snprintf (buf, sizeof (buf), "%d", peerinfo->state.state); - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_STATE, buf); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_STATE, buf); if (ret) goto out; - ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_HOSTNAME "1", - peerinfo->hostname); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_HOSTNAME "1", + peerinfo->hostname); if (ret) goto out; @@ -2980,7 +3346,7 @@ glusterd_store_perform_peer_store (glusterd_peerinfo_t *peerinfo) GF_ASSERT (peerinfo); - fd = glusterd_store_mkstemp (peerinfo->shandle); + fd = gf_store_mkstemp (peerinfo->shandle); if (fd <= 0) { ret = -1; goto out; @@ -2990,10 +3356,10 @@ glusterd_store_perform_peer_store (glusterd_peerinfo_t *peerinfo) if (ret) goto out; - ret = glusterd_store_rename_tmppath (peerinfo->shandle); + ret = gf_store_rename_tmppath (peerinfo->shandle); out: if (ret && (fd > 0)) - glusterd_store_unlink_tmppath (peerinfo->shandle); + gf_store_unlink_tmppath (peerinfo->shandle); if (fd > 0) close (fd); gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); @@ -3033,13 +3399,13 @@ glusterd_store_retrieve_peers (xlator_t *this) uuid_t uuid = {0,}; char *hostname = NULL; int32_t state = 0; - glusterd_store_handle_t *shandle = NULL; + gf_store_handle_t *shandle = NULL; char filepath[PATH_MAX] = {0,}; - glusterd_store_iter_t *iter = NULL; + gf_store_iter_t *iter = NULL; char *key = NULL; char *value = NULL; glusterd_peerctx_args_t args = {0}; - glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS; + gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; GF_ASSERT (this); priv = this->private; @@ -3061,16 +3427,15 @@ glusterd_store_retrieve_peers (xlator_t *this) while (entry) { snprintf (filepath, PATH_MAX, "%s/%s", path, entry->d_name); - ret = glusterd_store_handle_retrieve (filepath, &shandle); + ret = gf_store_handle_retrieve (filepath, &shandle); if (ret) goto out; - ret = glusterd_store_iter_new (shandle, &iter); + ret = gf_store_iter_new (shandle, &iter); if (ret) goto out; - ret = glusterd_store_iter_get_next (iter, &key, &value, - &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, &op_errno); if (ret) goto out; @@ -3098,13 +3463,13 @@ glusterd_store_retrieve_peers (xlator_t *this) key = NULL; value = NULL; - ret = glusterd_store_iter_get_next (iter, &key, &value, - &op_errno); + ret = gf_store_iter_get_next (iter, &key, &value, + &op_errno); } if (op_errno != GD_STORE_EOF) goto out; - (void) glusterd_store_iter_destroy (iter); + (void) gf_store_iter_destroy (iter); ret = glusterd_friend_add (hostname, 0, state, &uuid, &peerinfo, 1, NULL); @@ -3177,7 +3542,11 @@ glusterd_restore () goto out; } - ret = glusterd_store_retrieve_volumes (this); + ret = glusterd_store_retrieve_volumes (this, NULL); + if (ret) + goto out; + + ret = glusterd_store_retrieve_snaps (this); if (ret) goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index 762604e23..1b5cebc0c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -35,38 +35,55 @@ typedef enum glusterd_store_ver_ac_{ } glusterd_volinfo_ver_ac_t; -#define GLUSTERD_STORE_UUID_KEY "UUID" - -#define GLUSTERD_STORE_KEY_VOL_TYPE "type" -#define GLUSTERD_STORE_KEY_VOL_COUNT "count" -#define GLUSTERD_STORE_KEY_VOL_STATUS "status" -#define GLUSTERD_STORE_KEY_VOL_PORT "port" -#define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count" -#define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count" -#define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count" -#define GLUSTERD_STORE_KEY_VOL_BRICK "brick" -#define GLUSTERD_STORE_KEY_VOL_VERSION "version" -#define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type" -#define GLUSTERD_STORE_KEY_VOL_ID "volume-id" -#define GLUSTERD_STORE_KEY_RB_STATUS "rb_status" -#define GLUSTERD_STORE_KEY_RB_SRC_BRICK "rb_src" -#define GLUSTERD_STORE_KEY_RB_DST_BRICK "rb_dst" -#define GLUSTERD_STORE_KEY_RB_DST_PORT "rb_port" -#define GLUSTERD_STORE_KEY_VOL_DEFRAG "rebalance_status" -#define GLUSTERD_STORE_KEY_DEFRAG_OP "rebalance_op" -#define GLUSTERD_STORE_KEY_USERNAME "username" -#define GLUSTERD_STORE_KEY_PASSWORD "password" - -#define GLUSTERD_STORE_KEY_BRICK_HOSTNAME "hostname" -#define GLUSTERD_STORE_KEY_BRICK_PATH "path" -#define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port" -#define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port" +#define GLUSTERD_STORE_UUID_KEY "UUID" + +#define GLUSTERD_STORE_KEY_VOL_TYPE "type" +#define GLUSTERD_STORE_KEY_VOL_COUNT "count" +#define GLUSTERD_STORE_KEY_VOL_STATUS "status" +#define GLUSTERD_STORE_KEY_VOL_PORT "port" +#define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count" +#define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count" +#define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count" +#define GLUSTERD_STORE_KEY_VOL_BRICK "brick" +#define GLUSTERD_STORE_KEY_VOL_VERSION "version" +#define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type" +#define GLUSTERD_STORE_KEY_VOL_ID "volume-id" +#define GLUSTERD_STORE_KEY_VOL_IS_RESTORED "is-volume-restored" +#define GLUSTERD_STORE_KEY_RB_STATUS "rb_status" +#define GLUSTERD_STORE_KEY_RB_SRC_BRICK "rb_src" +#define GLUSTERD_STORE_KEY_RB_DST_BRICK "rb_dst" +#define GLUSTERD_STORE_KEY_RB_DST_PORT "rb_port" +#define GLUSTERD_STORE_KEY_VOL_DEFRAG "rebalance_status" +#define GLUSTERD_STORE_KEY_DEFRAG_OP "rebalance_op" +#define GLUSTERD_STORE_KEY_USERNAME "username" +#define GLUSTERD_STORE_KEY_PASSWORD "password" +#define GLUSTERD_STORE_KEY_PARENT_VOLNAME "parent_volname" +#define GLUSTERD_STORE_KEY_VOL_OP_VERSION "op-version" +#define GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION "client-op-version" + +#define GLUSTERD_STORE_KEY_SNAP_NAME "name" +#define GLUSTERD_STORE_KEY_SNAP_ID "snap-id" +#define GLUSTERD_STORE_KEY_SNAP_DESC "desc" +#define GLUSTERD_STORE_KEY_SNAP_TIMESTAMP "time-stamp" +#define GLUSTERD_STORE_KEY_SNAP_STATUS "status" +#define GLUSTERD_STORE_KEY_SNAP_RESTORED "snap-restored" +#define GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT "snap-max-hard-limit" +#define GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT "snap-max-soft-limit" + +#define GLUSTERD_STORE_KEY_BRICK_HOSTNAME "hostname" +#define GLUSTERD_STORE_KEY_BRICK_PATH "path" +#define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port" +#define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port" #define GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED "decommissioned" +#define GLUSTERD_STORE_KEY_BRICK_VGNAME "vg" +#define GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH "device_path" +#define GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS "snap-status" -#define GLUSTERD_STORE_KEY_PEER_UUID "uuid" -#define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname" -#define GLUSTERD_STORE_KEY_PEER_STATE "state" -#define GLUSTERD_STORE_KEY_VOL_BACKEND "backend" +#define GLUSTERD_STORE_KEY_PEER_UUID "uuid" +#define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname" +#define GLUSTERD_STORE_KEY_PEER_STATE "state" + +#define GLUSTERD_STORE_KEY_VOL_CAPS "caps" #define glusterd_for_each_entry(entry, dir) \ do {\ @@ -74,6 +91,7 @@ typedef enum glusterd_store_ver_ac_{ if (dir) {\ entry = readdir (dir);\ while (entry && (!strcmp (entry->d_name, ".") ||\ + !fnmatch ("*.tmp", entry->d_name, 0) ||\ !strcmp (entry->d_name, ".."))) {\ entry = readdir (dir);\ }\ @@ -81,16 +99,6 @@ typedef enum glusterd_store_ver_ac_{ } while (0); \ -typedef enum { - GD_STORE_SUCCESS, - GD_STORE_KEY_NULL, - GD_STORE_VALUE_NULL, - GD_STORE_KEY_VALUE_NULL, - GD_STORE_EOF, - GD_STORE_ENOMEM, - GD_STORE_STAT_FAILED -} glusterd_store_op_errno_t; - int32_t glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t ac); @@ -98,14 +106,7 @@ int32_t glusterd_store_delete_volume (glusterd_volinfo_t *volinfo); int32_t -glusterd_store_handle_new (char *path, glusterd_store_handle_t **handle); - -int32_t -glusterd_store_save_value (int fd, char *key, char *value); - -int32_t -glusterd_store_retrieve_value (glusterd_store_handle_t *handle, - char *key, char **value); +glusterd_store_delete_snap (glusterd_snap_t *snap); int32_t glusterd_retrieve_uuid (); @@ -121,9 +122,6 @@ glusterd_store_delete_brick (glusterd_brickinfo_t *brickinfo, char *delete_path); int32_t -glusterd_store_handle_destroy (glusterd_store_handle_t *handle); - -int32_t glusterd_restore (); void @@ -145,5 +143,22 @@ int32_t glusterd_store_retrieve_options (xlator_t *this); int32_t +glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo); + +int32_t glusterd_store_options (xlator_t *this, dict_t *opts); + +void +glusterd_replace_slash_with_hyphen (char *str); + +int32_t +glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo); + +int32_t +glusterd_store_snap (glusterd_snap_t *snap); + +int32_t +glusterd_store_update_missed_snaps (dict_t *dict, + int32_t missed_snap_count); + #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c index 7b0c28baf..438df8266 100644 --- a/xlators/mgmt/glusterd/src/glusterd-syncop.c +++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c @@ -17,8 +17,11 @@ #include "glusterd.h" #include "glusterd-op-sm.h" #include "glusterd-utils.h" +#include "glusterd-locks.h" -static inline void +extern glusterd_op_info_t opinfo; + +void gd_synctask_barrier_wait (struct syncargs *args, int count) { glusterd_conf_t *conf = THIS->private; @@ -31,18 +34,135 @@ gd_synctask_barrier_wait (struct syncargs *args, int count) } static void -gd_collate_errors (struct syncargs *args, int op_ret, int op_errno, - char *op_errstr) +gd_mgmt_v3_collate_errors (struct syncargs *args, int op_ret, int op_errno, + char *op_errstr, int op_code, + glusterd_peerinfo_t *peerinfo, u_char *uuid) { - if (args->op_ret) - return; - args->op_ret = op_ret; - args->op_errno = op_errno; - if (op_ret && op_errstr && strcmp (op_errstr, "")) - args->errstr = gf_strdup (op_errstr); + char err_str[PATH_MAX] = "Please check log file for details."; + char op_err[PATH_MAX] = ""; + int len = -1; + char *peer_str = NULL; + + if (op_ret) { + args->op_ret = op_ret; + args->op_errno = op_errno; + + if (peerinfo) + peer_str = peerinfo->hostname; + else + peer_str = uuid_utoa (uuid); + + if (op_errstr && strcmp (op_errstr, "")) { + len = snprintf (err_str, sizeof(err_str) - 1, + "Error: %s", op_errstr); + err_str[len] = '\0'; + } + + switch (op_code){ + case GLUSTERD_MGMT_V3_LOCK: + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Locking failed " + "on %s. %s", peer_str, err_str); + break; + } + case GLUSTERD_MGMT_V3_UNLOCK: + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Unlocking failed " + "on %s. %s", peer_str, err_str); + break; + } + } + op_err[len] = '\0'; + + if (args->errstr) { + len = snprintf (err_str, sizeof(err_str) - 1, + "%s\n%s", args->errstr, + op_err); + GF_FREE (args->errstr); + args->errstr = NULL; + } else + len = snprintf (err_str, sizeof(err_str) - 1, + "%s", op_err); + err_str[len] = '\0'; + + gf_log ("", GF_LOG_ERROR, "%s", op_err); + args->errstr = gf_strdup (err_str); + } + + return; } static void +gd_collate_errors (struct syncargs *args, int op_ret, int op_errno, + char *op_errstr, int op_code, + glusterd_peerinfo_t *peerinfo, u_char *uuid) +{ + char err_str[PATH_MAX] = "Please check log file for details."; + char op_err[PATH_MAX] = ""; + int len = -1; + char *peer_str = NULL; + + if (op_ret) { + args->op_ret = op_ret; + args->op_errno = op_errno; + + if (peerinfo) + peer_str = peerinfo->hostname; + else + peer_str = uuid_utoa (uuid); + + if (op_errstr && strcmp (op_errstr, "")) { + len = snprintf (err_str, sizeof(err_str) - 1, + "Error: %s", op_errstr); + err_str[len] = '\0'; + } + + switch (op_code){ + case GLUSTERD_MGMT_CLUSTER_UNLOCK : + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Unlocking failed on %s. %s", + peer_str, err_str); + break; + } + case GLUSTERD_MGMT_STAGE_OP : + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Staging failed on %s. %s", + peer_str, err_str); + break; + } + case GLUSTERD_MGMT_COMMIT_OP : + { + len = snprintf (op_err, sizeof(op_err) - 1, + "Commit failed on %s. %s", + peer_str, err_str); + break; + } + } + op_err[len] = '\0'; + + if (args->errstr) { + len = snprintf (err_str, sizeof(err_str) - 1, + "%s\n%s", args->errstr, + op_err); + GF_FREE (args->errstr); + args->errstr = NULL; + } else + len = snprintf (err_str, sizeof(err_str) - 1, + "%s", op_err); + err_str[len] = '\0'; + + gf_log ("", GF_LOG_ERROR, "%s", op_err); + args->errstr = gf_strdup (err_str); + } + + return; +} + +void gd_syncargs_init (struct syncargs *args, dict_t *op_ctx) { args->dict = op_ctx; @@ -144,8 +264,9 @@ out: /* Defined in glusterd-rpc-ops.c */ extern struct rpc_clnt_program gd_mgmt_prog; extern struct rpc_clnt_program gd_brick_prog; +extern struct rpc_clnt_program gd_mgmt_v3_prog; -static int +int glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp) { int ret = 0; @@ -169,6 +290,9 @@ glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp) goto out; break; + case GD_OP_GSYNC_CREATE: + break; + case GD_OP_GSYNC_SET: ret = glusterd_gsync_use_rsp_dict (aggr, rsp, NULL); if (ret) @@ -203,6 +327,18 @@ glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp) break; + case GD_OP_SYS_EXEC: + ret = glusterd_sys_exec_output_rsp_dict (aggr, rsp); + if (ret) + goto out; + break; + + case GD_OP_SNAP: + ret = glusterd_snap_use_rsp_dict (aggr, rsp); + if (ret) + goto out; + break; + default: break; } @@ -246,7 +382,8 @@ _gd_syncop_mgmt_lock_cbk (struct rpc_req *req, struct iovec *iov, op_ret = rsp.op_ret; op_errno = rsp.op_errno; out: - gd_collate_errors (args, op_ret, op_errno, NULL); + gd_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_CLUSTER_LOCK, peerinfo, rsp.uuid); STACK_DESTROY (frame->root); synctask_barrier_wake(args); return 0; @@ -280,6 +417,183 @@ gd_syncop_mgmt_lock (glusterd_peerinfo_t *peerinfo, struct syncargs *args, } int32_t +gd_syncop_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int ret = -1; + struct syncargs *args = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + gd1_mgmt_v3_lock_rsp rsp = {{0},}; + call_frame_t *frame = NULL; + int op_ret = -1; + int op_errno = -1; + + GF_ASSERT(req); + GF_ASSERT(iov); + GF_ASSERT(myframe); + + frame = myframe; + args = frame->local; + peerinfo = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp); + if (ret < 0) + goto out; + + uuid_copy (args->uuid, rsp.uuid); + + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; +out: + gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_V3_LOCK, + peerinfo, rsp.uuid); + STACK_DESTROY (frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_syncop_mgmt_v3_lock_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + gd_syncop_mgmt_v3_lock_cbk_fn); +} + +int +gd_syncop_mgmt_v3_lock (glusterd_op_t op, dict_t *op_ctx, + glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid, uuid_t txn_id) +{ + int ret = -1; + gd1_mgmt_v3_lock_req req = {{0},}; + glusterd_conf_t *conf = THIS->private; + + GF_ASSERT(op_ctx); + GF_ASSERT(peerinfo); + GF_ASSERT(args); + + ret = dict_allocate_and_serialize (op_ctx, + &req.dict.dict_val, + &req.dict.dict_len); + if (ret) + goto out; + + uuid_copy (req.uuid, my_uuid); + uuid_copy (req.txn_id, txn_id); + req.op = op; + synclock_unlock (&conf->big_lock); + ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, + &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_LOCK, + gd_syncop_mgmt_v3_lock_cbk, + (xdrproc_t) xdr_gd1_mgmt_v3_lock_req); + synclock_lock (&conf->big_lock); +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int32_t +gd_syncop_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int ret = -1; + struct syncargs *args = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + gd1_mgmt_v3_unlock_rsp rsp = {{0},}; + call_frame_t *frame = NULL; + int op_ret = -1; + int op_errno = -1; + + GF_ASSERT(req); + GF_ASSERT(iov); + GF_ASSERT(myframe); + + frame = myframe; + args = frame->local; + peerinfo = frame->cookie; + frame->local = NULL; + frame->cookie = NULL; + + if (-1 == req->rpc_status) { + op_errno = ENOTCONN; + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp); + if (ret < 0) + goto out; + + uuid_copy (args->uuid, rsp.uuid); + + /* Set peer as locked, so we unlock only the locked peers */ + if (rsp.op_ret == 0) + peerinfo->locked = _gf_true; + op_ret = rsp.op_ret; + op_errno = rsp.op_errno; +out: + gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_V3_UNLOCK, + peerinfo, rsp.uuid); + STACK_DESTROY (frame->root); + synctask_barrier_wake(args); + return 0; +} + +int32_t +gd_syncop_mgmt_v3_unlock_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + return glusterd_big_locked_cbk (req, iov, count, myframe, + gd_syncop_mgmt_v3_unlock_cbk_fn); +} + +int +gd_syncop_mgmt_v3_unlock (dict_t *op_ctx, glusterd_peerinfo_t *peerinfo, + struct syncargs *args, uuid_t my_uuid, + uuid_t recv_uuid, uuid_t txn_id) +{ + int ret = -1; + gd1_mgmt_v3_unlock_req req = {{0},}; + glusterd_conf_t *conf = THIS->private; + + GF_ASSERT(op_ctx); + GF_ASSERT(peerinfo); + GF_ASSERT(args); + + ret = dict_allocate_and_serialize (op_ctx, + &req.dict.dict_val, + &req.dict.dict_len); + if (ret) + goto out; + + uuid_copy (req.uuid, my_uuid); + uuid_copy (req.txn_id, txn_id); + synclock_unlock (&conf->big_lock); + ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo, + &gd_mgmt_v3_prog, + GLUSTERD_MGMT_V3_UNLOCK, + gd_syncop_mgmt_v3_unlock_cbk, + (xdrproc_t) xdr_gd1_mgmt_v3_unlock_req); + synclock_lock (&conf->big_lock); +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int32_t _gd_syncop_mgmt_unlock_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe) { @@ -312,7 +626,8 @@ _gd_syncop_mgmt_unlock_cbk (struct rpc_req *req, struct iovec *iov, op_ret = rsp.op_ret; op_errno = rsp.op_errno; out: - gd_collate_errors (args, op_ret, op_errno, NULL); + gd_collate_errors (args, op_ret, op_errno, NULL, + GLUSTERD_MGMT_CLUSTER_UNLOCK, peerinfo, rsp.uuid); STACK_DESTROY (frame->root); synctask_barrier_wake(args); return 0; @@ -356,6 +671,7 @@ _gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov, xlator_t *this = NULL; dict_t *rsp_dict = NULL; call_frame_t *frame = NULL; + glusterd_peerinfo_t *peerinfo = NULL; int op_ret = -1; int op_errno = -1; @@ -389,6 +705,15 @@ _gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov, } } + ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, "Staging response " + "for 'Volume %s' received from unknown " + "peer: %s", gd_op_list[rsp.op], + uuid_utoa (rsp.uuid)); + goto out; + } + uuid_copy (args->uuid, rsp.uuid); if (rsp.op == GD_OP_REPLACE_BRICK) { pthread_mutex_lock (&args->lock_dict); @@ -407,7 +732,9 @@ _gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov, op_errno = rsp.op_errno; out: - gd_collate_errors (args, op_ret, op_errno, rsp.op_errstr); + gd_collate_errors (args, op_ret, op_errno, rsp.op_errstr, + GLUSTERD_MGMT_STAGE_OP, peerinfo, rsp.uuid); + if (rsp_dict) dict_unref (rsp_dict); @@ -555,10 +882,12 @@ gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode, GD_SYNCOP (rpc, (&args), NULL, gd_syncop_brick_op_cbk, req, &gd_brick_prog, req->op, xdr_gd1_mgmt_brick_op_req); - if (args.errstr && errstr) - *errstr = args.errstr; - else - GF_FREE (args.errstr); + if (args.errstr) { + if ((strlen(args.errstr) > 0) && errstr) + *errstr = args.errstr; + else + GF_FREE (args.errstr); + } if (GD_OP_STATUS_VOLUME == op) { ret = dict_set_int32 (args.dict, "index", pnode->index); @@ -588,14 +917,15 @@ int32_t _gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe) { - int ret = -1; - gd1_mgmt_commit_op_rsp rsp = {{0},}; - struct syncargs *args = NULL; - xlator_t *this = NULL; - dict_t *rsp_dict = NULL; - call_frame_t *frame = NULL; - int op_ret = -1; - int op_errno = -1; + int ret = -1; + gd1_mgmt_commit_op_rsp rsp = {{0},}; + struct syncargs *args = NULL; + xlator_t *this = NULL; + dict_t *rsp_dict = NULL; + call_frame_t *frame = NULL; + glusterd_peerinfo_t *peerinfo = NULL; + int op_ret = -1; + int op_errno = -1; this = THIS; frame = myframe; @@ -628,6 +958,15 @@ _gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov, } } + ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, "Commit response " + "for 'Volume %s' received from unknown " + "peer: %s", gd_op_list[rsp.op], + uuid_utoa (rsp.uuid)); + goto out; + } + uuid_copy (args->uuid, rsp.uuid); pthread_mutex_lock (&args->lock_dict); { @@ -642,8 +981,10 @@ _gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov, op_ret = rsp.op_ret; op_errno = rsp.op_errno; + out: - gd_collate_errors (args, op_ret, op_errno, rsp.op_errstr); + gd_collate_errors (args, op_ret, op_errno, rsp.op_errstr, + GLUSTERD_MGMT_COMMIT_OP, peerinfo, rsp.uuid); if (rsp_dict) dict_unref (rsp_dict); @@ -716,8 +1057,8 @@ gd_build_peers_list (struct list_head *peers, struct list_head *xact_peers, } int -gd_lock_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx, - char **op_errstr, int npeers) +gd_lock_op_phase (glusterd_conf_t *conf, glusterd_op_t op, dict_t *op_ctx, + char **op_errstr, int npeers, uuid_t txn_id) { int ret = -1; int peer_cnt = 0; @@ -725,6 +1066,9 @@ gd_lock_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx, xlator_t *this = NULL; glusterd_peerinfo_t *peerinfo = NULL; struct syncargs args = {0}; + struct list_head *peers = NULL; + + peers = &conf->xaction_peers; if (!npeers) { ret = 0; @@ -735,22 +1079,38 @@ gd_lock_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx, synctask_barrier_init((&args)); peer_cnt = 0; list_for_each_entry (peerinfo, peers, op_peers_list) { - /* Reset lock status */ - peerinfo->locked = _gf_false; - gd_syncop_mgmt_lock (peerinfo, &args, MY_UUID, peer_uuid); + if (conf->op_version < 3) { + /* Reset lock status */ + peerinfo->locked = _gf_false; + gd_syncop_mgmt_lock (peerinfo, &args, + MY_UUID, peer_uuid); + } else + gd_syncop_mgmt_v3_lock (op, op_ctx, peerinfo, &args, + MY_UUID, peer_uuid, txn_id); peer_cnt++; } gd_synctask_barrier_wait((&args), peer_cnt); - ret = args.op_ret; - if (ret) { - gf_asprintf (op_errstr, "Another transaction could be " - "in progress. Please try again after " - "sometime."); - gf_log (this->name, GF_LOG_ERROR, "Failed to acquire lock"); - goto out; + + if (args.op_ret) { + if (args.errstr) + *op_errstr = gf_strdup (args.errstr); + else { + ret = gf_asprintf (op_errstr, "Another transaction could be " + "in progress. Please try again after " + "sometime."); + if (ret == -1) + *op_errstr = NULL; + + gf_log (this->name, GF_LOG_ERROR, + "Failed to acquire lock"); + + } } - ret = 0; + ret = args.op_ret; + + gf_log (this->name, GF_LOG_DEBUG, "Sent lock op req for 'Volume %s' " + "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret); out: return ret; } @@ -816,12 +1176,16 @@ stage_done: peer_cnt++; } gd_synctask_barrier_wait((&args), peer_cnt); - ret = args.op_ret; - if (dict_get_str (op_ctx, "errstr", &errstr) == 0) + + if (args.errstr) + *op_errstr = gf_strdup (args.errstr); + else if (dict_get_str (op_ctx, "errstr", &errstr) == 0) *op_errstr = gf_strdup (errstr); - else if (args.errstr) - *op_errstr = gf_strdup (args.errstr); + ret = args.op_ret; + + gf_log (this->name, GF_LOG_DEBUG, "Sent stage op req for 'Volume %s' " + "to %d peers", gd_op_list[op], peer_cnt); out: if (rsp_dict) dict_unref (rsp_dict); @@ -881,6 +1245,7 @@ commit_done: ret = 0; goto out; } + gd_syncargs_init (&args, op_ctx); synctask_barrier_init((&args)); peer_cnt = 0; @@ -892,24 +1257,31 @@ commit_done: } gd_synctask_barrier_wait((&args), peer_cnt); ret = args.op_ret; - if (dict_get_str (op_ctx, "errstr", &errstr) == 0) + if (args.errstr) + *op_errstr = gf_strdup (args.errstr); + else if (dict_get_str (op_ctx, "errstr", &errstr) == 0) *op_errstr = gf_strdup (errstr); - else if (args.errstr) - *op_errstr = gf_strdup (args.errstr); + gf_log (this->name, GF_LOG_DEBUG, "Sent commit op req for 'Volume %s' " + "to %d peers", gd_op_list[op], peer_cnt); out: if (!ret) glusterd_op_modify_op_ctx (op, op_ctx); if (rsp_dict) dict_unref (rsp_dict); + + GF_FREE (args.errstr); + args.errstr = NULL; + return ret; } int -gd_unlock_op_phase (struct list_head *peers, glusterd_op_t op, int op_ret, +gd_unlock_op_phase (glusterd_conf_t *conf, glusterd_op_t op, int op_ret, rpcsvc_request_t *req, dict_t *op_ctx, char *op_errstr, - int npeers) + int npeers, char *volname, gf_boolean_t is_acquired, + uuid_t txn_id) { glusterd_peerinfo_t *peerinfo = NULL; glusterd_peerinfo_t *tmp = NULL; @@ -918,23 +1290,34 @@ gd_unlock_op_phase (struct list_head *peers, glusterd_op_t op, int op_ret, int ret = -1; xlator_t *this = NULL; struct syncargs args = {0}; + struct list_head *peers = NULL; + + peers = &conf->xaction_peers; if (!npeers) { ret = 0; goto out; } + /* If the lock has not been held during this + * transaction, do not send unlock requests */ + if (!is_acquired) + goto out; + this = THIS; synctask_barrier_init((&args)); peer_cnt = 0; list_for_each_entry_safe (peerinfo, tmp, peers, op_peers_list) { - /* Only unlock peers that were locked */ - if (peerinfo->locked) { - gd_syncop_mgmt_unlock (peerinfo, &args, MY_UUID, - tmp_uuid); - peer_cnt++; - } - + if (conf->op_version < 3) { + /* Only unlock peers that were locked */ + if (peerinfo->locked) + gd_syncop_mgmt_unlock (peerinfo, &args, + MY_UUID, tmp_uuid); + } else + gd_syncop_mgmt_v3_unlock (op_ctx, peerinfo, + &args, MY_UUID, + tmp_uuid, txn_id); + peer_cnt++; list_del_init (&peerinfo->op_peers_list); } gd_synctask_barrier_wait((&args), peer_cnt); @@ -947,7 +1330,20 @@ gd_unlock_op_phase (struct list_head *peers, glusterd_op_t op, int op_ret, out: glusterd_op_send_cli_response (op, op_ret, 0, req, op_ctx, op_errstr); glusterd_op_clear_op (op); - glusterd_unlock (MY_UUID); + if (is_acquired) { + /* Based on the op-version, we release * + * the cluster or mgmt_v3 lock */ + if (conf->op_version < 3) + glusterd_unlock (MY_UUID); + else { + ret = glusterd_mgmt_v3_unlock (volname, MY_UUID, + "vol"); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to release lock for %s", + volname); + } + } return 0; } @@ -964,7 +1360,8 @@ gd_get_brick_count (struct list_head *bricks) } int -gd_brick_op_phase (glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict, char **op_errstr) +gd_brick_op_phase (glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict, + char **op_errstr) { glusterd_pending_node_t *pending_node = NULL; struct list_head selected = {0,}; @@ -1036,20 +1433,63 @@ out: void gd_sync_task_begin (dict_t *op_ctx, rpcsvc_request_t * req) { - int ret = -1; - int npeers = 0; - dict_t *req_dict = NULL; - glusterd_conf_t *conf = NULL; - glusterd_op_t op = 0; - int32_t tmp_op = 0; - char *op_errstr = NULL; - xlator_t *this = NULL; + int ret = -1; + int npeers = 0; + dict_t *req_dict = NULL; + glusterd_conf_t *conf = NULL; + glusterd_op_t op = 0; + int32_t tmp_op = 0; + char *op_errstr = NULL; + char *tmp = NULL; + char *volname = NULL; + xlator_t *this = NULL; + gf_boolean_t is_acquired = _gf_false; + uuid_t *txn_id = NULL; + uuid_t *originator_uuid = NULL; + glusterd_op_info_t txn_opinfo; this = THIS; GF_ASSERT (this); conf = this->private; GF_ASSERT (conf); + /* Generate a transaction-id for this operation and + * save it in the dict */ + txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); + if (!txn_id) { + ret = -1; + goto out; + } + + uuid_generate (*txn_id); + + ret = dict_set_bin (op_ctx, "transaction_id", + txn_id, sizeof(*txn_id)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set transaction id."); + goto out; + } else + gf_log (this->name, GF_LOG_DEBUG, + "Transaction_id = %s", uuid_utoa (*txn_id)); + + /* Save the MY_UUID as the originator_uuid */ + originator_uuid = GF_CALLOC (1, sizeof(uuid_t), + gf_common_mt_uuid_t); + if (!originator_uuid) { + ret = -1; + goto out; + } + + uuid_copy (*originator_uuid, MY_UUID); + ret = dict_set_bin (op_ctx, "originator_uuid", + originator_uuid, sizeof (uuid_t)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set originator_uuid."); + goto out; + } + ret = dict_get_int32 (op_ctx, GD_SYNC_OPCODE_KEY, &tmp_op); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Failed to get volume " @@ -1058,24 +1498,77 @@ gd_sync_task_begin (dict_t *op_ctx, rpcsvc_request_t * req) } op = tmp_op; - ret = glusterd_lock (MY_UUID); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Unable to acquire lock"); - gf_asprintf (&op_errstr, "Another transaction is in progress. " - "Please try again after sometime."); - goto out; + + /* Based on the op_version, acquire a cluster or mgmt_v3 lock */ + if (conf->op_version < 3) { + ret = glusterd_lock (MY_UUID); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to acquire lock"); + gf_asprintf (&op_errstr, + "Another transaction is in progress. " + "Please try again after sometime."); + goto out; + } + } else { + + /* If no volname is given as a part of the command, locks will + * not be held */ + ret = dict_get_str (op_ctx, "volname", &tmp); + if (ret) { + gf_log ("", GF_LOG_DEBUG, "Failed to get volume " + "name"); + goto local_locking_done; + } else { + /* Use a copy of volname, as cli response will be + * sent before the unlock, and the volname in the + * dict, might be removed */ + volname = gf_strdup (tmp); + if (!volname) + goto out; + } + + ret = glusterd_mgmt_v3_lock (volname, MY_UUID, "vol"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to acquire lock for %s", volname); + gf_asprintf (&op_errstr, + "Another transaction is in progress " + "for %s. Please try again after sometime.", + volname); + goto out; + } } - /* storing op globally to access in synctask code paths - * This is still acceptable, as we are performing this under - * the 'cluster' lock*/ - glusterd_op_set_op (op); - INIT_LIST_HEAD (&conf->xaction_peers); - npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op); + is_acquired = _gf_true; - ret = gd_lock_op_phase (&conf->xaction_peers, op, op_ctx, &op_errstr, npeers); +local_locking_done: + + /* Save opinfo for this transaction with the transaction id */ + glusterd_txn_opinfo_init (&txn_opinfo, NULL, &op, NULL, NULL); + ret = glusterd_set_txn_opinfo (txn_id, &txn_opinfo); if (ret) - goto out; + gf_log (this->name, GF_LOG_ERROR, + "Unable to set transaction's opinfo"); + + opinfo = txn_opinfo; + + INIT_LIST_HEAD (&conf->xaction_peers); + + /* Make 'volume status tasks' command a local operation. + * This is accomplished by setting npeers to 0. + */ + if (!glusterd_is_status_tasks_op (op, op_ctx)) + npeers = gd_build_peers_list (&conf->peers, + &conf->xaction_peers, op); + + /* If no volname is given as a part of the command, locks will + * not be held */ + if (volname) { + ret = gd_lock_op_phase (conf, op, op_ctx, &op_errstr, npeers, *txn_id); + if (ret) + goto out; + } ret = glusterd_op_build_payload (&req_dict, &op_errstr, op_ctx); if (ret) { @@ -1102,14 +1595,25 @@ gd_sync_task_begin (dict_t *op_ctx, rpcsvc_request_t * req) ret = 0; out: - (void) gd_unlock_op_phase (&conf->xaction_peers, op, ret, req, - op_ctx, op_errstr, npeers); + (void) gd_unlock_op_phase (conf, op, ret, req, op_ctx, op_errstr, + npeers, volname, is_acquired, *txn_id); + + /* Clearing the transaction opinfo */ + ret = glusterd_clear_txn_opinfo (txn_id); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Unable to clear transaction's opinfo"); + + if (volname) + GF_FREE (volname); if (req_dict) dict_unref (req_dict); - if (op_errstr) + if (op_errstr) { GF_FREE (op_errstr); + op_errstr = NULL; + } return; } diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h index 7e4d47f9a..e83ea2f4c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-syncop.h +++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h @@ -12,6 +12,7 @@ #include "syncop.h" #include "glusterd-sm.h" +#include "glusterd.h" #define GD_SYNC_OPCODE_KEY "sync-mgmt-operation" @@ -51,4 +52,20 @@ int gd_syncop_mgmt_stage_op (struct rpc_clnt *rpc, struct syncargs *arg, int gd_syncop_mgmt_commit_op (struct rpc_clnt *rpc, struct syncargs *arg, uuid_t my_uuid, uuid_t recv_uuid, int op, dict_t *dict_out, dict_t *op_ctx); + +void +gd_synctask_barrier_wait (struct syncargs *args, int count); + +int +gd_build_peers_list (struct list_head *peers, struct list_head *xact_peers, + glusterd_op_t op); +int +gd_brick_op_phase (glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict, + char **op_errstr); + +int +glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp); + +void +gd_syncargs_init (struct syncargs *args, dict_t *op_ctx); #endif /* __RPC_SYNCOP_H */ diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index ea5799ae6..e8ae05851 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -11,7 +11,6 @@ #define _CONFIG_H #include "config.h" #endif -#include <openssl/md5.h> #include <inttypes.h> #include "globals.h" @@ -36,6 +35,8 @@ #include "glusterd-store.h" #include "glusterd-volgen.h" #include "glusterd-pmap.h" +#include "glusterfs-acl.h" +#include "glusterd-locks.h" #include "xdr-generic.h" #include <sys/resource.h> @@ -50,6 +51,10 @@ #include <fnmatch.h> #include <sys/statvfs.h> #include <ifaddrs.h> +#ifdef HAVE_BD_XLATOR +#include <lvm2app.h> +#endif + #ifdef GF_LINUX_HOST_OS #include <mntent.h> @@ -86,16 +91,6 @@ gd_peer_uuid_str (glusterd_peerinfo_t *peerinfo) return peerinfo->uuid_str; } -static void -md5_wrapper(const unsigned char *data, size_t len, char *md5) -{ - unsigned short i = 0; - unsigned short lim = MD5_DIGEST_LENGTH*2+1; - unsigned char scratch[MD5_DIGEST_LENGTH] = {0,}; - MD5(data, len, scratch); - for (; i < MD5_DIGEST_LENGTH; i++) - snprintf(md5 + i * 2, lim-i*2, "%02x", scratch[i]); -} int32_t glusterd_get_lock_owner (uuid_t *uuid) @@ -134,189 +129,6 @@ glusterd_is_fuse_available () return _gf_false; } -gf_boolean_t -glusterd_is_loopback_localhost (const struct sockaddr *sa, char *hostname) -{ - GF_ASSERT (sa); - - gf_boolean_t is_local = _gf_false; - const struct in_addr *addr4 = NULL; - const struct in6_addr *addr6 = NULL; - uint8_t *ap = NULL; - struct in6_addr loopbackaddr6 = IN6ADDR_LOOPBACK_INIT; - - switch (sa->sa_family) { - case AF_INET: - addr4 = &(((struct sockaddr_in *)sa)->sin_addr); - ap = (uint8_t*)&addr4->s_addr; - if (ap[0] == 127) - is_local = _gf_true; - break; - - case AF_INET6: - addr6 = &(((struct sockaddr_in6 *)sa)->sin6_addr); - if (memcmp (addr6, &loopbackaddr6, - sizeof (loopbackaddr6)) == 0) - is_local = _gf_true; - break; - - default: - if (hostname) - gf_log ("glusterd", GF_LOG_ERROR, - "unknown address family %d for %s", - sa->sa_family, hostname); - break; - } - - return is_local; -} - -char * -get_ip_from_addrinfo (struct addrinfo *addr, char **ip) -{ - char buf[64]; - void *in_addr = NULL; - struct sockaddr_in *s4 = NULL; - struct sockaddr_in6 *s6 = NULL; - - switch (addr->ai_family) - { - case AF_INET: - s4 = (struct sockaddr_in *)addr->ai_addr; - in_addr = &s4->sin_addr; - break; - - case AF_INET6: - s6 = (struct sockaddr_in6 *)addr->ai_addr; - in_addr = &s6->sin6_addr; - break; - - default: - gf_log ("glusterd", GF_LOG_ERROR, "Invalid family"); - return NULL; - } - - if (!inet_ntop(addr->ai_family, in_addr, buf, sizeof(buf))) { - gf_log ("glusterd", GF_LOG_ERROR, "String conversion failed"); - return NULL; - } - - *ip = strdup (buf); - return *ip; -} - -gf_boolean_t -glusterd_interface_search (char *ip) -{ - int32_t ret = -1; - gf_boolean_t found = _gf_false; - struct ifaddrs *ifaddr, *ifa; - int family; - char host[NI_MAXHOST]; - xlator_t *this = NULL; - char *pct = NULL; - - this = THIS; - - ret = getifaddrs (&ifaddr); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "getifaddrs() failed: %s\n", - gai_strerror(ret)); - goto out; - } - - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { - if (!ifa->ifa_addr) { - /* - * This seemingly happens if an interface hasn't - * been bound to a particular protocol (seen with - * TUN devices). - */ - continue; - } - family = ifa->ifa_addr->sa_family; - - if (family != AF_INET && family != AF_INET6) - continue; - - ret = getnameinfo (ifa->ifa_addr, - (family == AF_INET) ? sizeof(struct sockaddr_in) : - sizeof(struct sockaddr_in6), - host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "getnameinfo() failed: %s\n", - gai_strerror(ret)); - goto out; - } - - /* - * Sometimes the address comes back as addr%eth0 or - * similar. Since % is an invalid character, we can - * strip it out with confidence that doing so won't - * harm anything. - */ - pct = index(host,'%'); - if (pct) { - *pct = '\0'; - } - - if (strncmp (ip, host, NI_MAXHOST) == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "%s is local address at interface %s", - ip, ifa->ifa_name); - found = _gf_true; - goto out; - } - } -out: - if(ifaddr) - freeifaddrs (ifaddr); - return found; -} - - -gf_boolean_t -glusterd_is_local_addr (char *hostname) -{ - int32_t ret = -1; - struct addrinfo *result = NULL; - struct addrinfo *res = NULL; - gf_boolean_t found = _gf_false; - char *ip = NULL; - xlator_t *this = NULL; - - this = THIS; - ret = getaddrinfo (hostname, NULL, NULL, &result); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "error in getaddrinfo: %s\n", - gai_strerror(ret)); - goto out; - } - - for (res = result; res != NULL; res = res->ai_next) { - gf_log (this->name, GF_LOG_DEBUG, "%s ", - get_ip_from_addrinfo (res, &ip)); - - found = glusterd_is_loopback_localhost (res->ai_addr, hostname) - || glusterd_interface_search (ip); - if (found) - goto out; - } - -out: - if (result) - freeaddrinfo (result); - - if (!found) - gf_log (this->name, GF_LOG_DEBUG, "%s is not local", hostname); - - return found; -} - int32_t glusterd_lock (uuid_t uuid) { @@ -624,8 +436,11 @@ glusterd_volinfo_new (glusterd_volinfo_t **volinfo) if (!new_volinfo) goto out; + LOCK_INIT (&new_volinfo->lock); INIT_LIST_HEAD (&new_volinfo->vol_list); + INIT_LIST_HEAD (&new_volinfo->snapvol_list); INIT_LIST_HEAD (&new_volinfo->bricks); + INIT_LIST_HEAD (&new_volinfo->snap_volumes); new_volinfo->dict = dict_new (); if (!new_volinfo->dict) { @@ -641,6 +456,10 @@ glusterd_volinfo_new (glusterd_volinfo_t **volinfo) goto out; } + snprintf (new_volinfo->parent_volname, GLUSTERD_MAX_VOLUME_NAME, "N/A"); + + new_volinfo->snap_max_hard_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT; + new_volinfo->xl = THIS; *volinfo = new_volinfo; @@ -652,6 +471,219 @@ out: return ret; } +/* This function will create a new volinfo and then + * dup the entries from volinfo to the new_volinfo. + * + * @param volinfo volinfo which will be duplicated + * @param dup_volinfo new volinfo which will be created + * @param set_userauth if this true then auth info is also set + * + * @return 0 on success else -1 + */ +int32_t +glusterd_volinfo_dup (glusterd_volinfo_t *volinfo, + glusterd_volinfo_t **dup_volinfo, + gf_boolean_t set_userauth) +{ + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_volinfo_t *new_volinfo = NULL; + + this = THIS; + GF_ASSERT (this); + GF_VALIDATE_OR_GOTO (this->name, volinfo, out); + GF_VALIDATE_OR_GOTO (this->name, dup_volinfo, out); + + ret = glusterd_volinfo_new (&new_volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "not able to create the " + "duplicate volinfo for the volume %s", + volinfo->volname); + goto out; + } + + new_volinfo->type = volinfo->type; + new_volinfo->replica_count = volinfo->replica_count; + new_volinfo->stripe_count = volinfo->stripe_count; + new_volinfo->dist_leaf_count = volinfo->dist_leaf_count; + new_volinfo->sub_count = volinfo->sub_count; + new_volinfo->transport_type = volinfo->transport_type; + new_volinfo->nfs_transport_type = volinfo->nfs_transport_type; + new_volinfo->brick_count = volinfo->brick_count; + + dict_copy (volinfo->dict, new_volinfo->dict); + gd_update_volume_op_versions (new_volinfo); + + if (set_userauth) { + glusterd_auth_set_username (new_volinfo, + volinfo->auth.username); + glusterd_auth_set_password (new_volinfo, + volinfo->auth.password); + } + + *dup_volinfo = new_volinfo; + ret = 0; +out: + if (ret && (NULL != new_volinfo)) { + (void) glusterd_volinfo_delete (new_volinfo); + } + return ret; +} + +/* This function will duplicate brickinfo + * + * @param brickinfo Source brickinfo + * @param dup_brickinfo Destination brickinfo + * + * @return 0 on success else -1 + */ +int32_t +glusterd_brickinfo_dup (glusterd_brickinfo_t *brickinfo, + glusterd_brickinfo_t *dup_brickinfo) +{ + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + + GF_VALIDATE_OR_GOTO (this->name, brickinfo, out); + GF_VALIDATE_OR_GOTO (this->name, dup_brickinfo, out); + + strcpy (dup_brickinfo->hostname, brickinfo->hostname); + strcpy (dup_brickinfo->path, brickinfo->path); + strcpy (dup_brickinfo->device_path, brickinfo->device_path); + ret = gf_canonicalize_path (dup_brickinfo->path); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to canonicalize " + "brick path"); + goto out; + } + uuid_copy (dup_brickinfo->uuid, brickinfo->uuid); + + dup_brickinfo->port = brickinfo->port; + dup_brickinfo->rdma_port = brickinfo->rdma_port; + if (NULL != brickinfo->logfile) { + dup_brickinfo->logfile = gf_strdup (brickinfo->logfile); + if (NULL == dup_brickinfo->logfile) { + ret = -1; + goto out; + } + } + dup_brickinfo->status = brickinfo->status; + dup_brickinfo->snap_status = brickinfo->snap_status; +out: + return ret; +} + +/* This function will copy snap volinfo to the new + * passed volinfo and regenerate backend store files + * for the restored snap. + * + * @param new_volinfo new volinfo + * @param snap_volinfo volinfo of snap volume + * + * @return 0 on success and -1 on failure + * + * TODO: Duplicate all members of volinfo, e.g. geo-rep sync slaves + */ +int32_t +glusterd_snap_volinfo_restore (dict_t *rsp_dict, + glusterd_volinfo_t *new_volinfo, + glusterd_volinfo_t *snap_volinfo) +{ + int32_t brick_count = -1; + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_brickinfo_t *brickinfo = NULL; + glusterd_brickinfo_t *new_brickinfo = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (rsp_dict); + + GF_VALIDATE_OR_GOTO (this->name, new_volinfo, out); + GF_VALIDATE_OR_GOTO (this->name, snap_volinfo, out); + + brick_count = 0; + list_for_each_entry (brickinfo, &snap_volinfo->bricks, brick_list) { + ret = glusterd_brickinfo_new (&new_brickinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create " + "new brickinfo"); + goto out; + } + + /* Duplicate brickinfo */ + ret = glusterd_brickinfo_dup (brickinfo, new_brickinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to dup " + "brickinfo"); + goto out; + } + + /* If the brick is not of this peer, or snapshot is missed * + * for the brick do not replace the xattr for it */ + if ((!uuid_compare (brickinfo->uuid, MY_UUID)) && + (brickinfo->snap_status != -1)) { + /* We need to replace the volume id of all the bricks + * to the volume id of the origin volume. new_volinfo + * has the origin volume's volume id*/ + ret = sys_lsetxattr (new_brickinfo->path, + GF_XATTR_VOL_ID_KEY, + new_volinfo->volume_id, + sizeof (new_volinfo->volume_id), + XATTR_REPLACE); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "set extended attribute %s on %s. " + "Reason: %s, snap: %s", + GF_XATTR_VOL_ID_KEY, + new_brickinfo->path, strerror (errno), + new_volinfo->volname); + goto out; + } + } + + /* If a snapshot is pending for this brick then + * restore should also be pending + */ + if (brickinfo->snap_status == -1) { + /* Adding missed delete to the dict */ + ret = glusterd_add_missed_snaps_to_dict + (rsp_dict, + snap_volinfo->volname, + brickinfo, + brick_count + 1, + GF_SNAP_OPTION_TYPE_RESTORE); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to add missed snapshot info " + "for %s:%s in the rsp_dict", + brickinfo->hostname, + brickinfo->path); + goto out; + } + } + + list_add_tail (&new_brickinfo->brick_list, + &new_volinfo->bricks); + /* ownership of new_brickinfo is passed to new_volinfo */ + new_brickinfo = NULL; + brick_count++; + } + + /* Regenerate all volfiles */ + ret = glusterd_create_volfiles_and_notify_services (new_volinfo); + +out: + if (ret && (NULL != new_brickinfo)) { + (void) glusterd_brickinfo_delete (new_brickinfo); + } + + return ret; +} + void glusterd_auth_cleanup (glusterd_volinfo_t *volinfo) { @@ -744,6 +776,7 @@ glusterd_volinfo_delete (glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo); list_del_init (&volinfo->vol_list); + list_del_init (&volinfo->snapvol_list); ret = glusterd_volume_brickinfos_delete (volinfo); if (ret) @@ -764,7 +797,6 @@ out: return ret; } - int32_t glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo) { @@ -816,6 +848,7 @@ glusterd_brickinfo_new_from_brick (char *brick, char *path = NULL; char *tmp_host = NULL; char *tmp_path = NULL; + char *vg = NULL; GF_ASSERT (brick); GF_ASSERT (brickinfo); @@ -834,6 +867,17 @@ glusterd_brickinfo_new_from_brick (char *brick, if (ret) goto out; +#ifdef HAVE_BD_XLATOR + vg = strchr (path, '?'); + /* ? is used as a delimiter for vg */ + if (vg) { + strncpy (new_brickinfo->vg, vg + 1, PATH_MAX - 1); + *vg = '\0'; + } + new_brickinfo->caps = CAPS_BD; +#else + vg = NULL; /* Avoid compiler warnings when BD not enabled */ +#endif ret = gf_canonicalize_path (path); if (ret) goto out; @@ -937,6 +981,62 @@ out: return available; } +#ifdef HAVE_BD_XLATOR +/* + * Sets the tag of the format "trusted.glusterfs.volume-id:<uuid>" in + * the brick VG. It is used to avoid using same VG for another brick. + * @volume-id - gfid, @brick - brick info, @msg - Error message returned + * to the caller + */ +int +glusterd_bd_set_vg_tag (unsigned char *volume_id, glusterd_brickinfo_t *brick, + char *msg, int msg_size) +{ + lvm_t handle = NULL; + vg_t vg = NULL; + char *uuid = NULL; + int ret = -1; + + gf_asprintf (&uuid, "%s:%s", GF_XATTR_VOL_ID_KEY, + uuid_utoa (volume_id)); + if (!uuid) { + snprintf (msg, sizeof(*msg), "Could not allocate memory " + "for tag"); + return -1; + } + + handle = lvm_init (NULL); + if (!handle) { + snprintf (msg, sizeof(*msg), "lvm_init failed"); + goto out; + } + + vg = lvm_vg_open (handle, brick->vg, "w", 0); + if (!vg) { + snprintf (msg, sizeof(*msg), "Could not open VG %s", + brick->vg); + goto out; + } + + if (lvm_vg_add_tag (vg, uuid) < 0) { + snprintf (msg, sizeof(*msg), "Could not set tag %s for " + "VG %s", uuid, brick->vg); + goto out; + } + lvm_vg_write (vg); + ret = 0; +out: + GF_FREE (uuid); + + if (vg) + lvm_vg_close (vg); + if (handle) + lvm_quit (handle); + + return ret; +} +#endif + int glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo, uuid_t volume_id, char **op_errstr, @@ -1019,9 +1119,17 @@ glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo, } } +#ifdef HAVE_BD_XLATOR + if (brickinfo->vg[0]) { + ret = glusterd_bd_set_vg_tag (volume_id, brickinfo, msg, + sizeof(msg)); + if (ret) + goto out; + } +#endif ret = glusterd_check_and_set_brick_xattr (brickinfo->hostname, brickinfo->path, volume_id, - op_errstr); + op_errstr, is_force); if (ret) goto out; @@ -1153,6 +1261,68 @@ glusterd_friend_cleanup (glusterd_peerinfo_t *peerinfo) return 0; } +int +glusterd_volinfo_find_by_volume_id (uuid_t volume_id, glusterd_volinfo_t **volinfo) +{ + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_volinfo_t *voliter = NULL; + glusterd_conf_t *priv = NULL; + + if (!volume_id) + return -1; + + this = THIS; + priv = this->private; + + list_for_each_entry (voliter, &priv->volumes, vol_list) { + if (uuid_compare (volume_id, voliter->volume_id)) + continue; + *volinfo = voliter; + ret = 0; + gf_log (this->name, GF_LOG_DEBUG, "Volume %s found", + voliter->volname); + break; + } + return ret; +} + +int +glusterd_snap_volinfo_find_by_volume_id (uuid_t volume_id, + glusterd_volinfo_t **volinfo) +{ + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_volinfo_t *voliter = NULL; + glusterd_snap_t *snap = NULL; + glusterd_conf_t *priv = NULL; + + this = THIS; + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (volinfo); + + if (uuid_is_null(volume_id)) { + gf_log (this->name, GF_LOG_WARNING, "Volume UUID is NULL"); + goto out; + } + + list_for_each_entry (snap, &priv->snapshots, snap_list) { + list_for_each_entry (voliter, &snap->volumes, vol_list) { + if (uuid_compare (volume_id, voliter->volume_id)) + continue; + *volinfo = voliter; + ret = 0; + goto out; + } + } + + gf_log (this->name, GF_LOG_WARNING, "Snap volume not found"); +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + int32_t glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo) { @@ -1162,7 +1332,6 @@ glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo) glusterd_conf_t *priv = NULL; GF_ASSERT (volname); - this = THIS; GF_ASSERT (this); @@ -1171,7 +1340,8 @@ glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo) list_for_each_entry (tmp_volinfo, &priv->volumes, vol_list) { if (!strcmp (tmp_volinfo->volname, volname)) { - gf_log (this->name, GF_LOG_DEBUG, "Volume %s found", volname); + gf_log (this->name, GF_LOG_DEBUG, "Volume %s found", + volname); ret = 0; *volinfo = tmp_volinfo; break; @@ -1183,6 +1353,68 @@ glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo) } int32_t +glusterd_snap_volinfo_find (char *snap_volname, glusterd_snap_t *snap, + glusterd_volinfo_t **volinfo) +{ + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_volinfo_t *snap_vol = NULL; + glusterd_conf_t *priv = NULL; + + this = THIS; + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (snap); + GF_ASSERT (snap_volname); + + list_for_each_entry (snap_vol, &snap->volumes, vol_list) { + if (!strcmp (snap_vol->volname, snap_volname)) { + ret = 0; + *volinfo = snap_vol; + goto out; + } + } + + gf_log (this->name, GF_LOG_WARNING, "Snap volume %s not found", + snap_volname); +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int32_t +glusterd_snap_volinfo_find_from_parent_volname (char *origin_volname, + glusterd_snap_t *snap, + glusterd_volinfo_t **volinfo) +{ + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_volinfo_t *snap_vol = NULL; + glusterd_conf_t *priv = NULL; + + this = THIS; + priv = this->private; + GF_ASSERT (priv); + GF_ASSERT (snap); + GF_ASSERT (origin_volname); + + list_for_each_entry (snap_vol, &snap->volumes, vol_list) { + if (!strcmp (snap_vol->parent_volname, origin_volname)) { + ret = 0; + *volinfo = snap_vol; + goto out; + } + } + + gf_log (this->name, GF_LOG_DEBUG, "Snap volume not found(snap: %s, " + "origin-volume: %s", snap->snapname, origin_volname); + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int32_t glusterd_service_stop (const char *service, char *pidfile, int sig, gf_boolean_t force_kill) { @@ -1262,22 +1494,20 @@ glusterd_set_brick_socket_filepath (glusterd_volinfo_t *volinfo, */ int32_t glusterd_brick_connect (glusterd_volinfo_t *volinfo, - glusterd_brickinfo_t *brickinfo) + glusterd_brickinfo_t *brickinfo, char *socketpath) { int ret = 0; - char socketpath[PATH_MAX] = {0}; + char volume_id_str[64]; + char *brickid = NULL; dict_t *options = NULL; struct rpc_clnt *rpc = NULL; glusterd_conf_t *priv = THIS->private; GF_ASSERT (volinfo); GF_ASSERT (brickinfo); + GF_ASSERT (socketpath); if (brickinfo->rpc == NULL) { - glusterd_set_brick_socket_filepath (volinfo, brickinfo, - socketpath, - sizeof (socketpath)); - /* Setting frame-timeout to 10mins (600seconds). * Unix domain sockets ensures that the connection is reliable. * The default timeout of 30mins used for unreliable network @@ -1287,10 +1517,17 @@ glusterd_brick_connect (glusterd_volinfo_t *volinfo, 600); if (ret) goto out; + + uuid_utoa_r (volinfo->volume_id, volume_id_str); + ret = gf_asprintf (&brickid, "%s:%s:%s", volume_id_str, + brickinfo->hostname, brickinfo->path); + if (ret < 0) + goto out; + synclock_unlock (&priv->big_lock); ret = glusterd_rpc_create (&rpc, options, glusterd_brick_rpc_notify, - brickinfo); + brickid); synclock_lock (&priv->big_lock); if (ret) goto out; @@ -1358,9 +1595,23 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, priv = this->private; GF_ASSERT (priv); + if (brickinfo->snap_status == -1) { + gf_log (this->name, GF_LOG_INFO, + "Snapshot is pending on %s:%s. " + "Hence not starting the brick", + brickinfo->hostname, + brickinfo->path); + ret = 0; + goto out; + } + ret = _mk_rundir_p (volinfo); if (ret) goto out; + + glusterd_set_brick_socket_filepath (volinfo, brickinfo, socketpath, + sizeof (socketpath)); + GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv); if (glusterd_is_service_running (pidfile, NULL)) goto connect; @@ -1371,6 +1622,11 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, if (!port) port = pmap_registry_alloc (THIS); + /* Build the exp_path, before starting the glusterfsd even in + valgrind mode. Otherwise all the glusterfsd processes start + writing the valgrind log to the same file. + */ + GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); runinit (&runner); if (priv->valgrind) { @@ -1393,9 +1649,15 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, runner_argprintf (&runner, "--log-file=%s", valgrind_logfile); } - GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); - snprintf (volfile, PATH_MAX, "%s.%s.%s", volinfo->volname, - brickinfo->hostname, exp_path); + if (volinfo->is_snap_volume) { + snprintf (volfile, PATH_MAX,"/%s/%s/%s.%s.%s", + GLUSTERD_VOL_SNAP_DIR_PREFIX, + volinfo->snapshot->snapname, volinfo->volname, + brickinfo->hostname, exp_path); + } else { + snprintf (volfile, PATH_MAX, "%s.%s.%s", volinfo->volname, + brickinfo->hostname, exp_path); + } if (volinfo->logdir) { snprintf (logfile, PATH_MAX, "%s/%s.log", @@ -1407,9 +1669,6 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, if (!brickinfo->logfile) brickinfo->logfile = gf_strdup (logfile); - glusterd_set_brick_socket_filepath (volinfo, brickinfo, socketpath, - sizeof (socketpath)); - (void) snprintf (glusterd_uuid, 1024, "*-posix.glusterd-uuid=%s", uuid_utoa (MY_UUID)); runner_add_args (&runner, SBIN_DIR"/glusterfsd", @@ -1457,9 +1716,13 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo, brickinfo->rdma_port = rdma_port; connect: - ret = glusterd_brick_connect (volinfo, brickinfo); - if (ret) + ret = glusterd_brick_connect (volinfo, brickinfo, socketpath); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to connect to brick %s:%s on %s", + brickinfo->hostname, brickinfo->path, socketpath); goto out; + } out: return ret; } @@ -1498,18 +1761,25 @@ glusterd_brick_unlink_socket_file (glusterd_volinfo_t *volinfo, int32_t glusterd_brick_disconnect (glusterd_brickinfo_t *brickinfo) { - GF_ASSERT (brickinfo); + rpc_clnt_t *rpc = NULL; glusterd_conf_t *priv = THIS->private; - if (brickinfo->rpc) { - /* cleanup the saved-frames before last unref */ + GF_ASSERT (brickinfo); + + if (!brickinfo) { + gf_log_callingfn ("glusterd", GF_LOG_WARNING, "!brickinfo"); + return -1; + } + + rpc = brickinfo->rpc; + brickinfo->rpc = NULL; + + if (rpc) { synclock_unlock (&priv->big_lock); - rpc_clnt_connection_cleanup (&brickinfo->rpc->conn); + rpc_clnt_unref (rpc); synclock_lock (&priv->big_lock); - - rpc_clnt_unref (brickinfo->rpc); - brickinfo->rpc = NULL; } + return 0; } @@ -1518,10 +1788,10 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo, gf_boolean_t del_brick) { - xlator_t *this = NULL; - glusterd_conf_t *priv = NULL; - char pidfile[PATH_MAX] = {0,}; - int ret = 0; + xlator_t *this = NULL; + glusterd_conf_t *priv = NULL; + char pidfile[PATH_MAX] = {0,}; + int ret = 0; GF_ASSERT (volinfo); GF_ASSERT (brickinfo); @@ -1720,13 +1990,14 @@ glusterd_volume_compute_cksum (glusterd_volinfo_t *volinfo) snprintf (filepath, sizeof (filepath), "%s/%s", path, GLUSTERD_VOLUME_INFO_FILE); - snprintf (sort_filepath, sizeof (sort_filepath), "/tmp/%s.XXXXXX", - volinfo->volname); + snprintf (sort_filepath, sizeof (sort_filepath), + "/tmp/%s.XXXXXX", volinfo->volname); sort_fd = mkstemp (sort_filepath); if (sort_fd < 0) { gf_log (this->name, GF_LOG_ERROR, "Could not generate temp " - "file, reason: %s for volume: %s", strerror (errno), + "file, reason: %s for %s: %s", strerror (errno), + (volinfo->is_snap_volume)?"snap":"volume", volinfo->volname); goto out; } else { @@ -1767,7 +2038,6 @@ glusterd_volume_compute_cksum (glusterd_volinfo_t *volinfo) goto out; volinfo->cksum = cksum; - out: if (fd > 0) close (fd); @@ -1846,7 +2116,10 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, glusterd_dict_ctx_t ctx = {0}; char *rebalance_id_str = NULL; char *rb_id_str = NULL; + xlator_t *this = NULL; + this = THIS; + GF_ASSERT (this); GF_ASSERT (dict); GF_ASSERT (volinfo); @@ -1861,6 +2134,15 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + snprintf (key, sizeof (key), "volume%d.is_volume_restored", count); + ret = dict_set_int32 (dict, key, volinfo->is_volume_restored); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to set " + "is_volume_restored option for %s volume", + volinfo->volname); + goto out; + } + memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "volume%d.brick_count", count); ret = dict_set_int32 (dict, key, volinfo->brick_count); @@ -1915,6 +2197,20 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + snprintf (key, sizeof (key), "volume%d.is_snap_volume", count); + ret = dict_set_uint32 (dict, key, volinfo->is_snap_volume); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Unable to set %s", key); + goto out; + } + + snprintf (key, sizeof (key), "volume%d.snap-max-hard-limit", count); + ret = dict_set_uint64 (dict, key, volinfo->snap_max_hard_limit); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Unable to set %s", key); + goto out; + } + volume_id_str = gf_strdup (uuid_utoa (volinfo->volume_id)); if (!volume_id_str) { ret = -1; @@ -2061,9 +2357,42 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + snprintf (key, sizeof (key), "volume%d.brick%d.snap_status", + count, i); + ret = dict_set_int32 (dict, key, brickinfo->snap_status); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set snap_status for %s:%s", + brickinfo->hostname, + brickinfo->path); + goto out; + } + + snprintf (key, sizeof (key), "volume%d.brick%d.device_path", + count, i); + ret = dict_set_str (dict, key, brickinfo->device_path); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set snap_device for %s:%s", + brickinfo->hostname, + brickinfo->path); + goto out; + } + i++; } + /* Add volume op-versions to dict. This prevents volume inconsistencies + * in the cluster + */ + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.op-version", count); + ret = dict_set_int32 (dict, key, volinfo->op_version); + if (ret) + goto out; + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.client-op-version", count); + ret = dict_set_int32 (dict, key, volinfo->client_op_version); out: GF_FREE (volume_id_str); @@ -2450,7 +2779,7 @@ glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo, gf_boolean_t meets_quorum) { glusterd_brickinfo_t *brickinfo = NULL; - glusterd_conf_t *conf = NULL; + glusterd_conf_t *conf = NULL; conf = this->private; if (volinfo->status != GLUSTERD_STATUS_STARTED) @@ -2568,6 +2897,8 @@ glusterd_import_new_brick (dict_t *vols, int32_t vol_count, { char key[512] = {0,}; int ret = -1; + int32_t snap_status = 0; + char *snap_device = NULL; char *hostname = NULL; char *path = NULL; glusterd_brickinfo_t *new_brickinfo = NULL; @@ -2595,12 +2926,31 @@ glusterd_import_new_brick (dict_t *vols, int32_t vol_count, goto out; } + snprintf (key, sizeof (key), "volume%d.brick%d.snap_status", + vol_count, brick_count); + ret = dict_get_int32 (vols, key, &snap_status); + if (ret) { + snprintf (msg, sizeof (msg), "%s missing in payload", key); + goto out; + } + + snprintf (key, sizeof (key), "volume%d.brick%d.device_path", + vol_count, brick_count); + ret = dict_get_str (vols, key, &snap_device); + if (ret) { + snprintf (msg, sizeof (msg), "%s missing in payload", key); + goto out; + } + ret = glusterd_brickinfo_new (&new_brickinfo); if (ret) goto out; strcpy (new_brickinfo->path, path); strcpy (new_brickinfo->hostname, hostname); + strcpy (new_brickinfo->device_path, snap_device); + new_brickinfo->snap_status = snap_status; + //peerinfo might not be added yet (void) glusterd_resolve_brick (new_brickinfo); ret = 0; @@ -2654,6 +3004,9 @@ glusterd_import_volinfo (dict_t *vols, int count, int rb_status = 0; char *rebalance_id_str = NULL; char *rb_id_str = NULL; + int op_version = 0; + int client_op_version = 0; + uint32_t is_snap_volume = 0; GF_ASSERT (vols); GF_ASSERT (volinfo); @@ -2665,6 +3018,22 @@ glusterd_import_volinfo (dict_t *vols, int count, goto out; } + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.is_snap_volume", count); + ret = dict_get_uint32 (vols, key, &is_snap_volume); + if (ret) { + snprintf (msg, sizeof (msg), "%s missing in payload for %s", + key, volname); + goto out; + } + + if (is_snap_volume == _gf_true) { + gf_log (THIS->name, GF_LOG_DEBUG, + "Not syncing snap volume %s", volname); + ret = 0; + goto out; + } + ret = glusterd_volinfo_new (&new_volinfo); if (ret) goto out; @@ -2790,6 +3159,25 @@ glusterd_import_volinfo (dict_t *vols, int count, goto out; } + new_volinfo->is_snap_volume = is_snap_volume; + + snprintf (key, sizeof (key), "volume%d.is_volume_restored", count); + ret = dict_get_uint32 (vols, key, &new_volinfo->is_volume_restored); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to get " + "is_volume_restored option for %s", + volname); + goto out; + } + + snprintf (key, sizeof (key), "volume%d.snap-max-hard-limit", count); + ret = dict_get_uint64 (vols, key, &new_volinfo->snap_max_hard_limit); + if (ret) { + snprintf (msg, sizeof (msg), "%s missing in payload for %s", + key, volname); + goto out; + } + memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "volume%d.rebalance", count); ret = dict_get_uint32 (vols, key, &new_volinfo->rebal.defrag_cmd); @@ -2880,6 +3268,40 @@ glusterd_import_volinfo (dict_t *vols, int count, ret = glusterd_import_friend_volume_opts (vols, count, new_volinfo); if (ret) goto out; + + /* Import the volume's op-versions if available else set it to 1. + * Not having op-versions implies this informtation was obtained from a + * op-version 1 friend (gluster-3.3), ergo the cluster is at op-version + * 1 and all volumes are at op-versions 1. + * + * Either both the volume op-versions should be absent or both should be + * present. Only one being present is a failure + */ + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.op-version", count); + ret = dict_get_int32 (vols, key, &op_version); + if (ret) + ret = 0; + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.client-op-version", count); + ret = dict_get_int32 (vols, key, &client_op_version); + if (ret) + ret = 0; + + if (op_version && client_op_version) { + new_volinfo->op_version = op_version; + new_volinfo->client_op_version = client_op_version; + } else if (((op_version == 0) && (client_op_version != 0)) || + ((op_version != 0) && (client_op_version == 0))) { + ret = -1; + gf_log ("glusterd", GF_LOG_ERROR, + "Only one volume op-version found"); + goto out; + } else { + new_volinfo->op_version = 1; + new_volinfo->client_op_version = 1; + } + ret = glusterd_import_bricks (vols, count, new_volinfo); if (ret) goto out; @@ -3011,7 +3433,7 @@ glusterd_delete_stale_volume (glusterd_volinfo_t *stale_volinfo, (void) glusterd_delete_all_bricks (stale_volinfo); if (stale_volinfo->shandle) { unlink (stale_volinfo->shandle->path); - (void) glusterd_store_handle_destroy (stale_volinfo->shandle); + (void) gf_store_handle_destroy (stale_volinfo->shandle); stale_volinfo->shandle = NULL; } (void) glusterd_volinfo_delete (stale_volinfo); @@ -3038,6 +3460,12 @@ glusterd_import_friend_volume (dict_t *vols, size_t count) if (ret) goto out; + if (!new_volinfo) { + gf_log (this->name, GF_LOG_DEBUG, + "Not importing snap volume"); + goto out; + } + ret = glusterd_volinfo_find (new_volinfo->volname, &old_volinfo); if (0 == ret) { (void) glusterd_delete_stale_volume (old_volinfo, new_volinfo); @@ -3052,8 +3480,6 @@ glusterd_import_friend_volume (dict_t *vols, size_t count) if (ret) goto out; - gd_update_volume_op_versions (new_volinfo); - list_add_tail (&new_volinfo->vol_list, &priv->volumes); out: gf_log ("", GF_LOG_DEBUG, "Returning with ret: %d", ret); @@ -3475,17 +3901,12 @@ int32_t glusterd_nodesvc_disconnect (char *server) { struct rpc_clnt *rpc = NULL; - glusterd_conf_t *priv = THIS->private; rpc = glusterd_nodesvc_get_rpc (server); + (void)glusterd_nodesvc_set_rpc (server, NULL); - if (rpc) { - synclock_unlock (&priv->big_lock); - rpc_clnt_connection_cleanup (&rpc->conn); - synclock_lock (&priv->big_lock); + if (rpc) rpc_clnt_unref (rpc); - (void)glusterd_nodesvc_set_rpc (server, NULL); - } return 0; } @@ -3677,6 +4098,10 @@ glusterd_nfs_pmap_deregister () else gf_log ("", GF_LOG_ERROR, "De-registration of NLM v1 failed"); + if (pmap_unset (ACL_PROGRAM, ACLV3_VERSION)) + gf_log ("", GF_LOG_INFO, "De-registered ACL v3 successfully"); + else + gf_log ("", GF_LOG_ERROR, "De-registration of ACL v3 failed"); } int @@ -3857,6 +4282,11 @@ glusterd_reconfigure_nfs () int ret = -1; gf_boolean_t identical = _gf_false; + /* + * Check both OLD and NEW volfiles, if they are SAME by size + * and cksum i.e. "character-by-character". If YES, then + * NOTHING has been changed, just return. + */ ret = glusterd_check_nfs_volfile_identical (&identical); if (ret) goto out; @@ -3866,6 +4296,31 @@ glusterd_reconfigure_nfs () goto out; } + /* + * They are not identical. Find out if the topology is changed + * OR just the volume options. If just the options which got + * changed, then inform the xlator to reconfigure the options. + */ + identical = _gf_false; /* RESET the FLAG */ + ret = glusterd_check_nfs_topology_identical (&identical); + if (ret) + goto out; + + /* Topology is not changed, but just the options. But write the + * options to NFS volfile, so that NFS will be reconfigured. + */ + if (identical) { + ret = glusterd_create_nfs_volfile(); + if (ret == 0) {/* Only if above PASSES */ + ret = glusterd_fetchspec_notify (THIS); + } + goto out; + } + + /* + * NFS volfile's topology has been changed. NFS server needs + * to be RESTARTED to ACT on the changed volfile. + */ ret = glusterd_check_generate_start_nfs (); out: @@ -4097,20 +4552,42 @@ out: int glusterd_restart_bricks (glusterd_conf_t *conf) { + int ret = 0; glusterd_volinfo_t *volinfo = NULL; glusterd_brickinfo_t *brickinfo = NULL; + glusterd_snap_t *snap = NULL; gf_boolean_t start_nodesvcs = _gf_false; - int ret = 0; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); list_for_each_entry (volinfo, &conf->volumes, vol_list) { if (volinfo->status != GLUSTERD_STATUS_STARTED) continue; start_nodesvcs = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, "starting the volume %s", + volinfo->volname); list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { glusterd_brick_start (volinfo, brickinfo, _gf_false); } } + list_for_each_entry (snap, &conf->snapshots, snap_list) { + list_for_each_entry (volinfo, &snap->volumes, vol_list) { + if (volinfo->status != GLUSTERD_STATUS_STARTED) + continue; + start_nodesvcs = _gf_true; + gf_log (this->name, GF_LOG_DEBUG, "starting the snap " + "volume %s", volinfo->volname); + list_for_each_entry (brickinfo, &volinfo->bricks, + brick_list) { + glusterd_brick_start (volinfo, brickinfo, + _gf_false); + } + } + } + if (start_nodesvcs) glusterd_nodesvcs_handle_graph_change (NULL); @@ -4120,10 +4597,13 @@ glusterd_restart_bricks (glusterd_conf_t *conf) int _local_gsyncd_start (dict_t *this, char *key, data_t *value, void *data) { + char *path_list = NULL; char *slave = NULL; int uuid_len = 0; + int ret = 0; char uuid_str[64] = {0}; glusterd_volinfo_t *volinfo = NULL; + char *conf_path = NULL; volinfo = data; GF_ASSERT (volinfo); @@ -4135,9 +4615,24 @@ _local_gsyncd_start (dict_t *this, char *key, data_t *value, void *data) uuid_len = (slave - value->data - 1); strncpy (uuid_str, (char*)value->data, uuid_len); - glusterd_start_gsync (volinfo, slave, uuid_str, NULL); - return 0; + ret = glusterd_get_local_brickpaths (volinfo, &path_list); + + ret = dict_get_str (this, "conf_path", &conf_path); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to fetch conf file path."); + goto out; + } + + glusterd_start_gsync (volinfo, slave, path_list, conf_path, + uuid_str, NULL); + + GF_FREE (path_list); + path_list = NULL; + +out: + return ret; } int @@ -4186,7 +4681,7 @@ glusterd_get_brickinfo (xlator_t *this, const char *brickname, int port, list_for_each_entry (volinfo, &priv->volumes, vol_list) { list_for_each_entry (tmpbrkinfo, &volinfo->bricks, brick_list) { - if (localhost && !glusterd_is_local_addr (tmpbrkinfo->hostname)) + if (localhost && !gf_is_local_addr (tmpbrkinfo->hostname)) continue; if (!strcmp(tmpbrkinfo->path, brickname) && (tmpbrkinfo->port == port)) { @@ -4258,7 +4753,7 @@ out: } #ifdef GF_LINUX_HOST_OS -static int +int glusterd_get_brick_root (char *path, char **mount_point) { char *ptr = NULL; @@ -4426,6 +4921,31 @@ glusterd_add_inode_size_to_dict (dict_t *dict, int count) return ret; } +struct mntent * +glusterd_get_mnt_entry_info (char *mnt_pt, FILE *mtab) +{ + struct mntent *entry = NULL; + + mtab = setmntent (_PATH_MOUNTED, "r"); + if (!mtab) + goto out; + + entry = getmntent (mtab); + + while (1) { + if (!entry) + goto out; + + if (!strcmp (entry->mnt_dir, mnt_pt) && + strcmp (entry->mnt_type, "rootfs")) + break; + entry = getmntent (mtab); + } + +out: + return entry; +} + static int glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo, dict_t *dict, int count) @@ -4437,8 +4957,8 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo, char *fs_name = NULL; char *mnt_options = NULL; char *device = NULL; - FILE *mtab = NULL; struct mntent *entry = NULL; + FILE *mtab = NULL; snprintf (base_key, sizeof (base_key), "brick%d", count); @@ -4446,25 +4966,12 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo, if (ret) goto out; - mtab = setmntent (_PATH_MOUNTED, "r"); - if (!mtab) { + entry = glusterd_get_mnt_entry_info (mnt_pt, mtab); + if (!entry) { ret = -1; goto out; } - entry = getmntent (mtab); - - while (1) { - if (!entry) { - ret = -1; - goto out; - } - if (!strcmp (entry->mnt_dir, mnt_pt) && - strcmp (entry->mnt_type, "rootfs")) - break; - entry = getmntent (mtab); - } - /* get device file */ memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "%s.device", base_key); @@ -4497,6 +5004,45 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo, return ret; } + +char* +glusterd_get_brick_mount_details (glusterd_brickinfo_t *brickinfo) +{ + int ret = -1; + char *mnt_pt = NULL; + char *device = NULL; + FILE *mtab = NULL; + struct mntent *entry = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (brickinfo); + + ret = glusterd_get_brick_root (brickinfo->path, &mnt_pt); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get mount point " + "for %s brick", brickinfo->path); + goto out; + } + + entry = glusterd_get_mnt_entry_info (mnt_pt, mtab); + if (NULL == entry) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get mnt entry " + "for %s mount path", mnt_pt); + goto out; + } + + /* get the fs_name/device */ + device = gf_strdup (entry->mnt_fsname); + +out: + if (NULL != mtab) { + endmntent (mtab); + } + + return device; +} #endif int @@ -4853,10 +5399,11 @@ glusterd_hostname_to_uuid (char *hostname, uuid_t uuid) ret = glusterd_friend_find_by_hostname (hostname, &peerinfo); if (ret) { - if (glusterd_is_local_addr (hostname)) { + if (gf_is_local_addr (hostname)) { uuid_copy (uuid, MY_UUID); ret = 0; } else { + ret = 0; goto out; } } else { @@ -5191,11 +5738,12 @@ out: int glusterd_check_and_set_brick_xattr (char *host, char *path, uuid_t uuid, - char **op_errstr) + char **op_errstr, gf_boolean_t is_force) { int ret = -1; char msg[2048] = {0,}; gf_boolean_t in_use = _gf_false; + int flags = 0; /* Check for xattr support in backend fs */ ret = sys_lsetxattr (path, "trusted.glusterfs.test", @@ -5215,13 +5763,17 @@ glusterd_check_and_set_brick_xattr (char *host, char *path, uuid_t uuid, if (ret) goto out; - if (in_use) { + if (in_use && !is_force) { ret = -1; goto out; } + + if (!is_force) + flags = XATTR_CREATE; + ret = sys_lsetxattr (path, GF_XATTR_VOL_ID_KEY, uuid, 16, - XATTR_CREATE); + flags); if (ret) { snprintf (msg, sizeof (msg), "Failed to set extended " "attributes %s, reason: %s", @@ -5576,12 +6128,92 @@ glusterd_delete_all_bricks (glusterd_volinfo_t* volinfo) } int +glusterd_get_local_brickpaths (glusterd_volinfo_t *volinfo, char **pathlist) +{ + char **path_tokens = NULL; + char *tmp_path_list = NULL; + char path[PATH_MAX] = ""; + int32_t count = 0; + int32_t pathlen = 0; + int32_t total_len = 0; + int32_t ret = 0; + int i = 0; + glusterd_brickinfo_t *brickinfo = NULL; + + if ((!volinfo) || (!pathlist)) + goto out; + + path_tokens = GF_CALLOC (sizeof(char*), volinfo->brick_count, + gf_gld_mt_charptr); + if (!path_tokens) { + gf_log ("", GF_LOG_DEBUG, "Could not allocate memory."); + ret = -1; + goto out; + } + + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + if (uuid_compare (brickinfo->uuid, MY_UUID)) + continue; + + pathlen = snprintf (path, sizeof(path), + "--path=%s ", brickinfo->path); + if (pathlen < sizeof(path)) + path[pathlen] = '\0'; + else + path[sizeof(path)-1] = '\0'; + path_tokens[count] = gf_strdup (path); + if (!path_tokens[count]) { + gf_log ("", GF_LOG_DEBUG, + "Could not allocate memory."); + ret = -1; + goto out; + } + count++; + total_len += pathlen; + } + + tmp_path_list = GF_CALLOC (sizeof(char), total_len + 1, + gf_gld_mt_char); + if (!tmp_path_list) { + gf_log ("", GF_LOG_DEBUG, "Could not allocate memory."); + ret = -1; + goto out; + } + + for (i = 0; i < count; i++) + strcat (tmp_path_list, path_tokens[i]); + + if (count) + *pathlist = tmp_path_list; + + ret = count; +out: + for (i = 0; i < count; i++) { + GF_FREE (path_tokens[i]); + path_tokens[i] = NULL; + } + + GF_FREE (path_tokens); + path_tokens = NULL; + + if (ret == 0) { + gf_log ("", GF_LOG_DEBUG, "No Local Bricks Present."); + GF_FREE (tmp_path_list); + tmp_path_list = NULL; + } + + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave, - char *glusterd_uuid_str, char **op_errstr) + char *path_list, char *conf_path, + char *glusterd_uuid_str, + char **op_errstr) { int32_t ret = 0; int32_t status = 0; - char buf[PATH_MAX] = {0,}; char uuid_str [64] = {0}; runner_t runner = {0,}; xlator_t *this = NULL; @@ -5594,32 +6226,23 @@ glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave, GF_ASSERT (priv); uuid_utoa_r (MY_UUID, uuid_str); - if (strcmp (uuid_str, glusterd_uuid_str)) - goto out; - - ret = gsync_status (master_vol->volname, slave, &status); - if (status == 0) - goto out; - snprintf (buf, PATH_MAX, "%s/"GEOREP"/%s", priv->workdir, master_vol->volname); - ret = mkdir_p (buf, 0777, _gf_true); - if (ret) { - errcode = -1; + if (!path_list) { + ret = 0; + gf_log ("", GF_LOG_DEBUG, "No Bricks in this node." + " Not starting gsyncd."); goto out; } - snprintf (buf, PATH_MAX, DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/%s", - master_vol->volname); - ret = mkdir_p (buf, 0777, _gf_true); - if (ret) { - errcode = -1; + ret = gsync_status (master_vol->volname, slave, conf_path, &status); + if (status == 0) goto out; - } uuid_utoa_r (master_vol->volume_id, uuid_str); runinit (&runner); - runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL); - runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir); + runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", + path_list, "-c", NULL); + runner_argprintf (&runner, "%s", conf_path); runner_argprintf (&runner, ":%s", master_vol->volname); runner_add_args (&runner, slave, "--config-set", "session-owner", uuid_str, NULL); @@ -5632,9 +6255,12 @@ glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave, } runinit (&runner); - runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--monitor", "-c", NULL); - runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir); + runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", + path_list, "--monitor", "-c", NULL); + runner_argprintf (&runner, "%s", conf_path); runner_argprintf (&runner, ":%s", master_vol->volname); + runner_argprintf (&runner, "--glusterd-uuid=%s", + uuid_utoa (priv->uuid)); runner_add_arg (&runner, slave); synclock_unlock (&priv->big_lock); ret = runner_run (&runner); @@ -5650,7 +6276,7 @@ glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave, out: if ((ret != 0) && errcode == -1) { if (op_errstr) - *op_errstr = gf_strdup ("internal error, cannot start" + *op_errstr = gf_strdup ("internal error, cannot start " "the " GEOREP " session"); } @@ -5659,17 +6285,38 @@ out: } int32_t -glusterd_recreate_bricks (glusterd_conf_t *conf) +glusterd_recreate_volfiles (glusterd_conf_t *conf) { glusterd_volinfo_t *volinfo = NULL; int ret = 0; + int op_ret = 0; GF_ASSERT (conf); list_for_each_entry (volinfo, &conf->volumes, vol_list) { ret = generate_brick_volfiles (volinfo); + if (ret) { + gf_log ("glusterd", GF_LOG_ERROR, "Failed to " + "regenerate brick volfiles for %s", + volinfo->volname); + op_ret = ret; + } + ret = generate_client_volfiles (volinfo, GF_CLIENT_TRUSTED); + if (ret) { + gf_log ("glusterd", GF_LOG_ERROR, "Failed to " + "regenerate trusted client volfiles for %s", + volinfo->volname); + op_ret = ret; + } + ret = generate_client_volfiles (volinfo, GF_CLIENT_OTHER); + if (ret) { + gf_log ("glusterd", GF_LOG_ERROR, "Failed to " + "regenerate client volfiles for %s", + volinfo->volname); + op_ret = ret; + } } - return ret; + return op_ret; } int32_t @@ -5679,7 +6326,7 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf) char *type = NULL; gf_boolean_t upgrade = _gf_false; gf_boolean_t downgrade = _gf_false; - gf_boolean_t regenerate_brick_volfiles = _gf_false; + gf_boolean_t regenerate_volfiles = _gf_false; gf_boolean_t terminate = _gf_false; ret = dict_get_str (options, "upgrade", &type); @@ -5692,7 +6339,7 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf) goto out; } if (_gf_true == upgrade) - regenerate_brick_volfiles = _gf_true; + regenerate_volfiles = _gf_true; } ret = dict_get_str (options, "downgrade", &type); @@ -5717,8 +6364,8 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf) ret = 0; else terminate = _gf_true; - if (regenerate_brick_volfiles) { - ret = glusterd_recreate_bricks (conf); + if (regenerate_volfiles) { + ret = glusterd_recreate_volfiles (conf); } out: if (terminate && (ret == 0)) @@ -6113,6 +6760,7 @@ glusterd_volinfo_reset_defrag_stats (glusterd_volinfo_t *volinfo) rebal->lookedup_files = 0; rebal->rebalance_failures = 0; rebal->rebalance_time = 0; + rebal->skipped_files = 0; } @@ -6209,6 +6857,7 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo, uint64_t lookup = 0; gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED; uint64_t failures = 0; + uint64_t skipped = 0; xlator_t *this = NULL; double run_time = 0; @@ -6239,6 +6888,11 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo, gf_log (this->name, GF_LOG_TRACE, "failed to get failure count"); + ret = dict_get_uint64 (rsp_dict, "skipped", &skipped); + if (ret) + gf_log (this->name, GF_LOG_TRACE, + "failed to get skipped count"); + ret = dict_get_double (rsp_dict, "run-time", &run_time); if (ret) gf_log (this->name, GF_LOG_TRACE, @@ -6254,6 +6908,8 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo, volinfo->rebal.defrag_status = status; if (failures) volinfo->rebal.rebalance_failures = failures; + if (skipped) + volinfo->rebal.skipped_files = skipped; if (run_time) volinfo->rebal.rebalance_time = run_time; @@ -6261,6 +6917,68 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo, } int +glusterd_check_topology_identical (const char *filename1, + const char *filename2, + gf_boolean_t *identical) +{ + int ret = -1; /* FAILURE */ + xlator_t *this = NULL; + FILE *fp1 = NULL; + FILE *fp2 = NULL; + glusterfs_graph_t *grph1 = NULL; + glusterfs_graph_t *grph2 = NULL; + + if ((!filename1) || (!filename2) || (!identical)) + goto out; + + this = THIS; + + errno = 0; /* RESET the errno */ + + /* fopen() the volfile1 to create the graph */ + fp1 = fopen (filename1, "r"); + if (fp1 == NULL) { + gf_log (this->name, GF_LOG_ERROR, "fopen() on file: %s failed " + "(%s)", filename1, strerror (errno)); + goto out; + } + + /* fopen() the volfile2 to create the graph */ + fp2 = fopen (filename2, "r"); + if (fp2 == NULL) { + gf_log (this->name, GF_LOG_ERROR, "fopen() on file: %s failed " + "(%s)", filename2, strerror (errno)); + goto out; + } + + /* create the graph for filename1 */ + grph1 = glusterfs_graph_construct(fp1); + if (grph1 == NULL) + goto out; + + /* create the graph for filename2 */ + grph2 = glusterfs_graph_construct(fp2); + if (grph2 == NULL) + goto out; + + /* compare the graph topology */ + *identical = is_graph_topology_equal(grph1, grph2); + ret = 0; /* SUCCESS */ +out: + if (fp1) + fclose(fp1); + if (fp2) + fclose(fp2); + if (grph1) + glusterfs_graph_destroy(grph1); + if (grph2) + glusterfs_graph_destroy(grph2); + + gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret); + return ret; +} + +int glusterd_check_files_identical (char *filename1, char *filename2, gf_boolean_t *identical) { @@ -6520,6 +7238,7 @@ glusterd_gsync_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict, char *op_errstr) { dict_t *ctx = NULL; int ret = 0; + char *conf_path = NULL; if (aggr) { ctx = aggr; @@ -6541,6 +7260,17 @@ glusterd_gsync_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict, char *op_errstr) ret = glusterd_append_gsync_status (ctx, rsp_dict); if (ret) goto out; + + ret = dict_get_str (rsp_dict, "conf_path", &conf_path); + if (!ret && conf_path) { + ret = dict_set_dynstr (ctx, "conf_path", + gf_strdup(conf_path)); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to store conf path."); + goto out; + } + } } if ((op_errstr) && (strcmp ("", op_errstr))) { ret = dict_set_dynstr (ctx, "errstr", gf_strdup(op_errstr)); @@ -6761,7 +7491,7 @@ glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict) if (ret) goto out; - if (cmd & GF_CLI_STATUS_ALL && is_origin_glusterd ()) { + if (cmd & GF_CLI_STATUS_ALL && is_origin_glusterd (ctx_dict)) { ret = dict_get_int32 (rsp_dict, "vol_count", &vol_count); if (ret == 0) { ret = dict_set_int32 (ctx_dict, "vol_count", @@ -6783,6 +7513,12 @@ glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict) } } + if ((cmd & GF_CLI_STATUS_TASKS) != 0) { + dict_copy (rsp_dict, aggr); + ret = 0; + goto out; + } + ret = dict_get_int32 (rsp_dict, "count", &rsp_node_count); if (ret) { ret = 0; //no bricks in the rsp @@ -6985,6 +7721,18 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict) } memset (key, 0, 256); + snprintf (key, 256, "skipped-%d", index); + ret = dict_get_uint64 (rsp_dict, key, &value); + if (!ret) { + memset (key, 0, 256); + snprintf (key, 256, "skipped-%d", current_index); + ret = dict_set_uint64 (ctx_dict, key, value); + if (ret) { + gf_log (THIS->name, GF_LOG_DEBUG, + "failed to set skipped count"); + } + } + memset (key, 0, 256); snprintf (key, 256, "run-time-%d", index); ret = dict_get_double (rsp_dict, key, &elapsed_time); if (!ret) { @@ -7004,6 +7752,334 @@ out: } int +glusterd_snap_config_use_rsp_dict (dict_t *dst, dict_t *src) +{ + char buf[PATH_MAX] = ""; + char *volname = NULL; + int ret = -1; + int config_command = 0; + uint64_t i = 0; + uint64_t value = 0; + uint64_t voldisplaycount = 0; + + if (!dst || !src) { + gf_log ("", GF_LOG_ERROR, "Source or Destination " + "dict is empty."); + goto out; + } + + ret = dict_get_int32 (dst, "config-command", &config_command); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "failed to get config-command type"); + goto out; + } + + switch (config_command) { + case GF_SNAP_CONFIG_DISPLAY: + ret = dict_get_uint64 (src, "snap-max-hard-limit", &value); + if (!ret) { + ret = dict_set_uint64 (dst, "snap-max-hard-limit", value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to set snap_max_hard_limit"); + goto out; + } + } else { + /* Received dummy response from other nodes */ + ret = 0; + goto out; + } + + ret = dict_get_uint64 (src, "snap-max-soft-limit", &value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get snap_max_soft_limit"); + goto out; + } + + ret = dict_set_uint64 (dst, "snap-max-soft-limit", value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to set snap_max_soft_limit"); + goto out; + } + + ret = dict_get_uint64 (src, "voldisplaycount", + &voldisplaycount); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get voldisplaycount"); + goto out; + } + + ret = dict_set_uint64 (dst, "voldisplaycount", + voldisplaycount); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to set voldisplaycount"); + goto out; + } + + for (i = 0; i < voldisplaycount; i++) { + snprintf (buf, sizeof(buf), "volume%ld-volname", i); + ret = dict_get_str (src, buf, &volname); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get %s", buf); + goto out; + } + ret = dict_set_str (dst, buf, volname); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-snap-max-hard-limit", i); + ret = dict_get_uint64 (src, buf, &value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get %s", buf); + goto out; + } + ret = dict_set_uint64 (dst, buf, value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-active-hard-limit", i); + ret = dict_get_uint64 (src, buf, &value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get %s", buf); + goto out; + } + ret = dict_set_uint64 (dst, buf, value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to set %s", buf); + goto out; + } + + snprintf (buf, sizeof(buf), + "volume%ld-snap-max-soft-limit", i); + ret = dict_get_uint64 (src, buf, &value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to get %s", buf); + goto out; + } + ret = dict_set_uint64 (dst, buf, value); + if (ret) { + gf_log ("", GF_LOG_ERROR, + "Unable to set %s", buf); + goto out; + } + } + + break; + default: + break; + } + + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +/* Aggregate missed_snap_counts from different nodes and save it * + * in the req_dict of the originator node */ +int +glusterd_snap_create_use_rsp_dict (dict_t *dst, dict_t *src) +{ + char *buf = NULL; + char *tmp_str = NULL; + char name_buf[PATH_MAX] = ""; + int32_t i = -1; + int32_t ret = -1; + int32_t src_missed_snap_count = -1; + int32_t dst_missed_snap_count = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + + if (!dst || !src) { + gf_log (this->name, GF_LOG_ERROR, "Source or Destination " + "dict is empty."); + goto out; + } + + ret = dict_get_int32 (src, "missed_snap_count", + &src_missed_snap_count); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "No missed snaps"); + ret = 0; + goto out; + } + + ret = dict_get_int32 (dst, "missed_snap_count", + &dst_missed_snap_count); + if (ret) { + /* Initialize dst_missed_count for the first time */ + dst_missed_snap_count = 0; + } + + for (i = 0; i < src_missed_snap_count; i++) { + snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d", + i); + ret = dict_get_str (src, name_buf, &buf); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to fetch %s", name_buf); + goto out; + } + + snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d", + dst_missed_snap_count); + + tmp_str = gf_strdup (buf); + if (!tmp_str) { + ret = -1; + goto out; + } + + ret = dict_set_dynstr (dst, name_buf, tmp_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set %s", name_buf); + goto out; + } + + tmp_str = NULL; + dst_missed_snap_count++; + } + + ret = dict_set_int32 (dst, "missed_snap_count", dst_missed_snap_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to set dst_missed_snap_count"); + goto out; + } + +out: + if (ret && tmp_str) + GF_FREE(tmp_str); + + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int +glusterd_snap_use_rsp_dict (dict_t *dst, dict_t *src) +{ + int ret = -1; + int32_t snap_command = 0; + + if (!dst || !src) { + gf_log ("", GF_LOG_ERROR, "Source or Destination " + "dict is empty."); + goto out; + } + + ret = dict_get_int32 (dst, "type", &snap_command); + if (ret) { + gf_log ("", GF_LOG_ERROR, "unable to get the type of " + "the snapshot command"); + goto out; + } + + switch (snap_command) { + case GF_SNAP_OPTION_TYPE_CREATE: + case GF_SNAP_OPTION_TYPE_DELETE: + ret = glusterd_snap_create_use_rsp_dict (dst, src); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to use rsp dict"); + goto out; + } + break; + case GF_SNAP_OPTION_TYPE_CONFIG: + ret = glusterd_snap_config_use_rsp_dict (dst, src); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to use rsp dict"); + goto out; + } + break; + default: + // copy the response dictinary's contents to the dict to be + // sent back to the cli + dict_copy (src, dst); + break; + } + + ret = 0; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int +glusterd_sys_exec_output_rsp_dict (dict_t *dst, dict_t *src) +{ + char output_name[PATH_MAX] = ""; + char *output = NULL; + int ret = 0; + int i = 0; + int len = 0; + int src_output_count = 0; + int dst_output_count = 0; + + if (!dst || !src) { + gf_log ("", GF_LOG_ERROR, "Source or Destination " + "dict is empty."); + goto out; + } + + ret = dict_get_int32 (dst, "output_count", &dst_output_count); + + ret = dict_get_int32 (src, "output_count", &src_output_count); + if (ret) { + gf_log ("", GF_LOG_DEBUG, "No output from source"); + ret = 0; + goto out; + } + + for (i = 1; i <= src_output_count; i++) { + len = snprintf (output_name, sizeof(output_name) - 1, + "output_%d", i); + output_name[len] = '\0'; + ret = dict_get_str (src, output_name, &output); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to fetch %s", + output_name); + goto out; + } + + len = snprintf (output_name, sizeof(output_name) - 1, + "output_%d", i+dst_output_count); + output_name[len] = '\0'; + ret = dict_set_dynstr (dst, output_name, gf_strdup (output)); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to set %s", + output_name); + goto out; + } + } + + ret = dict_set_int32 (dst, "output_count", + dst_output_count+src_output_count); +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +int glusterd_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict) { int ret = 0; @@ -7176,6 +8252,77 @@ out: } int +_heal_volume_add_shd_rsp_of_statistics (dict_t *this, char *key, data_t + *value, void *data) +{ + char new_key[256] = {0,}; + char int_str[16] = {0,}; + char key_begin_string[128] = {0,}; + data_t *new_value = NULL; + char *rxl_end = NULL; + char *rxl_child_end = NULL; + glusterd_volinfo_t *volinfo = NULL; + char *key_begin_str = NULL; + int rxl_id = 0; + int rxl_child_id = 0; + int brick_id = 0; + int int_len = 0; + int ret = 0; + glusterd_heal_rsp_conv_t *rsp_ctx = NULL; + glusterd_brickinfo_t *brickinfo = NULL; + + rsp_ctx = data; + key_begin_str = strchr (key, '-'); + if (!key_begin_str) + goto out; + + int_len = strlen (key) - strlen (key_begin_str); + strncpy (key_begin_string, key, int_len); + key_begin_string[int_len] = '\0'; + + rxl_end = strchr (key_begin_str + 1, '-'); + if (!rxl_end) + goto out; + + int_len = strlen (key_begin_str) - strlen (rxl_end) - 1; + strncpy (int_str, key_begin_str + 1, int_len); + int_str[int_len] = '\0'; + ret = gf_string2int (int_str, &rxl_id); + if (ret) + goto out; + + + rxl_child_end = strchr (rxl_end + 1, '-'); + if (!rxl_child_end) + goto out; + + int_len = strlen (rxl_end) - strlen (rxl_child_end) - 1; + strncpy (int_str, rxl_end + 1, int_len); + int_str[int_len] = '\0'; + ret = gf_string2int (int_str, &rxl_child_id); + if (ret) + goto out; + + volinfo = rsp_ctx->volinfo; + brick_id = rxl_id * volinfo->replica_count + rxl_child_id; + + brickinfo = glusterd_get_brickinfo_by_position (volinfo, brick_id); + if (!brickinfo) + goto out; + if (!glusterd_is_local_brick (rsp_ctx->this, volinfo, brickinfo)) + goto out; + + new_value = data_copy (value); + snprintf (new_key, sizeof (new_key), "%s-%d%s", key_begin_string, + brick_id, rxl_child_end); + dict_set (rsp_ctx->dict, new_key, new_value); + +out: + return 0; + +} + +int glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict, dict_t *op_ctx, char **op_errstr) { @@ -7183,6 +8330,7 @@ glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict, glusterd_heal_rsp_conv_t rsp_ctx = {0}; char *volname = NULL; glusterd_volinfo_t *volinfo = NULL; + int heal_op = -1; GF_ASSERT (rsp_dict); GF_ASSERT (op_ctx); @@ -7194,6 +8342,13 @@ glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict, goto out; } + ret = dict_get_int32 (req_dict, "heal-op", &heal_op); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get heal_op"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); if (ret) @@ -7202,7 +8357,12 @@ glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict, rsp_ctx.dict = op_ctx; rsp_ctx.volinfo = volinfo; rsp_ctx.this = THIS; - dict_foreach (rsp_dict, _heal_volume_add_shd_rsp, &rsp_ctx); + if (heal_op == GF_AFR_OP_STATISTICS) + dict_foreach (rsp_dict, _heal_volume_add_shd_rsp_of_statistics, + &rsp_ctx); + else + dict_foreach (rsp_dict, _heal_volume_add_shd_rsp, &rsp_ctx); + out: return ret; @@ -7347,6 +8507,13 @@ glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict, gf_log (THIS->name, GF_LOG_ERROR, "failed to set failure count"); + memset (key, 0 , 256); + snprintf (key, 256, "skipped-%d", i); + ret = dict_set_uint64 (op_ctx, key, volinfo->rebal.skipped_files); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, + "failed to set skipped count"); + memset (key, 0, 256); snprintf (key, 256, "run-time-%d", i); ret = dict_set_double (op_ctx, key, volinfo->rebal.rebalance_time); @@ -7398,16 +8565,31 @@ glusterd_handle_node_rsp (dict_t *req_dict, void *pending_entry, * time a lock_owner is set */ gf_boolean_t -is_origin_glusterd () +is_origin_glusterd (dict_t *dict) { - int ret = 0; - uuid_t lock_owner = {0,}; + gf_boolean_t ret = _gf_false; + uuid_t lock_owner = {0,}; + uuid_t *originator_uuid = NULL; - ret = glusterd_get_lock_owner (&lock_owner); - if (ret) - return _gf_false; + GF_ASSERT (dict); + + ret = dict_get_bin (dict, "originator_uuid", + (void **) &originator_uuid); + if (ret) { + /* If not originator_uuid has been set, then the command + * has been originated from a glusterd running on older version + * Hence fetching the lock owner */ + ret = glusterd_get_lock_owner (&lock_owner); + if (ret) { + ret = _gf_false; + goto out; + } + ret = !uuid_compare (MY_UUID, lock_owner); + } else + ret = !uuid_compare (MY_UUID, *originator_uuid); - return (uuid_compare (MY_UUID, lock_owner) == 0); +out: + return ret; } int @@ -7470,54 +8652,6 @@ glusterd_copy_uuid_to_dict (uuid_t uuid, dict_t *dict, char *key) return 0; } -gf_boolean_t -glusterd_is_same_address (char *name1, char *name2) -{ - struct addrinfo *addr1 = NULL; - struct addrinfo *addr2 = NULL; - struct addrinfo *p = NULL; - struct addrinfo *q = NULL; - gf_boolean_t ret = _gf_false; - int gai_err = 0; - - gai_err = getaddrinfo(name1,NULL,NULL,&addr1); - if (gai_err != 0) { - gf_log (name1, GF_LOG_WARNING, - "error in getaddrinfo: %s\n", gai_strerror(gai_err)); - goto out; - } - - gai_err = getaddrinfo(name2,NULL,NULL,&addr2); - if (gai_err != 0) { - gf_log (name2, GF_LOG_WARNING, - "error in getaddrinfo: %s\n", gai_strerror(gai_err)); - goto out; - } - - for (p = addr1; p; p = p->ai_next) { - for (q = addr2; q; q = q->ai_next) { - if (p->ai_addrlen != q->ai_addrlen) { - continue; - } - if (memcmp(p->ai_addr,q->ai_addr,p->ai_addrlen)) { - continue; - } - ret = _gf_true; - goto out; - } - } - -out: - if (addr1) { - freeaddrinfo(addr1); - } - if (addr2) { - freeaddrinfo(addr2); - } - return ret; - -} - int _update_volume_op_versions (dict_t *this, char *key, data_t *value, void *data) { @@ -7590,3 +8724,171 @@ gd_update_volume_op_versions (glusterd_volinfo_t *volinfo) return; } + +/* A task is committed/completed once the task-id for it is cleared */ +gf_boolean_t +gd_is_remove_brick_committed (glusterd_volinfo_t *volinfo) +{ + GF_ASSERT (volinfo); + + if ((GD_OP_REMOVE_BRICK == volinfo->rebal.op) && + !uuid_is_null (volinfo->rebal.rebalance_id)) + return _gf_false; + + return _gf_true; +} + +gf_boolean_t +glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo, + struct list_head *peers, + char **down_peerstr) +{ + glusterd_peerinfo_t *peerinfo = NULL; + glusterd_brickinfo_t *brickinfo = NULL; + gf_boolean_t ret = _gf_false; + + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + if (!uuid_compare (brickinfo->uuid, MY_UUID)) + continue; + + list_for_each_entry (peerinfo, peers, uuid_list) { + if (uuid_compare (peerinfo->uuid, brickinfo->uuid)) + continue; + + /*Found peer who owns the brick, return false + * if peer is not connected or not friend */ + if (!(peerinfo->connected) || + (peerinfo->state.state != + GD_FRIEND_STATE_BEFRIENDED)) { + *down_peerstr = gf_strdup (peerinfo->hostname); + gf_log ("", GF_LOG_DEBUG, "Peer %s is down. ", + peerinfo->hostname); + goto out; + } + } + } + + ret = _gf_true; +out: + gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} + +gf_boolean_t +glusterd_is_status_tasks_op (glusterd_op_t op, dict_t *dict) +{ + int ret = -1; + uint32_t cmd = GF_CLI_STATUS_NONE; + gf_boolean_t is_status_tasks = _gf_false; + + if (op != GD_OP_STATUS_VOLUME) + goto out; + + ret = dict_get_uint32 (dict, "cmd", &cmd); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Failed to get opcode"); + goto out; + } + + if (cmd & GF_CLI_STATUS_TASKS) + is_status_tasks = _gf_true; + +out: + return is_status_tasks; +} + +int +glusterd_compare_snap_time(struct list_head *list1, struct list_head *list2) +{ + glusterd_snap_t *snap1 = NULL; + glusterd_snap_t *snap2 = NULL; + double diff_time = 0; + + GF_ASSERT (list1); + GF_ASSERT (list2); + + snap1 = list_entry(list1, glusterd_snap_t, snap_list); + snap2 = list_entry(list2, glusterd_snap_t, snap_list); + diff_time = difftime(snap1->time_stamp, snap2->time_stamp); + + return ((int)diff_time); +} + +int +glusterd_compare_snap_vol_time(struct list_head *list1, struct list_head *list2) +{ + glusterd_volinfo_t *snapvol1 = NULL; + glusterd_volinfo_t *snapvol2 = NULL; + double diff_time = 0; + + GF_ASSERT (list1); + GF_ASSERT (list2); + + snapvol1 = list_entry(list1, glusterd_volinfo_t, snapvol_list); + snapvol2 = list_entry(list2, glusterd_volinfo_t, snapvol_list); + diff_time = difftime(snapvol1->snapshot->time_stamp, + snapvol2->snapshot->time_stamp); + + return ((int)diff_time); +} + +int32_t +glusterd_missed_snapinfo_new (glusterd_missed_snap_info **missed_snapinfo) +{ + glusterd_missed_snap_info *new_missed_snapinfo = NULL; + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (missed_snapinfo); + + new_missed_snapinfo = GF_CALLOC (1, sizeof(*new_missed_snapinfo), + gf_gld_mt_missed_snapinfo_t); + + if (!new_missed_snapinfo) + goto out; + + new_missed_snapinfo->node_snap_info = NULL; + INIT_LIST_HEAD (&new_missed_snapinfo->missed_snaps); + INIT_LIST_HEAD (&new_missed_snapinfo->snap_ops); + + *missed_snapinfo = new_missed_snapinfo; + + ret = 0; + +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} + +int32_t +glusterd_missed_snap_op_new (glusterd_snap_op_t **snap_op) +{ + glusterd_snap_op_t *new_snap_op = NULL; + int32_t ret = -1; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (snap_op); + + new_snap_op = GF_CALLOC (1, sizeof(*new_snap_op), + gf_gld_mt_missed_snapinfo_t); + + if (!new_snap_op) + goto out; + + new_snap_op->brick_path = NULL; + new_snap_op->brick_num = -1; + new_snap_op->op = -1; + new_snap_op->status = -1; + INIT_LIST_HEAD (&new_snap_op->snap_ops_list); + + *snap_op = new_snap_op; + + ret = 0; +out: + gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret); + return ret; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index 026d7e68f..56bb799bf 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -81,6 +81,11 @@ glusterd_submit_request (struct rpc_clnt *rpc, void *req, int32_t glusterd_volinfo_new (glusterd_volinfo_t **volinfo); +int32_t +glusterd_volinfo_dup (glusterd_volinfo_t *volinfo, + glusterd_volinfo_t **dup_volinfo, + gf_boolean_t set_userauth); + char * glusterd_auth_get_username (glusterd_volinfo_t *volinfo); @@ -115,8 +120,23 @@ int32_t glusterd_peer_hostname_new (char *hostname, glusterd_peer_hostname_t **name); int32_t +glusterd_snap_volinfo_find (char *volname, glusterd_snap_t *snap, + glusterd_volinfo_t **volinfo); +int32_t +glusterd_snap_volinfo_find_from_parent_volname (char *origin_volname, + glusterd_snap_t *snap, + glusterd_volinfo_t **volinfo); + +int32_t glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo); +int +glusterd_volinfo_find_by_volume_id (uuid_t volume_id, glusterd_volinfo_t **volinfo); + +int +glusterd_snap_volinfo_find_by_volume_id (uuid_t volume_id, + glusterd_volinfo_t **volinfo); + int32_t glusterd_service_stop(const char *service, char *pidfile, int sig, gf_boolean_t force_kill); @@ -148,9 +168,6 @@ glusterd_volume_brickinfo_get_by_brick (char *brick, glusterd_volinfo_t *volinfo, glusterd_brickinfo_t **brickinfo); -gf_boolean_t -glusterd_is_local_addr (char *hostname); - int32_t glusterd_build_volume_dict (dict_t **vols); @@ -282,6 +299,7 @@ glusterd_is_defrag_on (glusterd_volinfo_t *volinfo); int32_t glusterd_volinfo_bricks_delete (glusterd_volinfo_t *volinfo); + int glusterd_friend_find_by_uuid (uuid_t uuid, glusterd_peerinfo_t **peerinfo); @@ -318,7 +336,7 @@ glusterd_rb_check_bricks (glusterd_volinfo_t *volinfo, int glusterd_check_and_set_brick_xattr (char *host, char *path, uuid_t uuid, - char **op_errstr); + char **op_errstr, gf_boolean_t is_force); int glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo, @@ -351,7 +369,7 @@ gf_boolean_t glusterd_peerinfo_is_uuid_unknown (glusterd_peerinfo_t *peerinfo); int32_t glusterd_brick_connect (glusterd_volinfo_t *volinfo, - glusterd_brickinfo_t *brickinfo); + glusterd_brickinfo_t *brickinfo, char *socketpath); int32_t glusterd_brick_disconnect (glusterd_brickinfo_t *brickinfo); int32_t @@ -359,15 +377,25 @@ glusterd_delete_volume (glusterd_volinfo_t *volinfo); int32_t glusterd_delete_brick (glusterd_volinfo_t* volinfo, glusterd_brickinfo_t *brickinfo); + int32_t glusterd_delete_all_bricks (glusterd_volinfo_t* volinfo); + int glusterd_spawn_daemons (void *opaque); + int glusterd_restart_gsyncds (glusterd_conf_t *conf); + int glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave, - char *glusterd_uuid_str, char **op_errstr); + char *path_list, char *conf_path, + char *glusterd_uuid_str, + char **op_errstr); +int +glusterd_get_local_brickpaths (glusterd_volinfo_t *volinfo, + char **pathlist); + int32_t glusterd_recreate_bricks (glusterd_conf_t *conf); int32_t @@ -450,6 +478,12 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo, int glusterd_check_files_identical (char *filename1, char *filename2, gf_boolean_t *identical); + +int +glusterd_check_topology_identical (const char *filename1, + const char *filename2, + gf_boolean_t *identical); + void glusterd_volinfo_reset_defrag_stats (glusterd_volinfo_t *volinfo); int @@ -471,6 +505,10 @@ int glusterd_volume_heal_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict); int glusterd_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict); +int +glusterd_sys_exec_output_rsp_dict (dict_t *aggr, dict_t *rsp_dict); +int +glusterd_snap_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict); int32_t glusterd_handle_node_rsp (dict_t *req_ctx, void *pending_entry, glusterd_op_t op, dict_t *rsp_dict, dict_t *op_ctx, @@ -487,11 +525,16 @@ glusterd_profile_volume_brick_rsp (void *pending_entry, dict_t *rsp_dict, dict_t *op_ctx, char **op_errstr, gd_node_type type); +gf_boolean_t +glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo, + struct list_head *peers, + char **down_peerstr); + /* Should be used only when an operation is in progress, as that is the only * time a lock_owner is set */ gf_boolean_t -is_origin_glusterd (); +is_origin_glusterd (dict_t *dict); gf_boolean_t glusterd_is_quorum_changed (dict_t *options, char *option, char *value); @@ -525,6 +568,67 @@ glusterd_is_same_address (char *name1, char *name2); void gd_update_volume_op_versions (glusterd_volinfo_t *volinfo); + char* gd_peer_uuid_str (glusterd_peerinfo_t *peerinfo); + +gf_boolean_t +gd_is_remove_brick_committed (glusterd_volinfo_t *volinfo); + +gf_boolean_t +glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo, + struct list_head *peers, + char **down_peerstr); + +int +glusterd_get_slave_details_confpath (glusterd_volinfo_t *volinfo, dict_t *dict, + char **slave_ip, char **slave_vol, + char **conf_path, char **op_errstr); + +int +glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave, + dict_t *resp_dict, char *path_list, + char *conf_path, gf_boolean_t is_force); + +int +glusterd_check_gsync_running_local (char *master, char *slave, + char *conf_path, + gf_boolean_t *is_run); + +gf_boolean_t +glusterd_is_status_tasks_op (glusterd_op_t op, dict_t *dict); + +#ifdef GF_LINUX_HOST_OS +char* +glusterd_get_brick_mount_details (glusterd_brickinfo_t *brickinfo); +struct mntent * +glusterd_get_mnt_entry_info (char *mnt_pt, FILE *mtab); +int +glusterd_get_brick_root (char *path, char **mount_point); +#endif //LINUX_HOST + +int +glusterd_compare_snap_time(struct list_head *, struct list_head *); + +int +glusterd_compare_snap_vol_time(struct list_head *, struct list_head *); + +int32_t +glusterd_snap_volinfo_restore (dict_t *rsp_dict, + glusterd_volinfo_t *new_volinfo, + glusterd_volinfo_t *snap_volinfo); +int32_t +glusterd_lvm_snapshot_remove (dict_t *rsp_dict, glusterd_volinfo_t *snap_vol); + +int32_t +glusterd_missed_snapinfo_new (glusterd_missed_snap_info **missed_snapinfo); + +int32_t +glusterd_missed_snap_op_new (glusterd_snap_op_t **snap_op); + +int32_t +glusterd_add_missed_snaps_to_dict (dict_t *rsp_dict, char *snap_uuid, + glusterd_brickinfo_t *brickinfo, + int32_t brick_number, int32_t op); + #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 31e9fa379..6f3c69e7d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -28,6 +28,8 @@ #include "logging.h" #include "dict.h" #include "graph-utils.h" +#include "glusterd-store.h" +#include "glusterd-hooks.h" #include "trie.h" #include "glusterd-mem-types.h" #include "cli1-xdr.h" @@ -594,6 +596,8 @@ get_server_xlator (char *xlator) subvol = GF_XLATOR_MARKER; if (strcmp (xlator, "io-stats") == 0) subvol = GF_XLATOR_IO_STATS; + if (strcmp (xlator, "bd") == 0) + subvol = GF_XLATOR_BD; return subvol; } @@ -1419,9 +1423,8 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, char *password = NULL; char index_basepath[PATH_MAX] = {0}; char key[1024] = {0}; - char *vgname = NULL; - char *vg = NULL; glusterd_brickinfo_t *brickinfo = NULL; + char changelog_basepath[PATH_MAX] = {0,}; brickinfo = param; path = brickinfo->path; @@ -1440,47 +1443,62 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, } } - if (volinfo->backend == GD_VOL_BK_BD) { - xl = volgen_graph_add (graph, "storage/bd_map", volname); - if (!xl) - return -1; + xl = volgen_graph_add (graph, "storage/posix", volname); + if (!xl) + return -1; - ret = xlator_set_option (xl, "device", "vg"); - if (ret) - return -1; + ret = xlator_set_option (xl, "directory", path); + if (ret) + return -1; - vg = gf_strdup (path); - vgname = strrchr (vg, '/'); - if (strchr(vg, '/') != vgname) { - gf_log ("glusterd", GF_LOG_ERROR, - "invalid vg specified %s", path); - GF_FREE (vg); - goto out; - } - vgname++; - ret = xlator_set_option (xl, "export", vgname); - GF_FREE (vg); - if (ret) - return -1; - } else { - xl = volgen_graph_add (graph, "storage/posix", volname); + ret = xlator_set_option (xl, "volume-id", + uuid_utoa (volinfo->volume_id)); + if (ret) + return -1; + + ret = check_and_add_debug_xl (graph, set_dict, volname, + "posix"); + if (ret) + return -1; +#ifdef HAVE_BD_XLATOR + if (*brickinfo->vg != '\0') { + /* Now add BD v2 xlator if volume is BD type */ + xl = volgen_graph_add (graph, "storage/bd", volname); if (!xl) return -1; - ret = xlator_set_option (xl, "directory", path); + ret = xlator_set_option (xl, "device", "vg"); if (ret) return -1; - - ret = xlator_set_option (xl, "volume-id", - uuid_utoa (volinfo->volume_id)); + ret = xlator_set_option (xl, "export", brickinfo->vg); if (ret) return -1; - ret = check_and_add_debug_xl (graph, set_dict, volname, - "posix"); + ret = check_and_add_debug_xl (graph, set_dict, volname, "bd"); if (ret) return -1; + } +#endif + + xl = volgen_graph_add (graph, "features/changelog", volname); + if (!xl) + return -1; + + ret = xlator_set_option (xl, "changelog-brick", path); + if (ret) + return -1; + + snprintf (changelog_basepath, sizeof (changelog_basepath), + "%s/%s", path, ".glusterfs/changelogs"); + ret = xlator_set_option (xl, "changelog-dir", changelog_basepath); + if (ret) + return -1; + + ret = check_and_add_debug_xl (graph, set_dict, volname, "changelog"); + if (ret) + return -1; + xl = volgen_graph_add (graph, "features/access-control", volname); if (!xl) return -1; @@ -1596,7 +1614,8 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, } /* Check for read-only volume option, and add it to the graph */ - if (dict_get_str_boolean (set_dict, "features.read-only", 0)) { + if (dict_get_str_boolean (set_dict, "features.read-only", 0) + || volinfo -> is_snap_volume) { xl = volgen_graph_add (graph, "features/read-only", volname); if (!xl) { ret = -1; @@ -1613,6 +1632,18 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, } } + /* Check for compress volume option, and add it to the graph on server side */ + if (dict_get_str_boolean (set_dict, "features.compress", 0)) { + xl = volgen_graph_add (graph, "features/cdc", volname); + if (!xl) { + ret = -1; + goto out; + } + ret = dict_set_str (set_dict, "compress.mode", "server"); + if (ret) + goto out; + } + xl = volgen_graph_add_as (graph, "debug/io-stats", path); if (!xl) return -1; @@ -1676,16 +1707,11 @@ static int perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme, void *param) { - char *volname = NULL; gf_boolean_t enabled = _gf_false; - xlator_t *this = NULL; - glusterd_conf_t *conf = NULL; - - this = THIS; - GF_ASSERT (this); - conf = this->private; + glusterd_volinfo_t *volinfo = NULL; - volname = param; + GF_ASSERT (param); + volinfo = param; if (strcmp (vme->option, "!perf") != 0) return 0; @@ -1698,10 +1724,10 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme, /* Check op-version before adding the 'open-behind' xlator in the graph */ if (!strcmp (vme->key, "performance.open-behind") && - (vme->op_version > conf->op_version)) + (vme->op_version > volinfo->client_op_version)) return 0; - if (volgen_graph_add (graph, vme->voltype, volname)) + if (volgen_graph_add (graph, vme->voltype, volinfo->volname)) return 0; else return -1; @@ -1925,8 +1951,6 @@ _free_xlator_opt_key (char *key) int glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out) { - - char *xlator_type = NULL; void *dl_handle = NULL; volume_opt_list_t vol_opt_handle = {{0},}; char *key = NULL; @@ -1959,21 +1983,29 @@ glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out) descr = vme->description; def_val = vme->value; } else { - if (_get_xlator_opt_key_from_vme (vme, &key)) + if (_get_xlator_opt_key_from_vme (vme, &key)) { + gf_log ("glusterd", GF_LOG_DEBUG, "Failed to " + "get %s key from volume option entry", + vme->key); goto out; /*Some error while geting key*/ + } - if (!xlator_type || strcmp (vme->voltype, xlator_type)){ - ret = xlator_volopt_dynload (vme->voltype, - &dl_handle, - &vol_opt_handle); - if (ret) { - ret = 0; - goto cont; - } + ret = xlator_volopt_dynload (vme->voltype, + &dl_handle, + &vol_opt_handle); + + if (ret) { + gf_log ("glusterd", GF_LOG_DEBUG, + "xlator_volopt_dynload error(%d)", ret); + ret = 0; + goto cont; } + ret = xlator_option_info_list (&vol_opt_handle, key, &def_val, &descr); if (ret) { /*Swallow Error i.e if option not found*/ + gf_log ("glusterd", GF_LOG_DEBUG, + "Failed to get option for %s key", key); ret = 0; goto cont; } @@ -2295,22 +2327,33 @@ volgen_graph_build_dht_cluster (volgen_graph_t *graph, int ret = -1; char *decommissioned_children = NULL; xlator_t *dht = NULL; - char *optstr = NULL; - gf_boolean_t use_nufa = _gf_false; + char *voltype = "cluster/distribute"; - if (dict_get_str(volinfo->dict,"cluster.nufa",&optstr) == 0) { - /* Keep static analyzers quiet by "using" the value. */ - ret = gf_string2boolean(optstr,&use_nufa); + /* NUFA and Switch section */ + if (dict_get_str_boolean (volinfo->dict, "cluster.nufa", 0) && + dict_get_str_boolean (volinfo->dict, "cluster.switch", 0)) { + gf_log (THIS->name, GF_LOG_ERROR, + "nufa and switch cannot be set together"); + ret = -1; + goto out; } + /* Check for NUFA volume option, and change the voltype */ + if (dict_get_str_boolean (volinfo->dict, "cluster.nufa", 0)) + voltype = "cluster/nufa"; + + /* Check for switch volume option, and change the voltype */ + if (dict_get_str_boolean (volinfo->dict, "cluster.switch", 0)) + voltype = "cluster/switch"; + clusters = volgen_graph_build_clusters (graph, volinfo, - use_nufa - ? "cluster/nufa" - : "cluster/distribute", + voltype, "%s-dht", - child_count, child_count); + child_count, + child_count); if (clusters < 0) goto out; + dht = first_of (graph); ret = _graph_get_decommissioned_children (dht, volinfo, &decommissioned_children); @@ -2404,7 +2447,7 @@ build_distribute: ret = volgen_graph_build_dht_cluster (graph, volinfo, dist_count); - if (ret) + if (ret == -1) goto out; ret = 0; @@ -2412,6 +2455,29 @@ out: return ret; } +static int client_graph_set_perf_options(volgen_graph_t *graph, + glusterd_volinfo_t *volinfo, + dict_t *set_dict) +{ + data_t *tmp_data = NULL; + char *volname = NULL; + + /* + * Logic to make sure NFS doesn't have performance translators by + * default for a volume + */ + volname = volinfo->volname; + tmp_data = dict_get (set_dict, "nfs-volume-file"); + if (!tmp_data) + return volgen_graph_set_options_generic(graph, set_dict, + volname, + &perfxl_option_handler); + else + return volgen_graph_set_options_generic(graph, set_dict, + volname, + &nfsperfxl_option_handler); +} + static int client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, dict_t *set_dict, void *param) @@ -2419,7 +2485,6 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, int ret = 0; xlator_t *xl = NULL; char *volname = NULL; - data_t *tmp_data = NULL; volname = volinfo->volname; ret = volgen_graph_build_clients (graph, volinfo, set_dict, param); @@ -2427,8 +2492,33 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, goto out; ret = volume_volgen_graph_build_clusters (graph, volinfo); - if (ret) + if (ret == -1) + goto out; + + /* Check for compress volume option, and add it to the graph on client side */ + if (dict_get_str_boolean (set_dict, "features.compress", 0)) { + xl = volgen_graph_add (graph, "features/cdc", volname); + if (!xl) { + ret = -1; + goto out; + } + ret = dict_set_str (set_dict, "compress.mode", "client"); + if (ret) + goto out; + + } + + ret = glusterd_volinfo_get_boolean (volinfo, "features.encryption"); + if (ret == -1) goto out; + if (ret) { + xl = volgen_graph_add (graph, "encryption/crypt", volname); + + if (!xl) { + ret = -1; + goto out; + } + } ret = glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA); if (ret == -1) @@ -2442,16 +2532,20 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, } } - /* Logic to make sure NFS doesn't have performance translators by - default for a volume */ - tmp_data = dict_get (set_dict, "nfs-volume-file"); - if (!tmp_data) - ret = volgen_graph_set_options_generic (graph, set_dict, volname, - &perfxl_option_handler); - else - ret = volgen_graph_set_options_generic (graph, set_dict, volname, - &nfsperfxl_option_handler); + ret = glusterd_volinfo_get_boolean (volinfo, "features.file-snapshot"); + if (ret == -1) + goto out; + if (ret) { + xl = volgen_graph_add (graph, "features/qemu-block", volname); + + if (!xl) { + ret = -1; + goto out; + } + } + + ret = client_graph_set_perf_options(graph, volinfo, set_dict); if (ret) goto out; @@ -2876,6 +2970,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict) if (ret) goto out; + ret = xlator_set_option (nfsxl, "nfs.drc", "on"); + if (ret) + goto out; + list_for_each_entry (voliter, &priv->volumes, vol_list) { if (voliter->status != GLUSTERD_STATUS_STARTED) continue; @@ -3077,8 +3175,6 @@ glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo, return ret; } - - static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo) { @@ -3179,7 +3275,7 @@ enumerate_transport_reqs (gf_transport_type type, char **types) } } -static int +int generate_client_volfiles (glusterd_volinfo_t *volinfo, glusterd_client_type_t client_type) { @@ -3345,6 +3441,54 @@ out: } int +glusterd_check_nfs_topology_identical (gf_boolean_t *identical) +{ + char nfsvol[PATH_MAX] = {0,}; + char tmpnfsvol[PATH_MAX] = {0,}; + glusterd_conf_t *conf = NULL; + xlator_t *this = THIS; + int ret = -1; + int tmpclean = 0; + int tmpfd = -1; + + if ((!identical) || (!this) || (!this->private)) + goto out; + + conf = (glusterd_conf_t *) this->private; + + /* Fetch the original NFS volfile */ + glusterd_get_nodesvc_volfile ("nfs", conf->workdir, + nfsvol, sizeof (nfsvol)); + + /* Create the temporary NFS volfile */ + snprintf (tmpnfsvol, sizeof (tmpnfsvol), "/tmp/gnfs-XXXXXX"); + tmpfd = mkstemp (tmpnfsvol); + if (tmpfd < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to create temp file %s: (%s)", + tmpnfsvol, strerror (errno)); + goto out; + } + + tmpclean = 1; /* SET the flag to unlink() tmpfile */ + + ret = glusterd_create_global_volfile (build_nfs_graph, + tmpnfsvol, NULL); + if (ret) + goto out; + + /* Compare the topology of volfiles */ + ret = glusterd_check_topology_identical (nfsvol, tmpnfsvol, + identical); +out: + if (tmpfd >= 0) + close (tmpfd); + if (tmpclean) + unlink (tmpnfsvol); + return ret; +} + +int glusterd_check_nfs_volfile_identical (gf_boolean_t *identical) { char nfsvol[PATH_MAX] = {0,}; @@ -3793,3 +3937,129 @@ gd_is_boolean_option (char *key) return _gf_false; } + +/* This function will restore origin volume to it's snap. + * The restore operation will simply replace the Gluster origin + * volume with the snap volume. + * TODO: Multi-volume delete to be done. + * Cleanup in case of restore failure is pending. + * + * @param orig_vol volinfo of origin volume + * @param snap_vol volinfo of snapshot volume + * + * @return 0 on success and negative value on error + */ +int +gd_restore_snap_volume (dict_t *rsp_dict, + glusterd_volinfo_t *orig_vol, + glusterd_volinfo_t *snap_vol) +{ + int ret = -1; + glusterd_volinfo_t *new_volinfo = NULL; + glusterd_snap_t *snap = NULL; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; + glusterd_volinfo_t *temp_volinfo = NULL; + glusterd_volinfo_t *voliter = NULL; + + this = THIS; + GF_ASSERT (this); + GF_ASSERT (rsp_dict); + conf = this->private; + GF_ASSERT (conf); + + GF_VALIDATE_OR_GOTO (this->name, orig_vol, out); + GF_VALIDATE_OR_GOTO (this->name, snap_vol, out); + snap = snap_vol->snapshot; + GF_VALIDATE_OR_GOTO (this->name, snap, out); + + /* Snap volume must be stoped before performing the + * restore operation. + */ + ret = glusterd_stop_volume (snap_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to stop " + "snap volume"); + goto out; + } + + /* Create a new volinfo for the restored volume */ + ret = glusterd_volinfo_dup (snap_vol, &new_volinfo, _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create volinfo"); + goto out; + } + + /* Following entries need to be derived from origin volume. */ + strcpy (new_volinfo->volname, orig_vol->volname); + uuid_copy (new_volinfo->volume_id, orig_vol->volume_id); + new_volinfo->snap_count = orig_vol->snap_count; + new_volinfo->snap_max_hard_limit = orig_vol->snap_max_hard_limit; + new_volinfo->is_volume_restored = _gf_true; + + /* Bump the version of the restored volume, so that nodes * + * which are done can sync during handshake */ + new_volinfo->version = orig_vol->version; + + list_for_each_entry_safe (voliter, temp_volinfo, + &orig_vol->snap_volumes, snapvol_list) { + list_add_tail (&voliter->snapvol_list, + &new_volinfo->snap_volumes); + } + /* Copy the snap vol info to the new_volinfo.*/ + ret = glusterd_snap_volinfo_restore (rsp_dict, new_volinfo, snap_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to restore snap"); + (void)glusterd_volinfo_delete (new_volinfo); + goto out; + } + + /* If the orig_vol is already restored then we should delete + * the backend LVMs */ + if (orig_vol->is_volume_restored) { + ret = glusterd_lvm_snapshot_remove (rsp_dict, orig_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to remove " + "LVM backend"); + (void)glusterd_volinfo_delete (new_volinfo); + goto out; + } + } + + /* Once the new_volinfo is completely constructed then delete + * the orinal volinfo + */ + ret = glusterd_volinfo_delete (orig_vol); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to delete volinfo"); + (void)glusterd_volinfo_delete (new_volinfo); + goto out; + } + + /* New volinfo always shows the status as created. Therefore + * set the status to stop. */ + glusterd_set_volume_status (new_volinfo, GLUSTERD_STATUS_STOPPED); + + list_add_tail (&new_volinfo->vol_list, &conf->volumes); + + /* Now delete the snap entry. As a first step delete the snap + * volume information stored in store. */ + ret = glusterd_snap_remove (rsp_dict, snap, _gf_false, _gf_true); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "Failed to delete " + "snap %s", snap->snapname); + goto out; + } + + ret = glusterd_store_volinfo (new_volinfo, + GLUSTERD_VOLINFO_VER_AC_INCREMENT); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to store volinfo"); + goto out; + } + + ret = 0; +out: + + return ret; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h index 6a41f47c1..fcbaaf93e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.h +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h @@ -23,6 +23,8 @@ #define VKEY_DIAG_LAT_MEASUREMENT "diagnostics.latency-measurement" #define VKEY_FEATURES_LIMIT_USAGE "features.limit-usage" #define VKEY_MARKER_XTIME GEOREP".indexing" +#define VKEY_MARKER_XTIME_FORCE GEOREP".ignore-pid-check" +#define VKEY_CHANGELOG "changelog.changelog" #define VKEY_FEATURES_QUOTA "features.quota" #define AUTH_ALLOW_MAP_KEY "auth.allow" @@ -73,6 +75,7 @@ typedef enum { GF_XLATOR_INDEX, GF_XLATOR_MARKER, GF_XLATOR_IO_STATS, + GF_XLATOR_BD, GF_XLATOR_NONE, } glusterd_server_xlator_t; @@ -121,6 +124,10 @@ int glusterd_create_shd_volfile (); int glusterd_delete_volfile (glusterd_volinfo_t *volinfo, glusterd_brickinfo_t *brickinfo); +int +glusterd_delete_snap_volfile (glusterd_volinfo_t *volinfo, + glusterd_volinfo_t *snap_volinfo, + glusterd_brickinfo_t *brickinfo); int glusterd_volinfo_get (glusterd_volinfo_t *volinfo, char *key, char **value); int glusterd_volinfo_get_boolean (glusterd_volinfo_t *volinfo, char *key); @@ -134,11 +141,22 @@ glusterd_check_voloption_flags (char *key, int32_t flags); gf_boolean_t glusterd_is_valid_volfpath (char *volname, char *brick); int generate_brick_volfiles (glusterd_volinfo_t *volinfo); +int generate_snap_brick_volfiles (glusterd_volinfo_t *volinfo, + glusterd_volinfo_t *snap_volinfo); +int generate_client_volfiles (glusterd_volinfo_t *volinfo, + glusterd_client_type_t client_type); +int +generate_snap_client_volfiles (glusterd_volinfo_t *actual_volinfo, + glusterd_volinfo_t *snap_volinfo, + glusterd_client_type_t client_type, + gf_boolean_t vol_restore); int glusterd_get_volopt_content (dict_t *dict, gf_boolean_t xml_out); char* glusterd_get_trans_type_rb (gf_transport_type ttype); int glusterd_check_nfs_volfile_identical (gf_boolean_t *identical); +int +glusterd_check_nfs_topology_identical (gf_boolean_t *identical); uint32_t glusterd_get_op_version_for_key (char *key); @@ -152,4 +170,7 @@ gd_is_xlator_option (char *key); gf_boolean_t gd_is_boolean_option (char *key); +int gd_restore_snap_volume (dict_t *rsp_dict, + glusterd_volinfo_t *orig_vol, + glusterd_volinfo_t *snap_vol); #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 17b3f6182..0d322b9ad 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -30,6 +30,7 @@ #define glusterd_op_start_volume_args_get(dict, volname, flags) \ glusterd_op_stop_volume_args_get (dict, volname, flags) + int __glusterd_handle_create_volume (rpcsvc_request_t *req) { @@ -604,31 +605,96 @@ glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req) } #ifdef HAVE_BD_XLATOR +/* + * Validates if given VG in the brick exists or not. Also checks if VG has + * GF_XATTR_VOL_ID_KEY tag set to avoid using same VG for multiple bricks. + * Tag is checked only during glusterd_op_stage_create_volume. Tag is set during + * glusterd_validate_and_create_brickpath(). + * @brick - brick info, @check_tag - check for VG tag or not + * @msg - Error message to return to caller + */ int -glusterd_is_valid_vg (const char *name) +glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg) { - lvm_t handle = NULL; - vg_t vg = NULL; - char *vg_name = NULL; - int retval = -1; + lvm_t handle = NULL; + vg_t vg = NULL; + char *vg_name = NULL; + int retval = 0; + char *p = NULL; + char *ptr = NULL; + struct dm_list *dm_lvlist = NULL; + struct dm_list *dm_seglist = NULL; + struct lvm_lv_list *lv_list = NULL; + struct lvm_property_value prop = {0, }; + struct lvm_lvseg_list *seglist = NULL; + struct dm_list *taglist = NULL; + struct lvm_str_list *strl = NULL; handle = lvm_init (NULL); if (!handle) { - gf_log ("", GF_LOG_ERROR, "lvm_init failed"); + sprintf (msg, "lvm_init failed, could not validate vg"); return -1; } - vg_name = gf_strdup (name); - vg = lvm_vg_open (handle, basename (vg_name), "r", 0); + if (*brick->vg == '\0') { /* BD xlator has vg in brick->path */ + p = gf_strdup (brick->path); + vg_name = strtok_r (p, "/", &ptr); + } else + vg_name = brick->vg; + + vg = lvm_vg_open (handle, vg_name, "r", 0); if (!vg) { - gf_log ("", GF_LOG_ERROR, "no such vg: %s", vg_name); + sprintf (msg, "no such vg: %s", vg_name); + retval = -1; goto out; } + if (!check_tag) + goto next; + + taglist = lvm_vg_get_tags (vg); + if (!taglist) + goto next; + + dm_list_iterate_items (strl, taglist) { + if (!strncmp(strl->str, GF_XATTR_VOL_ID_KEY, + strlen (GF_XATTR_VOL_ID_KEY))) { + sprintf (msg, "VG %s is already part of" + " a brick", vg_name); + retval = -1; + goto out; + } + } +next: + + brick->caps = CAPS_BD | CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT; + + dm_lvlist = lvm_vg_list_lvs (vg); + if (!dm_lvlist) + goto out; + + dm_list_iterate_items (lv_list, dm_lvlist) { + dm_seglist = lvm_lv_list_lvsegs (lv_list->lv); + dm_list_iterate_items (seglist, dm_seglist) { + prop = lvm_lvseg_get_property (seglist->lvseg, + "segtype"); + if (!prop.is_valid || !prop.value.string) + continue; + if (!strcmp (prop.value.string, "thin-pool")) { + brick->caps |= CAPS_THIN; + gf_log (THIS->name, GF_LOG_INFO, "Thin Pool " + "\"%s\" will be used for thin LVs", + lvm_lv_get_name (lv_list->lv)); + break; + } + } + } + retval = 0; out: if (vg) lvm_vg_close (vg); lvm_quit (handle); - GF_FREE (vg_name); + if (p) + GF_FREE (p); return retval; } #endif @@ -653,9 +719,6 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr) char msg[2048] = {0}; uuid_t volume_uuid; char *volume_uuid_str; -#ifdef HAVE_BD_XLATOR - char *dev_type = NULL; -#endif gf_boolean_t is_force = _gf_false; this = THIS; @@ -700,10 +763,6 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr) goto out; } -#ifdef HAVE_BD_XLATOR - ret = dict_get_str (dict, "device", &dev_type); -#endif - ret = dict_get_str (dict, "bricks", &bricks); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Unable to get bricks for " @@ -752,19 +811,15 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr) goto out; } + if (!uuid_compare (brick_info->uuid, MY_UUID)) { + #ifdef HAVE_BD_XLATOR - if (dev_type) { - ret = glusterd_is_valid_vg (brick_info->path); - if (ret) { - snprintf (msg, sizeof(msg), "invalid vg %s", - brick_info->path); - goto out; + if (brick_info->vg[0]) { + ret = glusterd_is_valid_vg (brick_info, 1, msg); + if (ret) + goto out; } - - break; - } else #endif - if (!uuid_compare (brick_info->uuid, MY_UUID)) { ret = glusterd_validate_and_create_brickpath (brick_info, volume_uuid, op_errstr, is_force); @@ -862,6 +917,7 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr) uuid_t volume_id = {0,}; char volid[50] = {0,}; char xattr_volid[50] = {0,}; + int caps = 0; this = THIS; GF_ASSERT (this); @@ -911,11 +967,10 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr) if (uuid_compare (brickinfo->uuid, MY_UUID)) continue; - if (volinfo->backend == GD_VOL_BK_BD) - continue; - ret = gf_lstat_dir (brickinfo->path, NULL); - if (ret) { + if (ret && (flags & GF_CLI_FLAG_OP_FORCE)) { + continue; + } else if (ret) { snprintf (msg, sizeof (msg), "Failed to find " "brick directory %s for volume %s. " "Reason : %s", brickinfo->path, @@ -956,8 +1011,24 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr) ret = -1; goto out; } +#ifdef HAVE_BD_XLATOR + if (brickinfo->vg[0]) + caps = CAPS_BD | CAPS_THIN | + CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT; + /* Check for VG/thin pool if its BD volume */ + if (brickinfo->vg[0]) { + ret = glusterd_is_valid_vg (brickinfo, 0, msg); + if (ret) + goto out; + /* if anyone of the brick does not have thin support, + disable it for entire volume */ + caps &= brickinfo->caps; + } else + caps = 0; +#endif } + volinfo->caps = caps; ret = 0; out: if (ret && (msg[0] != '\0')) { @@ -1112,6 +1183,16 @@ glusterd_op_stage_delete_volume (dict_t *dict, char **op_errstr) goto out; } + if (volinfo->snap_count > 0 || !list_empty(&volinfo->snap_volumes)) { + snprintf (msg, sizeof (msg), "Cannot delete Volume %s ," + "as it has %ld snapshots. " + "To delete the volume, " + "first delete all the snapshots under it.", + volname, volinfo->snap_count); + ret = -1; + goto out; + } + ret = 0; out: @@ -1209,14 +1290,22 @@ glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr) goto out; } - if ((heal_op != GF_AFR_OP_INDEX_SUMMARY) && - !glusterd_is_nodesvc_online ("glustershd")) { - ret = -1; - *op_errstr = gf_strdup ("Self-heal daemon is not running." - " Check self-heal daemon log file."); - gf_log (this->name, GF_LOG_WARNING, "%s", "Self-heal daemon is " - "not running. Check self-heal daemon log file."); - goto out; + switch (heal_op) { + case GF_AFR_OP_INDEX_SUMMARY: + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + break; + default: + if (!glusterd_is_nodesvc_online("glustershd")){ + ret = -1; + *op_errstr = gf_strdup ("Self-heal daemon is " + "not running. Check self-heal " + "daemon log file."); + gf_log (this->name, GF_LOG_WARNING, "%s", + "Self-heal daemon is not running." + "Check self-heal daemon log file."); + goto out; + } } ret = 0; @@ -1340,109 +1429,6 @@ out: return ret; } -#ifdef HAVE_BD_XLATOR -int -glusterd_op_stage_bd (dict_t *dict, char **op_errstr) -{ - int ret = -1; - char *volname = NULL; - char *path = NULL; - char *size = NULL; - glusterd_volinfo_t *volinfo = NULL; - char msg[2048] = {0,}; - gf_xl_bd_op_t bd_op = GF_BD_OP_INVALID; - uint64_t bytes = 0; - - ret = dict_get_str (dict, "volname", &volname); - if (ret) { - snprintf (msg, sizeof(msg), "Failed to get volume name"); - gf_log (THIS->name, GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - goto out; - } - - ret = dict_get_int32 (dict, "bd-op", (int32_t *)&bd_op); - if (ret) { - snprintf (msg, sizeof(msg), "Failed to get bd-op"); - gf_log (THIS->name, GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - goto out; - } - - ret = dict_get_str (dict, "path", &path); - if (ret) { - snprintf (msg, sizeof(msg), "Failed to get path"); - gf_log (THIS->name, GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - goto out; - } - - if (bd_op == GF_BD_OP_NEW_BD) { - ret = dict_get_str (dict, "size", &size); - if (ret) { - snprintf (msg, sizeof(msg), "Failed to get size"); - gf_log ("", GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - goto out; - } - if (gf_string2bytesize (size, &bytes) < 0) { - snprintf (msg, sizeof(msg), - "Invalid size %s, suffix with KB, MB etc", - size); - gf_log ("", GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - ret = -1; - goto out; - } - } else if (bd_op == GF_BD_OP_SNAPSHOT_BD) { - ret = dict_get_str (dict, "size", &size); - if (ret) { - snprintf (msg, sizeof(msg), "Failed to get size"); - gf_log ("", GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - goto out; - } - - if (gf_string2bytesize (size, &bytes) < 0) { - ret = -1; - snprintf (msg, sizeof(msg), - "Invalid size %s, suffix with KB, MB etc", - size); - gf_log ("", GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - goto out; - } - } - - ret = glusterd_volinfo_find (volname, &volinfo); - if (ret) { - snprintf (msg, sizeof(msg), "Volume %s does not exist", - volname); - gf_log ("", GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - goto out; - } - - ret = glusterd_validate_volume_id (dict, volinfo); - if (ret) - goto out; - - if (!glusterd_is_volume_started (volinfo)) { - snprintf (msg, sizeof(msg), "Volume %s is not started", - volname); - gf_log ("", GF_LOG_ERROR, "%s", msg); - *op_errstr = gf_strdup (msg); - ret = -1; - goto out; - } - - ret = 0; -out: - gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); - return ret; -} -#endif - int glusterd_op_create_volume (dict_t *dict, char **op_errstr) { @@ -1464,9 +1450,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) char *str = NULL; char *username = NULL; char *password = NULL; -#ifdef HAVE_BD_XLATOR - char *device = NULL; -#endif + int caps = 0; + char msg[1024] __attribute__((unused)) = {0, }; this = THIS; GF_ASSERT (this); @@ -1521,12 +1506,6 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) goto out; } -#ifdef HAVE_BD_XLATOR - ret = dict_get_str (dict, "device", &device); - if (!ret) - volinfo->backend = GD_VOL_BK_BD; -#endif - /* replica-count 1 means, no replication, file is in one brick only */ volinfo->replica_count = 1; /* stripe-count 1 means, no striping, file is present as a whole */ @@ -1635,6 +1614,7 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) if (count) brick = strtok_r (brick_list+1, " \n", &saveptr); + caps = CAPS_BD | CAPS_THIN | CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT; while ( i <= count) { ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo); @@ -1647,11 +1627,36 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) brickinfo->hostname, brickinfo->path); goto out; } + +#ifdef HAVE_BD_XLATOR + if (!uuid_compare (brickinfo->uuid, MY_UUID)) { + if (brickinfo->vg[0]) { + ret = glusterd_is_valid_vg (brickinfo, 0, msg); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s", + msg); + goto out; + } + + /* if anyone of the brick does not have thin + support, disable it for entire volume */ + caps &= brickinfo->caps; + + + } else + caps = 0; + } +#endif + list_add_tail (&brickinfo->brick_list, &volinfo->bricks); brick = strtok_r (NULL, " \n", &saveptr); i++; } + gd_update_volume_op_versions (volinfo); + + volinfo->caps = caps; + ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); if (ret) { glusterd_store_delete_volume (volinfo); @@ -1669,7 +1674,6 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) list_add_tail (&volinfo->vol_list, &priv->volumes); vol_added = _gf_true; - gd_update_volume_op_versions (volinfo); out: GF_FREE(free_ptr); if (!vol_added && volinfo) @@ -1703,7 +1707,10 @@ glusterd_op_start_volume (dict_t *dict, char **op_errstr) list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { ret = glusterd_brick_start (volinfo, brickinfo, _gf_true); - if (ret) + /* If 'force' try to start all bricks regardless of success or + * failure + */ + if (!(flags & GF_CLI_FLAG_OP_FORCE) && ret) goto out; } @@ -1720,6 +1727,47 @@ out: return ret; } +int +glusterd_stop_volume (glusterd_volinfo_t *volinfo) +{ + int ret = -1; + glusterd_brickinfo_t *brickinfo = NULL; + xlator_t *this = NULL; + + this = THIS; + GF_ASSERT (this); + + GF_VALIDATE_OR_GOTO (this->name, volinfo, out); + + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + ret = glusterd_brick_stop (volinfo, brickinfo, _gf_false); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to stop " + "brick (%s)", brickinfo->path); + goto out; + } + } + + glusterd_set_volume_status (volinfo, GLUSTERD_STATUS_STOPPED); + + ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to store volinfo of " + "%s volume", volinfo->volname); + goto out; + } + + ret = glusterd_nodesvcs_handle_graph_change (volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to notify graph " + "change for %s volume", volinfo->volname); + goto out; + } + +out: + return ret; +} + int glusterd_op_stop_volume (dict_t *dict) @@ -1728,7 +1776,6 @@ glusterd_op_stop_volume (dict_t *dict) int flags = 0; char *volname = NULL; glusterd_volinfo_t *volinfo = NULL; - glusterd_brickinfo_t *brickinfo = NULL; xlator_t *this = NULL; this = THIS; @@ -1745,19 +1792,12 @@ glusterd_op_stop_volume (dict_t *dict) goto out; } - list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { - ret = glusterd_brick_stop (volinfo, brickinfo, _gf_false); - if (ret) - goto out; - } - - glusterd_set_volume_status (volinfo, GLUSTERD_STATUS_STOPPED); - - ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); - if (ret) + ret = glusterd_stop_volume (volinfo); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to stop %s volume", + volname); goto out; - - ret = glusterd_nodesvcs_handle_graph_change (volinfo); + } out: return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 6d1715bc8..665a8b298 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -376,35 +376,60 @@ struct volopt_map_entry glusterd_volopt_map[] = { }, { .key = "cluster.readdir-optimize", .voltype = "cluster/distribute", - .op_version = 2, - .flags = OPT_FLAG_CLIENT_OPT - }, - { .key = "cluster.nufa", - .voltype = "cluster/distribute", - .option = "!nufa", - .type = NO_DOC, - .op_version = 2, + .op_version = 1, .flags = OPT_FLAG_CLIENT_OPT }, { .key = "cluster.rsync-hash-regex", .voltype = "cluster/distribute", .type = NO_DOC, - .op_version = 2, + .op_version = 3, .flags = OPT_FLAG_CLIENT_OPT }, { .key = "cluster.extra-hash-regex", .voltype = "cluster/distribute", .type = NO_DOC, - .op_version = 2, + .op_version = 3, .flags = OPT_FLAG_CLIENT_OPT }, { .key = "cluster.dht-xattr-name", .voltype = "cluster/distribute", .option = "xattr-name", .type = NO_DOC, + .op_version = 3, + .flags = OPT_FLAG_CLIENT_OPT + }, + + /* NUFA xlator options (Distribute special case) */ + { .key = "cluster.nufa", + .voltype = "cluster/distribute", + .option = "!nufa", + .type = NO_DOC, .op_version = 2, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.local-volume-name", + .voltype = "cluster/nufa", + .option = "local-volume-name", + .type = NO_DOC, + .op_version = 3, + .flags = OPT_FLAG_CLIENT_OPT + }, + + /* Switch xlator options (Distribute special case) */ + { .key = "cluster.switch", + .voltype = "cluster/distribute", + .option = "!switch", + .type = NO_DOC, + .op_version = 3, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "cluster.switch-pattern", + .voltype = "cluster/switch", + .option = "pattern.switch.case", + .type = NO_DOC, + .op_version = 3, + .flags = OPT_FLAG_CLIENT_OPT + }, /* AFR xlator options */ { .key = "cluster.entry-change-log", @@ -524,6 +549,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = 2, .flags = OPT_FLAG_CLIENT_OPT }, + { .key = "cluster.ensure-durability", + .voltype = "cluster/replicate", + .op_version = 3, + .flags = OPT_FLAG_CLIENT_OPT + }, /* Stripe xlator options */ { .key = "cluster.stripe-block-size", @@ -536,7 +566,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { { .key = "cluster.stripe-coalesce", .voltype = "cluster/stripe", .option = "coalesce", - .op_version = 2, + .op_version = 1, .flags = OPT_FLAG_CLIENT_OPT }, @@ -638,11 +668,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { }, { .key = "performance.enable-least-priority", .voltype = "performance/io-threads", - .op_version = 2 + .op_version = 1 }, { .key = "performance.least-rate-limit", .voltype = "performance/io-threads", - .op_version = 1 + .op_version = 2 }, /* Other perf xlators' options */ @@ -678,7 +708,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { { .key = "performance.lazy-open", .voltype = "performance/open-behind", .option = "lazy-open", - .op_version = 2, + .op_version = 3, .flags = OPT_FLAG_CLIENT_OPT }, { .key = "performance.read-ahead-page-count", @@ -694,6 +724,34 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = OPT_FLAG_CLIENT_OPT }, + /* Crypt xlator options */ + + { .key = "features.encryption", + .voltype = "encryption/crypt", + .option = "!feat", + .value = "off", + .op_version = 3, + .description = "enable/disable client-side encryption for " + "the volume.", + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT + }, + + { .key = "encryption.master-key", + .voltype = "encryption/crypt", + .op_version = 3, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "encryption.data-key-size", + .voltype = "encryption/crypt", + .op_version = 3, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "encryption.block-size", + .voltype = "encryption/crypt", + .op_version = 3, + .flags = OPT_FLAG_CLIENT_OPT + }, + /* Client xlator options */ { .key = "network.frame-timeout", .voltype = "protocol/client", @@ -732,7 +790,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { { .key = "network.remote-dio", .voltype = "protocol/client", .option = "filter-O_DIRECT", - .op_version = 1, + .op_version = 2, .flags = OPT_FLAG_CLIENT_OPT }, @@ -778,6 +836,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { .option = "statedump-path", .op_version = 1 }, + { .key = "server.outstanding-rpc-limit", + .voltype = "protocol/server", + .option = "rpc.outstanding-rpc-limit", + .type = GLOBAL_DOC, + .op_version = 3 + }, { .key = "features.lock-heal", .voltype = "protocol/server", .option = "lk-heal", @@ -815,6 +879,15 @@ struct volopt_map_entry glusterd_volopt_map[] = { .description = "enable/disable read-ahead translator in the volume.", .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT }, + { .key = "performance.readdir-ahead", + .voltype = "performance/readdir-ahead", + .option = "!perf", + .value = "off", + .op_version = 3, + .description = "enable/disable readdir-ahead translator in the volume.", + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT + }, + { .key = "performance.io-cache", .voltype = "performance/io-cache", .option = "!perf", @@ -914,6 +987,61 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = OPT_FLAG_CLIENT_OPT }, + /* Feature translators */ + { .key = "features.file-snapshot", + .voltype = "features/qemu-block", + .option = "!feat", + .value = "off", + .op_version = 3, + .description = "enable/disable file-snapshot feature in the " + "volume.", + .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT + }, + +#ifdef HAVE_LIB_Z + /* Compressor-decompressor xlator options + * defaults used from xlator/feature/compress/src/cdc.h + */ + { .key = "features.compress", + .voltype = "features/cdc", + .option = "!compress", + .value = "off", + .type = NO_DOC, + .op_version = 2, + .description = "enable/disable compression translator" + }, + { .key = "compress.mode", + .voltype = "features/cdc", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "compress.window-size", + .voltype = "features/cdc", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "compress.mem-level", + .voltype = "features/cdc", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "compress.min-size", + .voltype = "features/cdc", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "compress.compression-level", + .voltype = "features/cdc", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "compress.debug", + .voltype = "features/cdc", + .type = NO_DOC, + .op_version = 2 + }, + #endif + /* Quota xlator options */ { .key = VKEY_FEATURES_LIMIT_USAGE, .voltype = "features/quota", @@ -935,7 +1063,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { .option = "deem-statfs", .value = "off", .type = DOC, - .op_version = 2, + .op_version = 3, .validate_fn = validate_quota, .flags = OPT_FLAG_CLIENT_OPT }, @@ -957,6 +1085,22 @@ struct volopt_map_entry glusterd_volopt_map[] = { .flags = OPT_FLAG_FORCE, .op_version = 1 }, + { .key = VKEY_MARKER_XTIME_FORCE, + .voltype = "features/marker", + .option = "gsync-force-xtime", + .value = "off", + .type = NO_DOC, + .flags = OPT_FLAG_FORCE, + .op_version = 2 + }, + { .key = VKEY_MARKER_XTIME_FORCE, + .voltype = "features/marker", + .option = "!gsync-force-xtime", + .value = "off", + .type = NO_DOC, + .flags = OPT_FLAG_FORCE, + .op_version = 2 + }, { .key = VKEY_FEATURES_QUOTA, .voltype = "features/marker", .option = "quota", @@ -1011,25 +1155,25 @@ struct volopt_map_entry glusterd_volopt_map[] = { .voltype = "debug/error-gen", .option = "failure", .type = NO_DOC, - .op_version = 2 + .op_version = 3 }, { .key = "debug.error-number", .voltype = "debug/error-gen", .option = "error-no", .type = NO_DOC, - .op_version = 2 + .op_version = 3 }, { .key = "debug.random-failure", .voltype = "debug/error-gen", .option = "random-failure", .type = NO_DOC, - .op_version = 2 + .op_version = 3 }, { .key = "debug.error-fops", .voltype = "debug/error-gen", .option = "enable", .type = NO_DOC, - .op_version = 2 + .op_version = 3 }, @@ -1076,6 +1220,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { .type = GLOBAL_DOC, .op_version = 1 }, + { .key = "nfs.outstanding-rpc-limit", + .voltype = "nfs/server", + .option = "rpc.outstanding-rpc-limit", + .type = GLOBAL_DOC, + .op_version = 3 + }, { .key = "nfs.port", .voltype = "nfs/server", .option = "nfs.port", @@ -1146,25 +1296,67 @@ struct volopt_map_entry glusterd_volopt_map[] = { .type = GLOBAL_DOC, .op_version = 1 }, + { .key = "nfs.acl", + .voltype = "nfs/server", + .option = "nfs.acl", + .type = GLOBAL_DOC, + .op_version = 3 + }, { .key = "nfs.mount-udp", .voltype = "nfs/server", .option = "nfs.mount-udp", .type = GLOBAL_DOC, .op_version = 1 }, + { .key = "nfs.mount-rmtab", + .voltype = "nfs/server", + .option = "nfs.mount-rmtab", + .type = GLOBAL_DOC, + .op_version = 1 + }, { .key = "nfs.server-aux-gids", .voltype = "nfs/server", .option = "nfs.server-aux-gids", .type = NO_DOC, .op_version = 2 }, + { .key = "nfs.drc", + .voltype = "nfs/server", + .option = "nfs.drc", + .type = GLOBAL_DOC, + .op_version = 3 + }, + { .key = "nfs.drc-size", + .voltype = "nfs/server", + .option = "nfs.drc-size", + .type = GLOBAL_DOC, + .op_version = 3 + }, + { .key = "nfs.read-size", + .voltype = "nfs/server", + .option = "nfs3.read-size", + .type = GLOBAL_DOC, + .op_version = 3 + }, + { .key = "nfs.write-size", + .voltype = "nfs/server", + .option = "nfs3.write-size", + .type = GLOBAL_DOC, + .op_version = 3 + }, + { .key = "nfs.readdir-size", + .voltype = "nfs/server", + .option = "nfs3.readdir-size", + .type = GLOBAL_DOC, + .op_version = 3 + }, /* Other options which don't fit any place above */ { .key = "features.read-only", .voltype = "features/read-only", .option = "!read-only", .value = "off", - .op_version = 2, + .op_version = 1, .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT }, { .key = "features.worm", @@ -1176,21 +1368,37 @@ struct volopt_map_entry glusterd_volopt_map[] = { }, { .key = "storage.linux-aio", .voltype = "storage/posix", - .op_version = 2 + .op_version = 1 + }, + { .key = "storage.batch-fsync-mode", + .voltype = "storage/posix", + .op_version = 3 + }, + { .key = "storage.batch-fsync-delay-usec", + .voltype = "storage/posix", + .op_version = 3 }, { .key = "storage.owner-uid", .voltype = "storage/posix", .option = "brick-uid", - .op_version = 2 + .op_version = 1 }, { .key = "storage.owner-gid", .voltype = "storage/posix", .option = "brick-gid", - .op_version = 2 + .op_version = 1 }, { .key = "storage.node-uuid-pathinfo", .voltype = "storage/posix", - .op_version = 2 + .op_version = 3 + }, + { .key = "storage.health-check-interval", + .voltype = "storage/posix", + .op_version = 3 + }, + { .key = "storage.bd-aio", + .voltype = "storage/bd", + .op_version = 3 }, { .key = "config.memory-accounting", .voltype = "configuration", @@ -1213,6 +1421,32 @@ struct volopt_map_entry glusterd_volopt_map[] = { .value = "0", .op_version = 2 }, + /* changelog translator - global tunables */ + { .key = "changelog.changelog", + .voltype = "features/changelog", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "changelog.changelog-dir", + .voltype = "features/changelog", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "changelog.encoding", + .voltype = "features/changelog", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "changelog.rollover-time", + .voltype = "features/changelog", + .type = NO_DOC, + .op_version = 2 + }, + { .key = "changelog.fsync-interval", + .voltype = "features/changelog", + .type = NO_DOC, + .op_version = 2 + }, { .key = NULL } }; diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c index 3b419d2e0..59288ada0 100644 --- a/xlators/mgmt/glusterd/src/glusterd.c +++ b/xlators/mgmt/glusterd/src/glusterd.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -36,6 +36,7 @@ #include "glusterd-store.h" #include "glusterd-hooks.h" #include "glusterd-utils.h" +#include "glusterd-locks.h" #include "common-utils.h" #include "run.h" @@ -44,29 +45,43 @@ #include "glusterd-mountbroker.h" extern struct rpcsvc_program gluster_handshake_prog; +extern struct rpcsvc_program gluster_cli_getspec_prog; extern struct rpcsvc_program gluster_pmap_prog; extern glusterd_op_info_t opinfo; extern struct rpcsvc_program gd_svc_mgmt_prog; +extern struct rpcsvc_program gd_svc_mgmt_v3_prog; extern struct rpcsvc_program gd_svc_peer_prog; extern struct rpcsvc_program gd_svc_cli_prog; +extern struct rpcsvc_program gd_svc_cli_prog_ro; extern struct rpc_clnt_program gd_brick_prog; extern struct rpcsvc_program glusterd_mgmt_hndsk_prog; +extern char snap_mount_folder[PATH_MAX]; + rpcsvc_cbk_program_t glusterd_cbk_prog = { .progname = "Gluster Callback", .prognum = GLUSTER_CBK_PROGRAM, .progver = GLUSTER_CBK_VERSION, }; -struct rpcsvc_program *all_programs[] = { +struct rpcsvc_program *gd_inet_programs[] = { &gd_svc_peer_prog, - &gd_svc_cli_prog, + &gd_svc_cli_prog_ro, &gd_svc_mgmt_prog, + &gd_svc_mgmt_v3_prog, &gluster_pmap_prog, &gluster_handshake_prog, &glusterd_mgmt_hndsk_prog, }; -int rpcsvc_programs_count = (sizeof (all_programs) / sizeof (all_programs[0])); +int gd_inet_programs_count = (sizeof (gd_inet_programs) / + sizeof (gd_inet_programs[0])); + +struct rpcsvc_program *gd_uds_programs[] = { + &gd_svc_cli_prog, + &gluster_cli_getspec_prog, +}; +int gd_uds_programs_count = (sizeof (gd_uds_programs) / + sizeof (gd_uds_programs[0])); const char *gd_op_list[GD_OP_MAX + 1] = { [GD_OP_NONE] = "Invalid op", @@ -94,6 +109,10 @@ const char *gd_op_list[GD_OP_MAX + 1] = { [GD_OP_LIST_VOLUME] = "Lists", [GD_OP_CLEARLOCKS_VOLUME] = "Clear locks", [GD_OP_DEFRAG_BRICK_VOLUME] = "Rebalance", + [GD_OP_COPY_FILE] = "Copy File", + [GD_OP_SYS_EXEC] = "Execute system commands", + [GD_OP_GSYNC_CREATE] = "Geo-replication Create", + [GD_OP_SNAP] = "Snapshot", [GD_OP_MAX] = "Invalid op" }; @@ -119,12 +138,12 @@ glusterd_uuid_init () GF_ASSERT (this); priv = this->private; - ret = glusterd_retrieve_uuid (); - if (ret == 0) { - gf_log (this->name, GF_LOG_INFO, - "retrieved UUID: %s", uuid_utoa (priv->uuid)); - return 0; - } + ret = glusterd_retrieve_uuid (); + if (ret == 0) { + gf_log (this->name, GF_LOG_INFO, + "retrieved UUID: %s", uuid_utoa (priv->uuid)); + return 0; + } ret = glusterd_uuid_generate_save (); @@ -519,7 +538,7 @@ runinit_gsyncd_setrx (runner_t *runner, glusterd_conf_t *conf) { runinit (runner); runner_add_args (runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL); - runner_argprintf (runner, "%s/"GSYNC_CONF,conf->workdir); + runner_argprintf (runner, "%s/"GSYNC_CONF_TEMPLATE, conf->workdir); runner_add_arg (runner, "--config-set-rx"); } @@ -581,7 +600,7 @@ configure_syncdaemon (glusterd_conf_t *conf) /* gluster-params */ runinit_gsyncd_setrx (&runner, conf); runner_add_args (&runner, "gluster-params", - "xlator-option=*-dht.assert-no-child-down=true", + "aux-gfid-mount xlator-option=*-dht.assert-no-child-down=true", ".", ".", NULL); RUN_GSYNCD_CMD; @@ -598,14 +617,30 @@ configure_syncdaemon (glusterd_conf_t *conf) /* pid-file */ runinit_gsyncd_setrx (&runner, conf); runner_add_arg (&runner, "pid-file"); - runner_argprintf (&runner, "%s/${mastervol}/${eSlave}.pid", georepdir); + runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.pid", georepdir); runner_add_args (&runner, ".", ".", NULL); RUN_GSYNCD_CMD; /* state-file */ runinit_gsyncd_setrx (&runner, conf); runner_add_arg (&runner, "state-file"); - runner_argprintf (&runner, "%s/${mastervol}/${eSlave}.status", georepdir); + runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.status", georepdir); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* state-detail-file */ + runinit_gsyncd_setrx (&runner, conf); + runner_add_arg (&runner, "state-detail-file"); + runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status", + georepdir); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* state-detail-file */ + runinit_gsyncd_setrx (&runner, conf); + runner_add_arg (&runner, "state-detail-file"); + runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status", + georepdir); runner_add_args (&runner, ".", ".", NULL); RUN_GSYNCD_CMD; @@ -633,10 +668,32 @@ configure_syncdaemon (glusterd_conf_t *conf) runinit_gsyncd_setrx (&runner, conf); runner_add_args (&runner, "gluster-log-file", - DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}.gluster.log", + DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}${local_id}.gluster.log", ".", ".", NULL); RUN_GSYNCD_CMD; + /* ignore-deletes */ + runinit_gsyncd_setrx (&runner, conf); + runner_add_args (&runner, "ignore-deletes", "true", ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* special-sync-mode */ + runinit_gsyncd_setrx (&runner, conf); + runner_add_args (&runner, "special-sync-mode", "partial", ".", ".", NULL); + RUN_GSYNCD_CMD; + + /* change-detector == changelog */ + runinit_gsyncd_setrx (&runner, conf); + runner_add_args(&runner, "change-detector", "changelog", ".", ".", NULL); + RUN_GSYNCD_CMD; + + runinit_gsyncd_setrx (&runner, conf); + runner_add_arg(&runner, "working-dir"); + runner_argprintf(&runner, "%s/${mastervol}/${eSlave}", + DEFAULT_VAR_RUN_DIRECTORY); + runner_add_args (&runner, ".", ".", NULL); + RUN_GSYNCD_CMD; + /************ * slave pre-configuration ************/ @@ -650,7 +707,7 @@ configure_syncdaemon (glusterd_conf_t *conf) /* gluster-params */ runinit_gsyncd_setrx (&runner, conf); runner_add_args (&runner, "gluster-params", - "xlator-option=*-dht.assert-no-child-down=true", + "aux-gfid-mount xlator-option=*-dht.assert-no-child-down=true", ".", NULL); RUN_GSYNCD_CMD; @@ -896,6 +953,187 @@ glusterd_launch_synctask (synctask_fn_t fn, void *opaque) " and other volume related services"); } +int +glusterd_uds_rpcsvc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, + void *data) +{ + /* glusterd_rpcsvc_notify() does stuff that calls coming in from the + * unix domain socket don't need. This is just an empty function to be + * used for the uds listener. This will be used later if required. + */ + return 0; +} + +/* The glusterd unix domain socket listener only listens for cli */ +rpcsvc_t * +glusterd_init_uds_listener (xlator_t *this) +{ + int ret = -1; + dict_t *options = NULL; + rpcsvc_t *rpc = NULL; + data_t *sock_data = NULL; + char sockfile[PATH_MAX+1] = {0,}; + int i = 0; + + + GF_ASSERT (this); + + sock_data = dict_get (this->options, "glusterd-sockfile"); + if (!sock_data) { + strncpy (sockfile, DEFAULT_GLUSTERD_SOCKFILE, PATH_MAX); + } else { + strncpy (sockfile, sock_data->data, PATH_MAX); + } + + options = dict_new (); + if (!options) + goto out; + + ret = rpcsvc_transport_unix_options_build (&options, sockfile); + if (ret) + goto out; + + rpc = rpcsvc_init (this, this->ctx, options, 8); + if (rpc == NULL) { + ret = -1; + goto out; + } + + ret = rpcsvc_register_notify (rpc, glusterd_uds_rpcsvc_notify, + this); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "Failed to register notify function"); + goto out; + } + + ret = rpcsvc_create_listeners (rpc, options, this->name); + if (ret != 1) { + gf_log (this->name, GF_LOG_DEBUG, "Failed to create listener"); + goto out; + } + ret = 0; + + for (i = 0; i < gd_uds_programs_count; i++) { + ret = glusterd_program_register (this, rpc, gd_uds_programs[i]); + if (ret) { + i--; + for (; i >= 0; i--) + rpcsvc_program_unregister (rpc, + gd_uds_programs[i]); + + goto out; + } + } + +out: + if (options) + dict_unref (options); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to start glusterd " + "unix domain socket listener."); + if (rpc) { + GF_FREE (rpc); + rpc = NULL; + } + } + return rpc; +} + +void +glusterd_stop_uds_listener (xlator_t *this) +{ + glusterd_conf_t *conf = NULL; + rpcsvc_listener_t *listener = NULL; + rpcsvc_listener_t *next = NULL; + + GF_ASSERT (this); + conf = this->private; + + (void) rpcsvc_program_unregister (conf->uds_rpc, &gd_svc_cli_prog); + (void) rpcsvc_program_unregister (conf->uds_rpc, &gluster_handshake_prog); + + list_for_each_entry_safe (listener, next, &conf->uds_rpc->listeners, + list) { + rpcsvc_listener_destroy (listener); + } + + (void) rpcsvc_unregister_notify (conf->uds_rpc, glusterd_rpcsvc_notify, + this); + + unlink (DEFAULT_GLUSTERD_SOCKFILE); + + GF_FREE (conf->uds_rpc); + conf->uds_rpc = NULL; + + return; +} + +static int +glusterd_init_snap_folder (xlator_t *this) +{ + int ret = -1; + struct stat buf = {0,}; + + GF_ASSERT (this); + + /* Snapshot volumes are mounted under /var/run/gluster/snaps folder. + * But /var/run is normally a symbolic link to /run folder, which + * creates problems as the entry point in the mtab for the mount point + * and glusterd maintained entry point will be different. Therefore + * identify the correct run folder and use it for snap volume mounting. + */ + ret = lstat (GLUSTERD_VAR_RUN_DIR, &buf); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "stat fails on %s, exiting. (errno = %d)", + GLUSTERD_VAR_RUN_DIR, errno); + goto out; + } + + /* If /var/run is symlink then use /run folder */ + if (S_ISLNK (buf.st_mode)) { + strcpy (snap_mount_folder, GLUSTERD_RUN_DIR); + } else { + strcpy (snap_mount_folder, GLUSTERD_VAR_RUN_DIR); + } + + strcat (snap_mount_folder, GLUSTERD_DEFAULT_SNAPS_BRICK_DIR); + + ret = stat (snap_mount_folder, &buf); + if ((ret != 0) && (ENOENT != errno)) { + gf_log (this->name, GF_LOG_ERROR, + "stat fails on %s, exiting. (errno = %d)", + snap_mount_folder, errno); + ret = -1; + goto out; + } + + if ((!ret) && (!S_ISDIR(buf.st_mode))) { + gf_log (this->name, GF_LOG_CRITICAL, + "Provided snap path %s is not a directory," + "exiting", snap_mount_folder); + ret = -1; + goto out; + } + + if ((-1 == ret) && (ENOENT == errno)) { + /* Create missing folders */ + ret = mkdir_p (snap_mount_folder, 0777, _gf_false); + + if (-1 == ret) { + gf_log (this->name, GF_LOG_CRITICAL, + "Unable to create directory %s" + " ,errno = %d", snap_mount_folder, errno); + goto out; + } + } + +out: + return ret; +} + /* * init - called during glusterd initialization * @@ -907,6 +1145,7 @@ init (xlator_t *this) { int32_t ret = -1; rpcsvc_t *rpc = NULL; + rpcsvc_t *uds_rpc = NULL; glusterd_conf_t *conf = NULL; data_t *dir_data = NULL; struct stat buf = {0,}; @@ -932,7 +1171,7 @@ init (xlator_t *this) if ((ret != 0) && (ENOENT != errno)) { gf_log (this->name, GF_LOG_ERROR, "stat fails on %s, exiting. (errno = %d)", - workdir, errno); + workdir, errno); exit (1); } @@ -957,9 +1196,17 @@ init (xlator_t *this) first_time = 1; } + setenv ("GLUSTERD_WORKING_DIR", workdir, 1); gf_log (this->name, GF_LOG_INFO, "Using %s as working directory", workdir); + ret = glusterd_init_snap_folder (this); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, "Unable to create " + "snap backend folder"); + exit (1); + } + snprintf (cmd_log_filename, PATH_MAX,"%s/.cmd_log_history", DEFAULT_LOG_FILE_DIRECTORY); ret = gf_cmd_log_init (cmd_log_filename); @@ -1057,18 +1304,29 @@ init (xlator_t *this) goto out; } - for (i = 0; i < rpcsvc_programs_count; i++) { - ret = glusterd_program_register (this, rpc, all_programs[i]); + for (i = 0; i < gd_inet_programs_count; i++) { + ret = glusterd_program_register (this, rpc, + gd_inet_programs[i]); if (ret) { i--; for (; i >= 0; i--) rpcsvc_program_unregister (rpc, - all_programs[i]); + gd_inet_programs[i]); goto out; } } + /* Start a unix domain socket listener just for cli commands + * This should prevent ports from being wasted by being in TIMED_WAIT + * when cli commands are done continuously + */ + uds_rpc = glusterd_init_uds_listener (this); + if (uds_rpc == NULL) { + ret = -1; + goto out; + } + conf = GF_CALLOC (1, sizeof (glusterd_conf_t), gf_gld_mt_glusterd_conf_t); GF_VALIDATE_OR_GOTO(this->name, conf, out); @@ -1081,8 +1339,12 @@ init (xlator_t *this) INIT_LIST_HEAD (&conf->peers); INIT_LIST_HEAD (&conf->volumes); + INIT_LIST_HEAD (&conf->snapshots); + INIT_LIST_HEAD (&conf->missed_snaps_list); + pthread_mutex_init (&conf->mutex, NULL); conf->rpc = rpc; + conf->uds_rpc = uds_rpc; conf->gfs_mgmt = &gd_brick_prog; strncpy (conf->workdir, workdir, PATH_MAX); @@ -1093,6 +1355,8 @@ init (xlator_t *this) glusterd_friend_sm_init (); glusterd_op_sm_init (); glusterd_opinfo_init (); + glusterd_mgmt_v3_lock_init (); + glusterd_txn_opinfo_dict_init (); ret = glusterd_sm_tr_log_init (&conf->op_sm_log, glusterd_op_sm_state_name_get, glusterd_op_sm_event_name_get, @@ -1100,6 +1364,12 @@ init (xlator_t *this) if (ret) goto out; + conf->base_port = GF_IANA_PRIV_PORTS_START; + if (dict_get_uint32(this->options, "base-port", &conf->base_port) == 0) { + gf_log (this->name, GF_LOG_INFO, + "base-port override: %d", conf->base_port); + } + /* Set option to run bricks on valgrind if enabled in glusterd.vol */ conf->valgrind = _gf_false; ret = dict_get_str (this->options, "run-with-valgrind", &valgrind_str); @@ -1199,11 +1469,17 @@ fini (xlator_t *this) goto out; conf = this->private; + + glusterd_stop_uds_listener (this); + FREE (conf->pmap); if (conf->handle) - glusterd_store_handle_destroy (conf->handle); + gf_store_handle_destroy (conf->handle); glusterd_sm_tr_log_delete (&conf->op_sm_log); + glusterd_mgmt_v3_lock_fini (); + glusterd_txn_opinfo_dict_fini (); GF_FREE (conf); + this->private = NULL; out: return; @@ -1304,5 +1580,18 @@ struct volume_options options[] = { .description = "Sets the quorum percentage for the trusted " "storage pool." }, + { .key = {"glusterd-sockfile"}, + .type = GF_OPTION_TYPE_PATH, + .description = "The socket file on which glusterd should listen for " + "cli requests. Default is "DEFAULT_GLUSTERD_SOCKFILE "." + }, + { .key = {"base-port"}, + .type = GF_OPTION_TYPE_INT, + .description = "Sets the base port for portmap query" + }, + { .key = {"snap-brick-path"}, + .type = GF_OPTION_TYPE_STR, + .description = "directory where the bricks for the snapshots will be created" + }, { .key = {NULL} }, }; diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index b466ac20c..3aa395ebc 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -37,16 +37,24 @@ #include "glusterd-pmap.h" #include "cli1-xdr.h" #include "syncop.h" +#include "store.h" #define GLUSTERD_MAX_VOLUME_NAME 1000 -#define DEFAULT_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs" #define GLUSTERD_TR_LOG_SIZE 50 #define GLUSTERD_NAME "glusterd" #define GLUSTERD_SOCKET_LISTEN_BACKLOG 128 #define GLUSTERD_QUORUM_TYPE_KEY "cluster.server-quorum-type" #define GLUSTERD_QUORUM_RATIO_KEY "cluster.server-quorum-ratio" #define GLUSTERD_GLOBAL_OPT_VERSION "global-option-version" +#define GLUSTERD_COMMON_PEM_PUB_FILE "/geo-replication/common_secret.pem.pub" +#define GEO_CONF_MAX_OPT_VALS 5 +#define GLUSTERD_CREATE_HOOK_SCRIPT "/hooks/1/gsync-create/post/" \ + "S56glusterd-geo-rep-create-post.sh" + +#define GLUSTERD_SNAPS_MAX_HARD_LIMIT 256 +#define GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT 90 +#define GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT 100 #define GLUSTERD_SERVER_QUORUM "server" #define FMTSTR_CHECK_VOL_EXISTS "Volume %s does not exist" @@ -67,6 +75,9 @@ struct glusterd_volinfo_; typedef struct glusterd_volinfo_ glusterd_volinfo_t; +struct glusterd_snap_; +typedef struct glusterd_snap_ glusterd_snap_t; + typedef enum glusterd_op_ { GD_OP_NONE = 0, GD_OP_CREATE_VOLUME, @@ -93,26 +104,22 @@ typedef enum glusterd_op_ { GD_OP_LIST_VOLUME, GD_OP_CLEARLOCKS_VOLUME, GD_OP_DEFRAG_BRICK_VOLUME, - GD_OP_BD_OP, + GD_OP_COPY_FILE, + GD_OP_SYS_EXEC, + GD_OP_GSYNC_CREATE, + GD_OP_SNAP, GD_OP_MAX, } glusterd_op_t; extern const char * gd_op_list[]; -struct glusterd_store_iter_ { - int fd; - FILE *file; - char filepath[PATH_MAX]; -}; - -typedef struct glusterd_store_iter_ glusterd_store_iter_t; struct glusterd_volgen { dict_t *dict; }; typedef struct { - struct rpc_clnt *rpc; - gf_boolean_t online; + struct rpc_clnt *rpc; + gf_boolean_t online; } nodesrv_t; typedef struct { @@ -122,37 +129,45 @@ typedef struct { } gd_global_opts_t; typedef struct { - struct _volfile_ctx *volfile; - pthread_mutex_t mutex; - struct list_head peers; - struct list_head xaction_peers; - gf_boolean_t verify_volfile_checksum; - gf_boolean_t trace; - uuid_t uuid; - char workdir[PATH_MAX]; - rpcsvc_t *rpc; - nodesrv_t *shd; - nodesrv_t *nfs; - struct pmap_registry *pmap; - struct list_head volumes; - pthread_mutex_t xprt_lock; - struct list_head xprt_list; - glusterd_store_handle_t *handle; - gf_timer_t *timer; - glusterd_sm_tr_log_t op_sm_log; - struct rpc_clnt_program *gfs_mgmt; - - struct list_head mount_specs; - gf_boolean_t valgrind; - pthread_t brick_thread; - void *hooks_priv; + struct _volfile_ctx *volfile; + pthread_mutex_t mutex; + struct list_head peers; + struct list_head xaction_peers; + gf_boolean_t verify_volfile_checksum; + gf_boolean_t trace; + uuid_t uuid; + char workdir[PATH_MAX]; + rpcsvc_t *rpc; + nodesrv_t *shd; + nodesrv_t *nfs; + struct pmap_registry *pmap; + struct list_head volumes; + struct list_head snapshots; /*List of snap volumes */ + pthread_mutex_t xprt_lock; + struct list_head xprt_list; + gf_store_handle_t *handle; + gf_timer_t *timer; + glusterd_sm_tr_log_t op_sm_log; + struct rpc_clnt_program *gfs_mgmt; + + struct list_head mount_specs; + gf_boolean_t valgrind; + pthread_t brick_thread; + void *hooks_priv; /* need for proper handshake_t */ - int op_version; /* Starts with 1 for 3.3.0 */ - xlator_t *xl; /* Should be set to 'THIS' before creating thread */ - gf_boolean_t pending_quorum_action; - dict_t *opts; - synclock_t big_lock; - gf_boolean_t restart_done; + int op_version; /* Starts with 1 for 3.3.0 */ + xlator_t *xl; /* Should be set to 'THIS' before creating thread */ + gf_boolean_t pending_quorum_action; + dict_t *opts; + synclock_t big_lock; + gf_boolean_t restart_done; + rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */ + uint32_t base_port; + uint64_t snap_max_hard_limit; + uint64_t snap_max_soft_limit; + char *snap_bricks_directory; + gf_store_handle_t *missed_snaps_list_shandle; + struct list_head missed_snaps_list; } glusterd_conf_t; @@ -162,18 +177,22 @@ typedef enum gf_brick_status { } gf_brick_status_t; struct glusterd_brickinfo { - char hostname[1024]; - char path[PATH_MAX]; - struct list_head brick_list; - uuid_t uuid; - int port; - int rdma_port; - char *logfile; - gf_boolean_t signed_in; - glusterd_store_handle_t *shandle; - gf_brick_status_t status; - struct rpc_clnt *rpc; - int decommissioned; + char hostname[1024]; + char path[PATH_MAX]; + char device_path[PATH_MAX]; + struct list_head brick_list; + uuid_t uuid; + int port; + int rdma_port; + char *logfile; + gf_boolean_t signed_in; + gf_store_handle_t *shandle; + gf_brick_status_t status; + struct rpc_clnt *rpc; + int decommissioned; + char vg[PATH_MAX]; /* FIXME: Use max size for length of vg */ + int caps; /* Capability */ + int32_t snap_status; }; typedef struct glusterd_brickinfo glusterd_brickinfo_t; @@ -196,7 +215,7 @@ struct glusterd_defrag_info_ { int cmd; pthread_t th; gf_defrag_status_t defrag_status; - struct rpc_clnt * rpc; + struct rpc_clnt *rpc; uint32_t connected; char mount[1024]; char databuf[131072]; @@ -229,83 +248,147 @@ struct _auth { typedef struct _auth auth_t; -typedef enum glusterd_vol_backend_ { - GD_VOL_BK_DEFAULT = 0, /* POSIX */ - GD_VOL_BK_BD = 1, -} glusterd_vol_backend_t; +/* Capabilities of xlator */ +#define CAPS_BD 0x00000001 +#define CAPS_THIN 0x00000002 +#define CAPS_OFFLOAD_COPY 0x00000004 +#define CAPS_OFFLOAD_SNAPSHOT 0x00000008 struct glusterd_rebalance_ { - gf_defrag_status_t defrag_status; - uint64_t rebalance_files; - uint64_t rebalance_data; - uint64_t lookedup_files; + gf_defrag_status_t defrag_status; + uint64_t rebalance_files; + uint64_t rebalance_data; + uint64_t lookedup_files; + uint64_t skipped_files; glusterd_defrag_info_t *defrag; - gf_cli_defrag_type defrag_cmd; - uint64_t rebalance_failures; - uuid_t rebalance_id; - double rebalance_time; - glusterd_op_t op; + gf_cli_defrag_type defrag_cmd; + uint64_t rebalance_failures; + uuid_t rebalance_id; + double rebalance_time; + glusterd_op_t op; + dict_t *dict; /* Dict to store misc information + * like list of bricks being removed */ }; typedef struct glusterd_rebalance_ glusterd_rebalance_t; struct glusterd_replace_brick_ { gf_rb_status_t rb_status; - glusterd_brickinfo_t *src_brick; - glusterd_brickinfo_t *dst_brick; + glusterd_brickinfo_t *src_brick; + glusterd_brickinfo_t *dst_brick; uuid_t rb_id; }; typedef struct glusterd_replace_brick_ glusterd_replace_brick_t; struct glusterd_volinfo_ { - char volname[GLUSTERD_MAX_VOLUME_NAME]; - int type; - int brick_count; - struct list_head vol_list; - struct list_head bricks; - glusterd_volume_status status; - int sub_count; /* backward compatibility */ - int stripe_count; - int replica_count; - int subvol_count; /* Number of subvolumes in a + gf_lock_t lock; + char volname[GLUSTERD_MAX_VOLUME_NAME]; + gf_boolean_t is_snap_volume; + glusterd_snap_t *snapshot; + gf_boolean_t is_volume_restored; + char parent_volname[GLUSTERD_MAX_VOLUME_NAME]; + /* In case of a snap volume + i.e (is_snap_volume == TRUE) this + field will contain the name of + the volume which is snapped. In + case of a non-snap volume, this + field will be initialized as N/A */ + int type; + int brick_count; + uint64_t snap_count; + uint64_t snap_max_hard_limit; + struct list_head vol_list; + /* In case of a snap volume + i.e (is_snap_volume == TRUE) this + is linked to glusterd_snap_t->volumes. + In case of a non-snap volume, this is + linked to glusterd_conf_t->volumes */ + struct list_head snapvol_list; + /* This is a current pointer for + glusterd_volinfo_t->snap_volumes */ + struct list_head bricks; + struct list_head snap_volumes; + /* TODO : Need to remove this, as this + * is already part of snapshot object. + */ + glusterd_volume_status status; + int sub_count; /* backward compatibility */ + int stripe_count; + int replica_count; + int subvol_count; /* Number of subvolumes in a distribute volume */ - int dist_leaf_count; /* Number of bricks in one + int dist_leaf_count; /* Number of bricks in one distribute subvolume */ - int port; - glusterd_store_handle_t *shandle; - glusterd_store_handle_t *rb_shandle; - glusterd_store_handle_t *node_state_shandle; + int port; + gf_store_handle_t *shandle; + gf_store_handle_t *rb_shandle; + gf_store_handle_t *node_state_shandle; /* Defrag/rebalance related */ - glusterd_rebalance_t rebal; + glusterd_rebalance_t rebal; /* Replace brick status */ - glusterd_replace_brick_t rep_brick; + glusterd_replace_brick_t rep_brick; - int version; - uint32_t cksum; - gf_transport_type transport_type; - gf_transport_type nfs_transport_type; + int version; + uint32_t cksum; + gf_transport_type transport_type; + gf_transport_type nfs_transport_type; - dict_t *dict; + dict_t *dict; - uuid_t volume_id; - auth_t auth; - char *logdir; + uuid_t volume_id; + auth_t auth; + char *logdir; - dict_t *gsync_slaves; + dict_t *gsync_slaves; - int decommission_in_progress; - xlator_t *xl; + int decommission_in_progress; + xlator_t *xl; - gf_boolean_t memory_accounting; - glusterd_vol_backend_t backend; + gf_boolean_t memory_accounting; + int caps; /* Capability */ - int op_version; - int client_op_version; + int op_version; + int client_op_version; }; +typedef enum gd_snap_status_ { + GD_SNAP_STATUS_NONE, + GD_SNAP_STATUS_INIT, + GD_SNAP_STATUS_IN_USE, + GD_SNAP_STATUS_DECOMMISSION, + GD_SNAP_STATUS_RESTORED, +} gd_snap_status_t; + +struct glusterd_snap_ { + gf_lock_t lock; + struct list_head volumes; + struct list_head snap_list; + char snapname[GLUSTERD_MAX_SNAP_NAME]; + uuid_t snap_id; + char *description; + time_t time_stamp; + gf_boolean_t snap_restored; + gd_snap_status_t snap_status; + gf_store_handle_t *shandle; +}; + +typedef struct glusterd_snap_op_ { + int32_t brick_num; + char *brick_path; + int32_t op; + int32_t status; + struct list_head snap_ops_list; +} glusterd_snap_op_t; + +typedef struct glusterd_missed_snap_ { + char *node_snap_info; + struct list_head missed_snaps; + struct list_head snap_ops; +} glusterd_missed_snap_info; + typedef enum gd_node_type_ { GD_NODE_NONE, GD_NODE_BRICK, @@ -314,6 +397,12 @@ typedef enum gd_node_type_ { GD_NODE_NFS, } gd_node_type; +typedef enum missed_snap_stat { + GD_MISSED_SNAP_NONE, + GD_MISSED_SNAP_PENDING, + GD_MISSED_SNAP_DONE, +} missed_snap_stat; + typedef struct glusterd_pending_node_ { struct list_head list; void *node; @@ -321,6 +410,13 @@ typedef struct glusterd_pending_node_ { int32_t index; } glusterd_pending_node_t; +struct gsync_config_opt_vals_ { + char *op_name; + int no_of_pos_vals; + gf_boolean_t case_sensitive; + char *values[GEO_CONF_MAX_OPT_VALS]; +}; + enum glusterd_op_ret { GLUSTERD_CONNECTION_AWAITED = 100, }; @@ -338,11 +434,18 @@ enum glusterd_vol_comp_status_ { #define GLUSTERD_VOLUME_DIR_PREFIX "vols" #define GLUSTERD_PEER_DIR_PREFIX "peers" #define GLUSTERD_VOLUME_INFO_FILE "info" +#define GLUSTERD_SNAP_INFO_FILE "info" #define GLUSTERD_VOLUME_RBSTATE_FILE "rbstate" #define GLUSTERD_BRICK_INFO_DIR "bricks" #define GLUSTERD_CKSUM_FILE "cksum" #define GLUSTERD_TRASH "trash" #define GLUSTERD_NODE_STATE_FILE "node_state.info" +#define GLUSTERD_MISSED_SNAPS_LIST_FILE "missed_snaps_list" +#define GLUSTERD_VOL_SNAP_DIR_PREFIX "snaps" + +#define GLUSTERD_DEFAULT_SNAPS_BRICK_DIR "/gluster/snaps" +#define GLUSTERD_VAR_RUN_DIR "/var/run" +#define GLUSTERD_RUN_DIR "/run" /* definitions related to replace brick */ #define RB_CLIENT_MOUNTPOINT "rb_mount" @@ -355,14 +458,29 @@ enum glusterd_vol_comp_status_ { typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args); -#define GLUSTERD_GET_VOLUME_DIR(path, volinfo, priv) \ - snprintf (path, PATH_MAX, "%s/vols/%s", priv->workdir,\ - volinfo->volname); +#define GLUSTERD_GET_VOLUME_DIR(path, volinfo, priv) \ + if (volinfo->is_snap_volume) { \ + snprintf (path, PATH_MAX, "%s/snaps/%s/%s", priv->workdir, \ + volinfo->snapshot->snapname, volinfo->volname); \ + } else { \ + snprintf (path, PATH_MAX, "%s/vols/%s", priv->workdir, \ + volinfo->volname); \ + } -#define GLUSTERD_GET_BRICK_DIR(path, volinfo, priv) \ - snprintf (path, PATH_MAX, "%s/%s/%s/%s", priv->workdir,\ - GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname, \ - GLUSTERD_BRICK_INFO_DIR); +#define GLUSTERD_GET_SNAP_DIR(path, snap, priv) \ + snprintf (path, PATH_MAX, "%s/snaps/%s", priv->workdir, \ + snap->snapname); + +#define GLUSTERD_GET_BRICK_DIR(path, volinfo, priv) \ + if (volinfo->is_snap_volume) { \ + snprintf (path, PATH_MAX, "%s/snaps/%s/%s/%s", priv->workdir, \ + volinfo->snapshot->snapname, volinfo->volname, \ + GLUSTERD_BRICK_INFO_DIR); \ + } else { \ + snprintf (path, PATH_MAX, "%s/%s/%s/%s", priv->workdir, \ + GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname, \ + GLUSTERD_BRICK_INFO_DIR); \ + } #define GLUSTERD_GET_NFS_DIR(path, priv) \ snprintf (path, PATH_MAX, "%s/nfs", priv->workdir); @@ -382,7 +500,7 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args); GLUSTERD_GET_VOLUME_DIR (volpath, volinfo, priv); \ GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); \ snprintf (pidfile, PATH_MAX, "%s/run/%s-%s.pid", \ - volpath, brickinfo->hostname, exp_path); \ + volpath, brickinfo->hostname, exp_path); \ } while (0) #define GLUSTERD_GET_NFS_PIDFILE(pidfile,nfspath) { \ @@ -391,9 +509,9 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args); } #define GLUSTERD_STACK_DESTROY(frame) do {\ - frame->local = NULL; \ - STACK_DESTROY (frame->root);\ - } while (0) + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + } while (0) #define GLUSTERD_GET_DEFRAG_DIR(path, volinfo, priv) do { \ char vol_path[PATH_MAX]; \ @@ -415,6 +533,19 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args); uuid_utoa(MY_UUID)); \ } while (0) +#define GLUSTERD_GET_UUID_NOHYPHEN(ret_string, uuid) do { \ + char *snap_volname_ptr = ret_string; \ + char *snap_volid_ptr = uuid_utoa(uuid); \ + while (*snap_volid_ptr) { \ + if (*snap_volid_ptr == '-') { \ + snap_volid_ptr++; \ + } else { \ + (*snap_volname_ptr++) = \ + (*snap_volid_ptr++); \ + } \ + } \ + *snap_volname_ptr = '\0'; \ + } while (0) int glusterd_uuid_init(); @@ -425,11 +556,11 @@ int glusterd_uuid_generate_save (); static inline unsigned char * __glusterd_uuid() { - glusterd_conf_t *priv = THIS->private; + glusterd_conf_t *priv = THIS->private; - if (uuid_is_null (priv->uuid)) - glusterd_uuid_init(); - return &priv->uuid[0]; + if (uuid_is_null (priv->uuid)) + glusterd_uuid_init(); + return &priv->uuid[0]; } int glusterd_big_locked_notify (struct rpc_clnt *rpc, void *mydata, @@ -446,7 +577,8 @@ int32_t glusterd_brick_from_brickinfo (glusterd_brickinfo_t *brickinfo, char **new_brick); int -glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port); +glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port, + dict_t *dict); int glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *myhostname, @@ -476,6 +608,14 @@ int glusterd_op_unlock_send_resp (rpcsvc_request_t *req, int32_t status); int +glusterd_op_mgmt_v3_lock_send_resp (rpcsvc_request_t *req, + uuid_t *txn_id, int32_t status); + +int +glusterd_op_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req, + uuid_t *txn_id, int32_t status); + +int glusterd_op_stage_send_resp (rpcsvc_request_t *req, int32_t op, int32_t status, char *op_errstr, dict_t *rsp_dict); @@ -520,7 +660,7 @@ glusterd_handle_defrag_volume_v2 (rpcsvc_request_t *req); int glusterd_xfer_cli_probe_resp (rpcsvc_request_t *req, int32_t op_ret, int32_t op_errno, char *op_errstr, char *hostname, - int port); + int port, dict_t *dict); int glusterd_op_commit_send_resp (rpcsvc_request_t *req, @@ -532,7 +672,7 @@ glusterd_xfer_friend_remove_resp (rpcsvc_request_t *req, char *hostname, int por int glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port, - uuid_t uuid); + uuid_t uuid, dict_t *dict); int glusterd_handle_cli_deprobe (rpcsvc_request_t *req); @@ -607,6 +747,12 @@ int glusterd_handle_reset_volume (rpcsvc_request_t *req); int +glusterd_handle_copy_file (rpcsvc_request_t *req); + +int +glusterd_handle_sys_exec (rpcsvc_request_t *req); + +int glusterd_handle_gsync_set (rpcsvc_request_t *req); int @@ -618,7 +764,7 @@ glusterd_handle_fsm_log (rpcsvc_request_t *req); int glusterd_xfer_cli_deprobe_resp (rpcsvc_request_t *req, int32_t op_ret, int32_t op_errno, char *op_errstr, - char *hostname); + char *hostname, dict_t *dict); int glusterd_fetchspec_notify (xlator_t *this); @@ -685,11 +831,20 @@ int glusterd_handle_cli_heal_volume (rpcsvc_request_t *req); int glusterd_handle_cli_list_volume (rpcsvc_request_t *req); +int +glusterd_handle_snapshot (rpcsvc_request_t *req); + /* op-sm functions */ int glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr); int glusterd_op_heal_volume (dict_t *dict, char **op_errstr); int glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr); int glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict); +int glusterd_op_stage_copy_file (dict_t *dict, char **op_errstr); +int glusterd_op_copy_file (dict_t *dict, char **op_errstr); +int glusterd_op_stage_sys_exec (dict_t *dict, char **op_errstr); +int glusterd_op_sys_exec (dict_t *dict, char **op_errstr, dict_t *rsp_dict); +int glusterd_op_stage_gsync_create (dict_t *dict, char **op_errstr); +int glusterd_op_gsync_create (dict_t *dict, char **op_errstr, dict_t *rsp_dict); int glusterd_op_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict); int glusterd_op_stage_quota (dict_t *dict, char **op_errstr); int glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr, @@ -720,7 +875,6 @@ int glusterd_op_statedump_volume (dict_t *dict, char **op_errstr); int glusterd_op_stage_clearlocks_volume (dict_t *dict, char **op_errstr); int glusterd_op_clearlocks_volume (dict_t *dict, char **op_errstr, dict_t *rsp_dict); -int glusterd_op_stage_bd (dict_t *dict, char **op_errstr); /* misc */ void glusterd_do_replace_brick (void *data); @@ -731,10 +885,68 @@ int glusterd_op_statedump_volume_args_get (dict_t *dict, char **volname, char **options, int *option_cnt); int glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr, - char **master, char **slave); + char **master, char **slave, char **host_uuid); +int glusterd_stop_volume (glusterd_volinfo_t *volinfo); + /* Synctask part */ int32_t glusterd_op_begin_synctask (rpcsvc_request_t *req, glusterd_op_t op, void *dict); int32_t glusterd_defrag_event_notify_handle (dict_t *dict); + +int32_t +glusterd_txn_opinfo_dict_init (); + +void +glusterd_txn_opinfo_dict_fini (); + +void +glusterd_txn_opinfo_init (); + +/* snapshot */ +glusterd_snap_t* +glusterd_new_snap_object(); + +int32_t +glusterd_list_add_snapvol (glusterd_volinfo_t *origin_vol, + glusterd_volinfo_t *snap_vol); + +glusterd_snap_t* +glusterd_remove_snap_by_id (uuid_t snap_id); + +glusterd_snap_t* +glusterd_remove_snap_by_name (char *snap_name); + +glusterd_snap_t* +glusterd_find_snap_by_name (char *snap_name); + +glusterd_snap_t* +glusterd_find_snap_by_id (uuid_t snap_id); + +int +glusterd_snapshot_prevalidate (dict_t *dict, char **op_errstr, + dict_t *rsp_dict); +int +glusterd_snapshot_brickop (dict_t *dict, char **op_errstr, dict_t *rsp_dict); +int +glusterd_snapshot (dict_t *dict, char **op_errstr, dict_t *rsp_dict); +int +glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr, + dict_t *rsp_dict); +char * +glusterd_build_snap_device_path (char *device, char *snapname); +int32_t +glusterd_snap_remove (dict_t *rsp_dict, glusterd_snap_t *snap, + gf_boolean_t remove_lvm, gf_boolean_t force); +int32_t +glusterd_snapshot_cleanup (dict_t *dict, char **op_errstr, dict_t *rsp_dict); + +int32_t +glusterd_add_missed_snaps_to_list (dict_t *dict, int32_t missed_snap_count); + +int32_t +glusterd_store_missed_snaps_list (char *missed_info, int32_t brick_num, + char *brick_path, int32_t snap_op, + int32_t snap_status); + #endif diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c index 43c98a23f..6a5587c2d 100644 --- a/xlators/mount/fuse/src/fuse-bridge.c +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -11,6 +11,8 @@ #include <sys/wait.h> #include "fuse-bridge.h" #include "mount-gluster-compat.h" +#include "glusterfs.h" +#include "glusterfs-acl.h" #ifdef __NetBSD__ #undef open /* in perfuse.h, pulled from mount-gluster-compat.h */ @@ -51,6 +53,46 @@ fuse_invalidate(xlator_t *this, inode_t *inode) return 0; } +static int32_t +fuse_forget_cbk (xlator_t *this, inode_t *inode) +{ + //Nothing to free in inode ctx, hence return. + return 0; +} + +void +fuse_inode_set_need_lookup (inode_t *inode, xlator_t *this) +{ + uint64_t need_lookup = 1; + + if (!inode || !this) + return; + + inode_ctx_set (inode, this, &need_lookup); + + return; +} + + +gf_boolean_t +fuse_inode_needs_lookup (inode_t *inode, xlator_t *this) +{ + uint64_t need_lookup = 0; + gf_boolean_t ret = _gf_false; + + if (!inode || !this) + return ret; + + inode_ctx_get (inode, this, &need_lookup); + if (need_lookup) + ret = _gf_true; + need_lookup = 0; + inode_ctx_set (inode, this, &need_lookup); + + return ret; +} + + fuse_fd_ctx_t * __fuse_fd_ctx_check_n_create (xlator_t *this, fd_t *fd) { @@ -371,7 +413,7 @@ fuse_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == 0) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": %s() %s => %"PRId64, + "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique, gf_fop_list[frame->root->op], state->loc.path, buf->ia_ino); @@ -463,6 +505,13 @@ fuse_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1 && state->is_revalidate == 1) { itable = state->itable; + /* + * A stale mapping might exist for a dentry/inode that has been + * removed from another client. + */ + if (op_errno == ENOENT) + inode_unlink(state->loc.inode, state->loc.parent, + state->loc.name); inode_unref (state->loc.inode); state->loc.inode = inode_new (itable); state->is_revalidate = 2; @@ -522,8 +571,8 @@ fuse_lookup_resume (fuse_state_t *state) static void fuse_lookup (xlator_t *this, fuse_in_header_t *finh, void *msg) { - char *name = msg; - fuse_state_t *state = NULL; + char *name = msg; + fuse_state_t *state = NULL; GET_STATE (this, finh, state); @@ -531,15 +580,27 @@ fuse_lookup (xlator_t *this, fuse_in_header_t *finh, void *msg) finh->nodeid, name); fuse_resolve_and_resume (state, fuse_lookup_resume); + + return; +} + +static inline void +do_forget(xlator_t *this, uint64_t unique, uint64_t nodeid, uint64_t nlookup) +{ + inode_t *fuse_inode = fuse_ino_to_inode(nodeid, this); + + fuse_log_eh(this, "%"PRIu64": FORGET %"PRIu64"/%"PRIu64" gfid: (%s)", + unique, nodeid, nlookup, uuid_utoa(fuse_inode->gfid)); + + inode_forget(fuse_inode, nlookup); + inode_unref(fuse_inode); } static void fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg) { - struct fuse_forget_in *ffi = msg; - - inode_t *fuse_inode; + struct fuse_forget_in *ffi = msg; if (finh->nodeid == 1) { GF_FREE (finh); @@ -550,16 +611,29 @@ fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg) "%"PRIu64": FORGET %"PRIu64"/%"PRIu64, finh->unique, finh->nodeid, ffi->nlookup); - fuse_inode = fuse_ino_to_inode (finh->nodeid, this); + do_forget(this, finh->unique, finh->nodeid, ffi->nlookup); - fuse_log_eh (this, "%"PRIu64": FORGET %"PRIu64"/%"PRIu64" gfid: (%s)", - finh->unique, finh->nodeid, ffi->nlookup, - uuid_utoa (fuse_inode->gfid)); + GF_FREE (finh); +} - inode_forget (fuse_inode, ffi->nlookup); - inode_unref (fuse_inode); +static void +fuse_batch_forget(xlator_t *this, fuse_in_header_t *finh, void *msg) +{ + struct fuse_batch_forget_in *fbfi = msg; + struct fuse_forget_one *ffo = (struct fuse_forget_one *) (fbfi + 1); + int i; - GF_FREE (finh); + gf_log("glusterfs-fuse", GF_LOG_TRACE, + "%"PRIu64": BATCH_FORGET %"PRIu64"/%"PRIu32, + finh->unique, finh->nodeid, fbfi->count); + + for (i = 0; i < fbfi->count; i++) { + if (ffo[i].nodeid == 1) + continue; + do_forget(this, finh->unique, ffo[i].nodeid, ffo[i].nlookup); + } + + GF_FREE(finh); } static int @@ -580,7 +654,7 @@ fuse_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == 0) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": %s() %s => %"PRId64, frame->root->unique, + "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique, gf_fop_list[frame->root->op], state->loc.path ? state->loc.path : "ERR", prebuf->ia_ino); @@ -635,7 +709,7 @@ fuse_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, state->loc.inode ? uuid_utoa (state->loc.inode->gfid) : ""); if (op_ret == 0) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": %s() %s => %"PRId64, frame->root->unique, + "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique, gf_fop_list[frame->root->op], state->loc.path ? state->loc.path : "ERR", buf->ia_ino); @@ -920,7 +994,7 @@ fuse_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == 0) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": %s() %s => %"PRId64, frame->root->unique, + "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique, gf_fop_list[frame->root->op], state->loc.path ? state->loc.path : "ERR", statpost->ia_ino); @@ -1318,7 +1392,7 @@ fuse_mknod_resume (fuse_state_t *state) { if (!state->loc.parent) { gf_log ("glusterfs-fuse", GF_LOG_ERROR, - "MKNOD %"PRId64"/%s (%s/%s) resolution failed", + "MKNOD %"PRIu64"/%s (%s/%s) resolution failed", state->finh->nodeid, state->resolve.bname, uuid_utoa (state->resolve.gfid), state->resolve.bname); send_fuse_err (state->this, state->finh, ENOENT); @@ -1388,7 +1462,7 @@ fuse_mkdir_resume (fuse_state_t *state) { if (!state->loc.parent) { gf_log ("glusterfs-fuse", GF_LOG_ERROR, - "MKDIR %"PRId64" (%s/%s) resolution failed", + "MKDIR %"PRIu64" (%s/%s) resolution failed", state->finh->nodeid, uuid_utoa (state->resolve.gfid), state->resolve.bname); send_fuse_err (state->this, state->finh, ENOENT); @@ -1450,7 +1524,7 @@ fuse_unlink_resume (fuse_state_t *state) { if (!state->loc.parent || !state->loc.inode) { gf_log ("glusterfs-fuse", GF_LOG_ERROR, - "UNLINK %"PRId64" (%s/%s) resolution failed", + "UNLINK %"PRIu64" (%s/%s) resolution failed", state->finh->nodeid, uuid_utoa (state->resolve.gfid), state->resolve.bname); send_fuse_err (state->this, state->finh, ENOENT); @@ -1486,7 +1560,7 @@ fuse_rmdir_resume (fuse_state_t *state) { if (!state->loc.parent || !state->loc.inode) { gf_log ("glusterfs-fuse", GF_LOG_ERROR, - "RMDIR %"PRId64" (%s/%s) resolution failed", + "RMDIR %"PRIu64" (%s/%s) resolution failed", state->finh->nodeid, uuid_utoa (state->resolve.gfid), state->resolve.bname); send_fuse_err (state->this, state->finh, ENOENT); @@ -1522,7 +1596,7 @@ fuse_symlink_resume (fuse_state_t *state) { if (!state->loc.parent) { gf_log ("glusterfs-fuse", GF_LOG_ERROR, - "SYMLINK %"PRId64" (%s/%s) -> %s resolution failed", + "SYMLINK %"PRIu64" (%s/%s) -> %s resolution failed", state->finh->nodeid, uuid_utoa (state->resolve.gfid), state->resolve.bname, state->name); send_fuse_err (state->this, state->finh, ENOENT); @@ -1595,7 +1669,7 @@ fuse_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == 0) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": %s -> %s => 0 (buf->ia_ino=%"PRId64")", + "%"PRIu64": %s -> %s => 0 (buf->ia_ino=%"PRIu64")", frame->root->unique, state->loc.path, state->loc2.path, buf->ia_ino); @@ -1770,7 +1844,7 @@ fuse_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, foo.open_flags |= FOPEN_DIRECT_IO; gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": %s() %s => %p (ino=%"PRId64")", + "%"PRIu64": %s() %s => %p (ino=%"PRIu64")", frame->root->unique, gf_fop_list[frame->root->op], state->loc.path, fd, buf->ia_ino); @@ -2029,7 +2103,7 @@ fuse_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": READ => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64, + "%"PRIu64": READ => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRIu64, frame->root->unique, op_ret, state->size, state->off, stbuf->ia_size); @@ -2115,7 +2189,7 @@ fuse_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret >= 0) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": WRITE => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64, + "%"PRIu64": WRITE => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRIu64, frame->root->unique, op_ret, state->size, state->off, stbuf->ia_size); @@ -2157,7 +2231,7 @@ fuse_write_resume (fuse_state_t *state) iobref_add (iobref, iobuf); gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": WRITE (%p, size=%"PRId64", offset=%"PRId64")", + "%"PRIu64": WRITE (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")", state->finh->unique, state->fd, state->size, state->off); FUSE_FOP (state, fuse_writev_cbk, GF_FOP_WRITE, writev, state->fd, @@ -2495,7 +2569,7 @@ void fuse_readdir_resume (fuse_state_t *state) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": READDIR (%p, size=%zu, offset=%"PRId64")", + "%"PRIu64": READDIR (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")", state->finh->unique, state->fd, state->size, state->off); FUSE_FOP (state, fuse_readdir_cbk, GF_FOP_READDIR, @@ -2600,6 +2674,8 @@ fuse_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, feo->nodeid = inode_to_fuse_nodeid (linked_inode); + fuse_inode_set_need_lookup (linked_inode, this); + inode_unref (linked_inode); feo->entry_valid = @@ -2626,7 +2702,7 @@ void fuse_readdirp_resume (fuse_state_t *state) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, - "%"PRIu64": READDIRP (%p, size=%zu, offset=%"PRId64")", + "%"PRIu64": READDIRP (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")", state->finh->unique, state->fd, state->size, state->off); FUSE_FOP (state, fuse_readdirp_cbk, GF_FOP_READDIRP, @@ -2653,6 +2729,47 @@ fuse_readdirp (xlator_t *this, fuse_in_header_t *finh, void *msg) fuse_resolve_and_resume (state, fuse_readdirp_resume); } +static int +fuse_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return fuse_err_cbk(frame, cookie, this, op_ret, op_errno, xdata); +} + +static void +fuse_fallocate_resume(fuse_state_t *state) +{ + gf_log("glusterfs-fuse", GF_LOG_TRACE, + "%"PRIu64": FALLOCATE (%p, flags=%d, size=%zu, offset=%"PRId64")", + state->finh->unique, state->fd, state->flags, state->size, + state->off); + + if (state->flags & FALLOC_FL_PUNCH_HOLE) + FUSE_FOP(state, fuse_fallocate_cbk, GF_FOP_DISCARD, discard, + state->fd, state->off, state->size, state->xdata); + else + FUSE_FOP(state, fuse_fallocate_cbk, GF_FOP_FALLOCATE, fallocate, + state->fd, (state->flags & FALLOC_FL_KEEP_SIZE), + state->off, state->size, state->xdata); +} + +static void +fuse_fallocate(xlator_t *this, fuse_in_header_t *finh, void *msg) +{ + struct fuse_fallocate_in *ffi = msg; + fuse_state_t *state = NULL; + + GET_STATE(this, finh, state); + state->off = ffi->offset; + state->size = ffi->length; + state->flags = ffi->mode; + state->fd = FH_TO_FD(ffi->fh); + + fuse_resolve_fd_init(state, &state->resolve, state->fd); + fuse_resolve_and_resume(state, fuse_fallocate_resume); +} + static void fuse_releasedir (xlator_t *this, fuse_in_header_t *finh, void *msg) @@ -2895,8 +3012,8 @@ fuse_setxattr (xlator_t *this, fuse_in_header_t *finh, void *msg) } if (!priv->acl) { - if ((strcmp (name, "system.posix_acl_access") == 0) || - (strcmp (name, "system.posix_acl_default") == 0)) { + if ((strcmp (name, POSIX_ACL_ACCESS_XATTR) == 0) || + (strcmp (name, POSIX_ACL_DEFAULT_XATTR) == 0)) { send_fuse_err (this, finh, EOPNOTSUPP); GF_FREE (finh); return; @@ -3120,6 +3237,8 @@ out: void fuse_getxattr_resume (fuse_state_t *state) { + char *value = NULL; + if (!state->loc.inode) { gf_log ("glusterfs-fuse", GF_LOG_WARNING, "%"PRIu64": GETXATTR %s/%"PRIu64" (%s) " @@ -3137,6 +3256,46 @@ fuse_getxattr_resume (fuse_state_t *state) state->fd = fd_lookup (state->loc.inode, state->finh->pid); #endif /* GF_TEST_FFOP */ + if (state->name && + (strcmp (state->name, VIRTUAL_GFID_XATTR_KEY) == 0)) { + /* send glusterfs gfid in binary form */ + + value = GF_CALLOC (16 + 1, sizeof(char), + gf_common_mt_char); + if (!value) { + send_fuse_err (state->this, state->finh, ENOMEM); + goto internal_out; + } + memcpy (value, state->loc.inode->gfid, 16); + + send_fuse_xattr (THIS, state->finh, value, 16, state->size); + GF_FREE (value); + internal_out: + free_fuse_state (state); + return; + } + + if (state->name && + (strcmp (state->name, VIRTUAL_GFID_XATTR_KEY_STR) == 0)) { + /* transform binary gfid to canonical form */ + + value = GF_CALLOC (UUID_CANONICAL_FORM_LEN + 1, sizeof(char), + gf_common_mt_char); + if (!value) { + send_fuse_err (state->this, state->finh, ENOMEM); + goto internal_out1; + } + uuid_utoa_r (state->loc.inode->gfid, value); + + send_fuse_xattr (THIS, state->finh, value, + UUID_CANONICAL_FORM_LEN, state->size); + GF_FREE (value); + internal_out1: + free_fuse_state (state); + return; + } + + if (state->fd) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, "%"PRIu64": GETXATTR %p/%"PRIu64" (%s)", state->finh->unique, @@ -3158,15 +3317,16 @@ fuse_getxattr_resume (fuse_state_t *state) static void fuse_getxattr (xlator_t *this, fuse_in_header_t *finh, void *msg) { - struct fuse_getxattr_in *fgxi = msg; - char *name = (char *)(fgxi + 1); - - fuse_state_t *state = NULL; - struct fuse_private *priv = NULL; - int rv = 0; - char *newkey = NULL; + struct fuse_getxattr_in *fgxi = msg; + char *name = (char *)(fgxi + 1); + fuse_state_t *state = NULL; + struct fuse_private *priv = NULL; + int rv = 0; + int op_errno = EINVAL; + char *newkey = NULL; priv = this->private; + GET_STATE (this, finh, state); #ifdef GF_DARWIN_HOST_OS if (fgxi->position) { @@ -3182,26 +3342,23 @@ fuse_getxattr (xlator_t *this, fuse_in_header_t *finh, void *msg) "%"PRIu64": GETXATTR %s/%"PRIu64" (%s):" "refusing positioned getxattr", finh->unique, state->loc.path, finh->nodeid, name); - send_fuse_err (this, finh, EINVAL); - FREE (finh); - return; + op_errno = EINVAL; + goto err; } #endif if (!priv->acl) { - if ((strcmp (name, "system.posix_acl_access") == 0) || - (strcmp (name, "system.posix_acl_default") == 0)) { - send_fuse_err (this, finh, ENOTSUP); - GF_FREE (finh); - return; + if ((strcmp (name, POSIX_ACL_ACCESS_XATTR) == 0) || + (strcmp (name, POSIX_ACL_DEFAULT_XATTR) == 0)) { + op_errno = ENOTSUP; + goto err; } } if (!priv->selinux) { if (strncmp (name, "security.", 9) == 0) { - send_fuse_err (this, finh, ENODATA); - GF_FREE (finh); - return; + op_errno = ENODATA; + goto err; } } @@ -3211,16 +3368,19 @@ fuse_getxattr (xlator_t *this, fuse_in_header_t *finh, void *msg) rv = fuse_flip_xattr_ns (priv, name, &newkey); if (rv) { - send_fuse_err (this, finh, ENOMEM); - free_fuse_state (state); - goto out; + op_errno = ENOMEM; + goto err; } state->size = fgxi->size; state->name = newkey; fuse_resolve_and_resume (state, fuse_getxattr_resume); - out: + + return; + err: + send_fuse_err (this, finh, op_errno); + free_fuse_state (state); return; } @@ -3607,6 +3767,10 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg) fino.max_readahead = 1 << 17; fino.max_write = 1 << 17; fino.flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS; +#if FUSE_KERNEL_MINOR_VERSION >= 17 + if (fini->minor >= 17) + fino.flags |= FUSE_FLOCK_LOCKS; +#endif #if FUSE_KERNEL_MINOR_VERSION >= 12 if (fini->minor >= 12) { /* let fuse leave the umask processing to us, so that it does not @@ -3635,8 +3799,8 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg) } priv->revchan_in = pfd[0]; priv->revchan_out = pfd[1]; - ret = pthread_create (&messenger, NULL, notify_kernel_loop, - this); + ret = gf_thread_create (&messenger, NULL, notify_kernel_loop, + this); if (ret != 0) { gf_log ("glusterfs-fuse", GF_LOG_ERROR, "failed to start messenger daemon (%s)", @@ -3668,8 +3832,44 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg) if (fini->minor < 9) *priv->msg0_len_p = sizeof(*finh) + FUSE_COMPAT_WRITE_IN_SIZE; #endif - if (fini->flags & FUSE_DO_READDIRPLUS) - fino.flags |= FUSE_DO_READDIRPLUS; + if (priv->use_readdirp) { + if (fini->flags & FUSE_DO_READDIRPLUS) + fino.flags |= FUSE_DO_READDIRPLUS; + } + + if (priv->fopen_keep_cache == 2) { + /* If user did not explicitly set --fopen-keep-cache[=off], + then check if kernel support FUSE_AUTO_INVAL_DATA and ... + */ + if (fini->flags & FUSE_AUTO_INVAL_DATA) { + /* ... enable fopen_keep_cache mode if supported. + */ + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "Detected " + "support for FUSE_AUTO_INVAL_DATA. Enabling " + "fopen_keep_cache automatically."); + fino.flags |= FUSE_AUTO_INVAL_DATA; + priv->fopen_keep_cache = 1; + } else { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "No support " + "for FUSE_AUTO_INVAL_DATA. Disabling " + "fopen_keep_cache."); + /* ... else disable. */ + priv->fopen_keep_cache = 0; + } + } else if (priv->fopen_keep_cache == 1) { + /* If user explicitly set --fopen-keep-cache[=on], + then enable FUSE_AUTO_INVAL_DATA if possible. + */ + if (fini->flags & FUSE_AUTO_INVAL_DATA) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "fopen_keep_cache " + "is explicitly set. Enabling FUSE_AUTO_INVAL_DATA"); + fino.flags |= FUSE_AUTO_INVAL_DATA; + } else { + gf_log ("glusterfs-fuse", GF_LOG_WARNING, "fopen_keep_cache " + "is explicitly set. Support for " + "FUSE_AUTO_INVAL_DATA is missing"); + } + } if (fini->flags & FUSE_ASYNC_DIO) fino.flags |= FUSE_ASYNC_DIO; @@ -3908,6 +4108,7 @@ fuse_migrate_fd_open (xlator_t *this, fd_t *basefd, fd_t *oldfd, goto out; } + newfd->flags = basefd->flags; if (newfd->lk_ctx) fd_lk_ctx_unref (newfd->lk_ctx); @@ -4644,6 +4845,7 @@ fuse_priv_dump (xlator_t *this) (int)private->strict_volfile_check); gf_proc_dump_write("reverse_thread_started", "%d", (int)private->reverse_fuse_thread_started); + gf_proc_dump_write("use_readdirp", "%d", private->use_readdirp); return 0; } @@ -4772,8 +4974,8 @@ notify (xlator_t *this, int32_t event, void *data, ...) if (!private->fuse_thread_started) { private->fuse_thread_started = 1; - ret = pthread_create (&private->fuse_thread, NULL, - fuse_thread_proc, this); + ret = gf_thread_create (&private->fuse_thread, NULL, + fuse_thread_proc, this); if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "pthread_create() failed (%s)", @@ -4860,8 +5062,8 @@ static fuse_handler_t *fuse_std_ops[FUSE_OP_HIGH] = { /* [FUSE_IOCTL] */ /* [FUSE_POLL] */ /* [FUSE_NOTIFY_REPLY] */ - /* [FUSE_BATCH_FORGET] */ - /* [FUSE_FALLOCATE] */ + [FUSE_BATCH_FORGET]= fuse_batch_forget, + [FUSE_FALLOCATE] = fuse_fallocate, [FUSE_READDIRPLUS] = fuse_readdirp, }; @@ -4913,6 +5115,7 @@ init (xlator_t *this_xl) int fsname_allocated = 0; glusterfs_ctx_t *ctx = NULL; gf_boolean_t sync_to_mount = _gf_false; + gf_boolean_t fopen_keep_cache = _gf_false; unsigned long mntflags = 0; char *mnt_args = NULL; eh_t *event = NULL; @@ -5031,6 +5234,8 @@ init (xlator_t *this_xl) GF_OPTION_INIT ("enable-ino32", priv->enable_ino32, bool, cleanup_exit); + GF_OPTION_INIT ("use-readdirp", priv->use_readdirp, bool, cleanup_exit); + priv->fuse_dump_fd = -1; ret = dict_get_str (options, "dump-fuse", &value_string); if (ret == 0) { @@ -5056,8 +5261,12 @@ init (xlator_t *this_xl) GF_ASSERT (ret == 0); } - GF_OPTION_INIT("fopen-keep-cache", priv->fopen_keep_cache, bool, - cleanup_exit); + priv->fopen_keep_cache = 2; + if (dict_get (options, "fopen-keep-cache")) { + GF_OPTION_INIT("fopen-keep-cache", fopen_keep_cache, bool, + cleanup_exit); + priv->fopen_keep_cache = fopen_keep_cache; + } GF_OPTION_INIT("gid-timeout", priv->gid_cache_timeout, int32, cleanup_exit); @@ -5146,7 +5355,7 @@ init (xlator_t *this_xl) if (priv->fd == -1) goto cleanup_exit; - event = eh_new (FUSE_EVENT_HISTORY_SIZE, _gf_false); + event = eh_new (FUSE_EVENT_HISTORY_SIZE, _gf_false, NULL); if (!event) { gf_log (this_xl->name, GF_LOG_ERROR, "could not create a new event history"); @@ -5228,6 +5437,7 @@ struct xlator_fops fops; struct xlator_cbks cbks = { .invalidate = fuse_invalidate, + .forget = fuse_forget_cbk, }; @@ -5310,5 +5520,9 @@ struct volume_options options[] = { { .key = {"fuse-mountopts"}, .type = GF_OPTION_TYPE_STR }, + { .key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "yes" + }, { .key = {NULL} }, }; diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h index d90b85e72..34794b6ea 100644 --- a/xlators/mount/fuse/src/fuse-bridge.h +++ b/xlators/mount/fuse/src/fuse-bridge.h @@ -101,7 +101,7 @@ struct fuse_private { gf_boolean_t acl; gf_boolean_t selinux; gf_boolean_t read_only; - gf_boolean_t fopen_keep_cache; + int32_t fopen_keep_cache; int32_t gid_cache_timeout; gf_boolean_t enable_ino32; fdtable_t *fdtable; @@ -119,6 +119,9 @@ struct fuse_private { /* for fuse queue length and congestion threshold */ int background_qlen; int congestion_threshold; + + /* for using fuse-kernel readdirp*/ + gf_boolean_t use_readdirp; }; typedef struct fuse_private fuse_private_t; @@ -142,9 +145,10 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t; #define FUSE_FOP(state, ret, op_num, fop, args ...) \ do { \ - call_frame_t *frame = NULL; \ - xlator_t *xl = NULL; \ - int32_t op_ret = 0, op_errno = 0; \ + call_frame_t *frame = NULL; \ + xlator_t *xl = NULL; \ + int32_t op_ret = 0, op_errno = 0; \ + fuse_resolve_t *resolve = NULL; \ \ frame = get_call_frame_for_req (state); \ if (!frame) { \ @@ -171,14 +175,20 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t; frame->root->op = op_num; \ frame->op = op_num; \ \ + if ( state->resolve_now ) { \ + resolve = state->resolve_now; \ + } else { \ + resolve = &(state->resolve); \ + } \ + \ xl = state->active_subvol; \ if (!xl) { \ gf_log_callingfn ("glusterfs-fuse", GF_LOG_ERROR, \ "xl is NULL"); \ op_errno = ENOENT; \ op_ret = -1; \ - } else if (state->resolve.op_ret < 0) { \ - op_errno = state->resolve.op_errno; \ + } else if (resolve->op_ret < 0) { \ + op_errno = resolve->op_errno; \ op_ret = -1; \ if (op_num == GF_FOP_LOOKUP) { \ gf_log ("glusterfs-fuse", \ @@ -187,7 +197,7 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t; "%"PRIu64": %s() %s => -1 (%s)", \ frame->root->unique, \ gf_fop_list[frame->root->op], \ - state->resolve.resolve_loc.path, \ + resolve->resolve_loc.path, \ strerror (op_errno)); \ } else { \ gf_log ("glusterfs-fuse", \ @@ -196,7 +206,7 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t; "migration of %s failed (%s)", \ frame->root->unique, \ gf_fop_list[frame->root->op], \ - state->resolve.resolve_loc.path, \ + resolve->resolve_loc.path, \ strerror (op_errno)); \ } \ } else if (state->resolve2.op_ret < 0) { \ diff --git a/xlators/mount/fuse/src/fuse-helpers.c b/xlators/mount/fuse/src/fuse-helpers.c index d4dcc2e61..4d478b919 100644 --- a/xlators/mount/fuse/src/fuse-helpers.c +++ b/xlators/mount/fuse/src/fuse-helpers.c @@ -7,6 +7,10 @@ later), or the GNU General Public License, version 2 (GPLv2), in all cases as published by the Free Software Foundation. */ +#ifdef __NetBSD__ +#define _KMEMUSER +#endif + #include "fuse-bridge.h" #if defined(GF_SOLARIS_HOST_OS) #include <sys/procfs.h> @@ -14,9 +18,6 @@ #include <sys/sysctl.h> #endif -#ifndef GF_REQUEST_MAXGROUPS -#define GF_REQUEST_MAXGROUPS 16 -#endif /* GF_REQUEST_MAXGROUPS */ static void fuse_resolve_wipe (fuse_resolve_t *resolve) @@ -138,6 +139,7 @@ get_fuse_state (xlator_t *this, fuse_in_header_t *finh) } +#define FUSE_MAX_AUX_GROUPS 32 /* We can get only up to 32 aux groups from /proc */ void frame_fill_groups (call_frame_t *frame) { @@ -160,6 +162,9 @@ frame_fill_groups (call_frame_t *frame) if (!fp) goto out; + if (call_stack_alloc_groups (frame->root, FUSE_MAX_AUX_GROUPS) != 0) + goto out; + while ((ptr = fgets (line, sizeof line, fp))) { if (strncmp (ptr, "Groups:", 7) != 0) continue; @@ -176,7 +181,7 @@ frame_fill_groups (call_frame_t *frame) if (!endptr || *endptr) break; frame->root->groups[idx++] = id; - if (idx == GF_MAX_AUX_GROUPS) + if (idx == FUSE_MAX_AUX_GROUPS) break; } @@ -192,6 +197,7 @@ out: prcred_t *prcred = (prcred_t *) scratch; FILE *fp = NULL; int ret = 0; + int ngrps; ret = snprintf (filename, sizeof filename, "/proc/%d/cred", frame->root->pid); @@ -200,8 +206,11 @@ out: fp = fopen (filename, "r"); if (fp != NULL) { if (fgets (scratch, sizeof scratch, fp) != NULL) { - frame->root->ngrps = MIN(prcred->pr_ngroups, - GF_REQUEST_MAXGROUPS); + ngrps = MIN(prcred->pr_ngroups, + GF_MAX_AUX_GROUPS); + if (call_stack_alloc_groups (frame->root, + ngrps) != 0) + return; } fclose (fp); } @@ -226,7 +235,9 @@ out: if (sysctl(name, namelen, &kp, &kplen, NULL, 0) != 0) return; - ngroups = MIN(kp.kp_eproc.e_ucred.cr_ngroups, GF_REQUEST_MAXGROUPS); + ngroups = MIN(kp.kp_eproc.e_ucred.cr_ngroups, GF_MAX_AUX_GROUPS); + if (call_stack_alloc_groups (frame->root, ngroups) != 0) + return; for (i = 0; i < ngroups; i++) frame->root->groups[i] = kp.kp_eproc.e_ucred.cr_groups[i]; frame->root->ngrps = ngroups; @@ -257,6 +268,8 @@ static void get_groups(fuse_private_t *priv, call_frame_t *frame) gl = gid_cache_lookup(&priv->gid_cache, frame->root->pid); if (gl) { + if (call_stack_alloc_groups (frame->root, gl->gl_count) != 0) + return; frame->root->ngrps = gl->gl_count; for (i = 0; i < gl->gl_count; i++) frame->root->groups[i] = gl->gl_list[i]; diff --git a/xlators/mount/fuse/src/fuse-resolve.c b/xlators/mount/fuse/src/fuse-resolve.c index 88ce32ab9..8565ce0e4 100644 --- a/xlators/mount/fuse/src/fuse-resolve.c +++ b/xlators/mount/fuse/src/fuse-resolve.c @@ -26,6 +26,8 @@ int fuse_migrate_fd (xlator_t *this, fd_t *fd, xlator_t *old_subvol, fuse_fd_ctx_t * fuse_fd_ctx_get (xlator_t *this, fd_t *fd); +gf_boolean_t fuse_inode_needs_lookup (inode_t *inode, xlator_t *this); + static int fuse_resolve_loc_touchup (fuse_state_t *state) { @@ -201,7 +203,11 @@ fuse_resolve_gfid (fuse_state_t *state) uuid_copy (resolve_loc->gfid, resolve->gfid); } - resolve_loc->inode = inode_new (state->itable); + /* inode may already exist in case we are looking up an inode which was + linked through readdirplus */ + resolve_loc->inode = inode_find (state->itable, resolve_loc->gfid); + if (!resolve_loc->inode) + resolve_loc->inode = inode_new (state->itable); ret = loc_path (resolve_loc, NULL); if (ret <= 0) { @@ -239,6 +245,9 @@ fuse_resolve_parent_simple (fuse_state_t *state) parent = resolve->parhint; if (parent->table == state->itable) { + if (fuse_inode_needs_lookup (parent, THIS)) + return 1; + /* no graph switches since */ loc->parent = inode_ref (parent); uuid_copy (loc->pargfid, parent->gfid); @@ -265,6 +274,10 @@ fuse_resolve_parent_simple (fuse_state_t *state) /* non decisive result - parent missing */ return 1; } + if (fuse_inode_needs_lookup (parent, THIS)) { + inode_unref (parent); + return 1; + } loc->parent = parent; uuid_copy (loc->pargfid, resolve->pargfid); @@ -314,15 +327,18 @@ fuse_resolve_inode_simple (fuse_state_t *state) loc = state->loc_now; inode = resolve->hint; - if (inode->table == state->itable) { + if (inode->table == state->itable) inode_ref (inode); - goto found; + else + inode = inode_find (state->itable, resolve->gfid); + + if (inode) { + if (!fuse_inode_needs_lookup (inode, THIS)) + goto found; + /* inode was linked through readdirplus */ + inode_unref (inode); } - inode = inode_find (state->itable, resolve->gfid); - if (inode) - goto found; - return 1; found: loc->inode = inode; diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in index 3fedb8ce3..a192d6059 100755 --- a/xlators/mount/fuse/utils/mount.glusterfs.in +++ b/xlators/mount/fuse/utils/mount.glusterfs.in @@ -27,6 +27,8 @@ _init () LOG_DEBUG=DEBUG; LOG_TRACE=TRACE; + HOST_NAME_MAX=64; + prefix="@prefix@"; exec_prefix=@exec_prefix@; cmd_line=$(echo "@sbindir@/glusterfs"); @@ -51,8 +53,24 @@ _init () esac UPDATEDBCONF=/etc/updatedb.conf - LD_LIBRARY_PATH=@libdir@:${LD_LIBRARY_PATH} - export LD_LIBRARY_PATH +} + +parse_backup_volfile_servers () +{ + local server_list=$1 + local servers="" + local new_servers="" + + servers=$(echo ${server_list} | sed 's/\:/ /g') + for server in ${servers}; do + length=$(echo $server | wc -c) + if [ ${length} -gt ${HOST_NAME_MAX} ]; then + echo "Hostname:${server} provided is too long.. skipping" + continue + fi + new_servers=$(echo "$new_servers $server") + done + echo ${new_servers} } start_glusterfs () @@ -60,46 +78,46 @@ start_glusterfs () # lets the comparsion be case insensitive for all strings if [ -n "$log_level_str" ]; then - case "$( echo $log_level_str | tr '[a-z]' '[A-Z]')" in - "ERROR") - log_level=$LOG_ERROR; - ;; + case "$( echo $log_level_str | tr '[a-z]' '[A-Z]')" in + "ERROR") + log_level=$LOG_ERROR; + ;; "INFO") log_level=$LOG_INFO ;; - "DEBUG") - log_level=$LOG_DEBUG; - ;; - "CRITICAL") - log_level=$LOG_CRITICAL; - ;; - "WARNING") - log_level=$LOG_WARNING; - ;; - "TRACE") - log_level=$LOG_TRACE; - ;; - "NONE") - log_level=$LOG_NONE; - ;; - *) - echo "invalid log level $log_level_str, using INFO"; - log_level=$LOG_INFO; - ;; - esac + "DEBUG") + log_level=$LOG_DEBUG; + ;; + "CRITICAL") + log_level=$LOG_CRITICAL; + ;; + "WARNING") + log_level=$LOG_WARNING; + ;; + "TRACE") + log_level=$LOG_TRACE; + ;; + "NONE") + log_level=$LOG_NONE; + ;; + *) + echo "invalid log level $log_level_str, using INFO"; + log_level=$LOG_INFO; + ;; + esac fi #options without values start here if [ -n "$read_only" ]; then - cmd_line=$(echo "$cmd_line --read-only"); + cmd_line=$(echo "$cmd_line --read-only"); fi if [ -n "$acl" ]; then - cmd_line=$(echo "$cmd_line --acl"); + cmd_line=$(echo "$cmd_line --acl"); fi if [ -n "$selinux" ]; then - cmd_line=$(echo "$cmd_line --selinux"); + cmd_line=$(echo "$cmd_line --selinux"); fi if [ -n "$enable_ino32" ]; then @@ -111,28 +129,36 @@ start_glusterfs () fi if [ -n "$fopen_keep_cache" ]; then - cmd_line=$(echo "$cmd_line --fopen-keep-cache"); + cmd_line=$(echo "$cmd_line --fopen-keep-cache"); fi if [ -n "$volfile_check" ]; then - cmd_line=$(echo "$cmd_line --volfile-check"); + cmd_line=$(echo "$cmd_line --volfile-check"); fi if [ -n "$mem_accounting" ]; then cmd_line=$(echo "$cmd_line --mem-accounting"); fi + if [ -n "$aux_gfid_mount" ]; then + cmd_line=$(echo "$cmd_line --aux-gfid-mount"); + fi + #options with values start here if [ -n "$log_level" ]; then cmd_line=$(echo "$cmd_line --log-level=$log_level"); fi if [ -n "$log_file" ]; then - cmd_line=$(echo "$cmd_line --log-file=$log_file"); + cmd_line=$(echo "$cmd_line --log-file=$log_file"); fi if [ -n "$direct_io_mode" ]; then - cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode"); + cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode"); + fi + + if [ -n "$use_readdirp" ]; then + cmd_line=$(echo "$cmd_line --use-readdirp=$use_readdirp"); fi if [ -n "$volume_name" ]; then @@ -152,19 +178,19 @@ start_glusterfs () fi if [ -n "$gid_timeout" ]; then - cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout"); + cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout"); fi if [ -n "$bg_qlen" ]; then - cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen"); + cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen"); fi if [ -n "$cong_threshold" ]; then - cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold"); + cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold"); fi if [ -n "$fuse_mountopts" ]; then - cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts"); + cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts"); fi if [ -n "$xlator_option" ]; then @@ -178,36 +204,40 @@ start_glusterfs () if [ -z "$volfile_loc" ]; then if [ -n "$server_ip" ]; then + + cmd_line=$(echo "$cmd_line --volfile-server=$server_ip"); + + if [ -n "$backup_volfile_servers" ]; then + servers=$(parse_backup_volfile_servers ${backup_volfile_servers}) + for i in $(echo ${servers}); do + cmd_line=$(echo "$cmd_line --volfile-server=$i"); + done + fi + if [ -n "$server_port" ]; then cmd_line=$(echo "$cmd_line --volfile-server-port=$server_port"); fi - if [ -n "$transport" ]; then + + if [ -n "$transport" ]; then cmd_line=$(echo "$cmd_line --volfile-server-transport=$transport"); if [ "$transport" = "rdma" ]; then volume_id_rdma=".rdma"; fi fi + if [ -n "$volume_id" ]; then if [ -n "$volume_id_rdma" ]; then volume_id="$volume_id$volume_id_rdma"; fi cmd_line=$(echo "$cmd_line --volfile-id=$volume_id"); fi - - if [ -n "$backupvolfile_server" ]; then - cmd_line1=$(echo "$cmd_line --volfile-server=$backupvolfile_server"); - fi - if [ -n "$volfile_max_fetch_attempts" ]; then - cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts"); - fi - cmd_line=$(echo "$cmd_line --volfile-server=$server_ip"); fi else cmd_line=$(echo "$cmd_line --volfile=$volfile_loc"); fi if [ -n "$fuse_mountopts" ]; then - cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts"); + cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts"); fi cmd_line=$(echo "$cmd_line $mount_point"); @@ -222,24 +252,8 @@ start_glusterfs () inode="0"; fi - # retry the failover - # if [ $? != "0" ]; then # <--- TODO: Once glusterfs returns proper error code, change it. if [ $inode -ne 1 ]; then err=1; - if [ -n "$cmd_line1" ]; then - cmd_line1=$(echo "$cmd_line1 $mount_point"); - $cmd_line1; - err=0; - - inode=$( ${getinode} $mount_point 2>/dev/null); - # this is required if the stat returns error - if [ -z "$inode" ]; then - inode="0"; - fi - if [ $inode -ne 1 ]; then - err=1; - fi - fi fi if [ $err -eq "1" ]; then @@ -264,13 +278,13 @@ mount.glusterfs --version" # check for recursive mounts. i.e, mounting over an existing brick check_recursive_mount () { - if [ $2 = "/" ]; then + if [ $1 = "/" ]; then echo Cannot mount over root; exit 2; fi # GFID check first # remove trailing / from mount point - mnt_dir=${2%/}; + mnt_dir=${1%/}; export PATH; # check whether getfattr exists @@ -329,7 +343,6 @@ check_recursive_mount () main () { helper=$(echo "$@" | sed -n 's/.*\--[ ]*\([^ ]*\).*/\1/p'); - in_opt="no" pos_args=0 for opt in "$@"; do @@ -341,9 +354,14 @@ main () "acl") acl=1 ;; "selinux") selinux=1 ;; "worm") worm=1 ;; - "fopen-keep-cache") fopen_keep_cache=1 ;; + "fopen-keep-cache") fopen_keep_cache=1 ;; "enable-ino32") enable_ino32=1 ;; "mem-accounting") mem_accounting=1;; + "aux-gfid-mount") + if [ `uname -s` = "Linux" ]; then + aux_gfid_mount=1 + fi + ;; # "mount -t glusterfs" sends this, but it's useless. "rw") ;; # these ones are interpreted during system initialization @@ -363,19 +381,17 @@ main () "volume-id") volume_id=$value ;; "volfile-check") volfile_check=$value ;; "server-port") server_port=$value ;; - "fetch-attempts") - volfile_max_fetch_attempts=$value ;; - "backupvolfile-server") - backupvolfile_server=$value ;; "attribute-timeout") attribute_timeout=$value ;; "entry-timeout") entry_timeout=$value ;; "negative-timeout") negative_timeout=$value ;; - "gid-timeout") gid_timeout=$value ;; - "background-qlen") bg_qlen=$value ;; - "congestion-threshold") cong_threshold=$value ;; - "xlator-option") xlator_option=$xlator_option" "$pair ;; - "fuse-mountopts") fuse_mountopts=$value ;; + "gid-timeout") gid_timeout=$value ;; + "background-qlen") bg_qlen=$value ;; + "backup-volfile-servers") backup_volfile_servers=$value ;; + "congestion-threshold") cong_threshold=$value ;; + "xlator-option") xlator_option=$xlator_option" "$pair ;; + "fuse-mountopts") fuse_mountopts=$value ;; + "use-readdirp") use_readdirp=$value ;; *) # Passthru [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts," @@ -407,7 +423,7 @@ main () [ -n "$test_str" ] && { volume_id="$test_str"; } - volfile_loc=""; + volfile_loc=""; } # @@ -432,7 +448,7 @@ main () exit 0; fi - check_recursive_mount "$@"; + check_recursive_mount "$mount_point"; # Append fuse.glusterfs to PRUNEFS variable in updatedb.conf(5). updatedb(8) # should not index files under GlusterFS, indexing will slow down GlusteFS diff --git a/xlators/nfs/server/src/Makefile.am b/xlators/nfs/server/src/Makefile.am index 2795a935d..62fbf6535 100644 --- a/xlators/nfs/server/src/Makefile.am +++ b/xlators/nfs/server/src/Makefile.am @@ -14,7 +14,7 @@ noinst_HEADERS = nfs.h nfs-common.h nfs-fops.h nfs-inodes.h nfs-generics.h \ AM_CPPFLAGS = $(GF_CPPFLAGS) \ -DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" \ -I$(top_srcdir)/libglusterfs/src \ - -I$(nfsrpclibdir) -I$(CONTRIBDIR)/rbtree\ + -I$(nfsrpclibdir) -I$(CONTRIBDIR)/rbtree \ -I$(top_srcdir)/rpc/xdr/src/ AM_CFLAGS = -Wall $(GF_CFLAGS) diff --git a/xlators/nfs/server/src/acl3.c b/xlators/nfs/server/src/acl3.c index ed60775ab..08b099b4e 100644 --- a/xlators/nfs/server/src/acl3.c +++ b/xlators/nfs/server/src/acl3.c @@ -113,8 +113,10 @@ nfs3_fh_to_xlator (struct nfs3_state *nfs3, struct nfs3_fh *fh); xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \ &cst->resolvefh); \ uuid_unparse (cst->resolvefh.gfid, gfid); \ - sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\ - xlatorp ? xlatorp->name : "ERR", gfid); \ + snprintf (buf, sizeof (buf), "(%s) %s : %s", \ + trans->peerinfo.identifier, \ + xlatorp ? xlatorp->name : "ERR", \ + gfid); \ gf_log (GF_ACL, GF_LOG_ERROR, "Unable to resolve FH"\ ": %s", buf); \ nfstat = nfs3_errno_to_nfsstat3 (cst->resolve_errno);\ @@ -138,10 +140,11 @@ nfs3_fh_to_xlator (struct nfs3_state *nfs3, struct nfs3_fh *fh); int acl3svc_submit_reply (rpcsvc_request_t *req, void *arg, acl3_serializer sfunc) { - struct iovec outmsg = {0, }; - struct iobuf *iob = NULL; - struct nfs3_state *nfs3 = NULL; - int ret = -1; + struct iovec outmsg = {0, }; + struct iobuf *iob = NULL; + struct nfs3_state *nfs3 = NULL; + int ret = -1; + ssize_t msglen = 0; struct iobref *iobref = NULL; if (!req) @@ -166,7 +169,12 @@ acl3svc_submit_reply (rpcsvc_request_t *req, void *arg, acl3_serializer sfunc) /* Use the given serializer to translate the give C structure in arg * to XDR format which will be written into the buffer in outmsg. */ - outmsg.iov_len = sfunc (outmsg, arg); + msglen = sfunc (outmsg, arg); + if (msglen < 0) { + gf_log (GF_ACL, GF_LOG_ERROR, "Failed to encode message"); + goto ret; + } + outmsg.iov_len = msglen; iobref = iobref_new (); if (iobref == NULL) { @@ -174,7 +182,11 @@ acl3svc_submit_reply (rpcsvc_request_t *req, void *arg, acl3_serializer sfunc) goto ret; } - iobref_add (iobref, iob); + ret = iobref_add (iobref, iob); + if (ret) { + gf_log (GF_ACL, GF_LOG_ERROR, "Failed to add iob to iobref"); + goto ret; + } /* Then, submit the message for transmission. */ ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref); @@ -228,12 +240,15 @@ acl3_getacl_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int i = 0; getaclreply *getaclreply = NULL; + if (!frame->local) { + gf_log (GF_ACL, GF_LOG_ERROR, "Invalid argument," + " frame->local NULL"); + return EINVAL; + } cs = frame->local; - if (cs) - getaclreply = &cs->args.getaclreply; - + getaclreply = &cs->args.getaclreply; if (op_ret == -1) { - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto err; } @@ -241,9 +256,9 @@ acl3_getacl_cbk (call_frame_t *frame, void *cookie, xlator_t *this, getaclreply->daclentry.daclentry_val = cs->daclentry; /* FIXME: use posix_acl_from_xattr() */ - data = dict_get (dict, "system.posix_acl_access"); + data = dict_get (dict, POSIX_ACL_ACCESS_XATTR); if (data && (p = data_to_bin (data))) { - /* POSIX_ACL_XATTR_VERSION */ + /* POSIX_ACL_VERSION */ p++; while ((char *)p < (data->data + data->len)) { getaclreply->aclentry.aclentry_val[i].type = *(*(short **)&p)++; @@ -255,9 +270,9 @@ acl3_getacl_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } i = 0; - data = dict_get (dict, "system.posix_acl_default"); + data = dict_get (dict, POSIX_ACL_DEFAULT_XATTR); if (data && (p = data_to_bin (data))) { - /* POSIX_ACL_XATTR_VERSION */ + /* POSIX_ACL_VERSION */ p++; while ((char *)p < (data->data + data->len)) { getaclreply->daclentry.daclentry_val[i].type = *(*(short **)&p)++; @@ -291,12 +306,17 @@ acl3_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int ret = -1; nfs_user_t nfu = {0, }; + if (!frame->local) { + gf_log (GF_ACL, GF_LOG_ERROR, "Invalid argument," + " frame->local NULL"); + return EINVAL; + } + cs = frame->local; - if (cs) - getaclreply = &cs->args.getaclreply; + getaclreply = &cs->args.getaclreply; if (op_ret == -1) { - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto err; } @@ -307,7 +327,7 @@ acl3_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ret = nfs_getxattr (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, NULL, NULL, acl3_getacl_cbk, cs); if (ret == -1) { - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto err; } return 0; @@ -409,7 +429,8 @@ acl3_setacl_cbk (call_frame_t *frame, void *cookie, nfs3_call_state_t *cs = NULL; cs = frame->local; if (op_ret < 0) { - cs->args.setaclreply.status = nfs3_errno_to_nfsstat3 (op_errno); + nfsstat3 status = nfs3_cbk_errno_status (op_ret, op_errno); + cs->args.setaclreply.status = status; } acl3svc_submit_reply (cs->req, (void *)&cs->args.setaclreply, @@ -428,17 +449,16 @@ acl3_setacl_resume (void *carg) if (!carg) return ret; - cs = (nfs3_call_state_t *)carg; acl3_check_fh_resolve_status (cs, stat, acl3err); nfs_request_user_init (&nfu, cs->req); xattr = dict_new(); if (cs->aclcount) - ret = dict_set_static_bin (xattr, "system.posix_acl_access", cs->aclxattr, + ret = dict_set_static_bin (xattr, POSIX_ACL_ACCESS_XATTR, cs->aclxattr, cs->aclcount * 8 + 4); if (cs->daclcount) - ret = dict_set_static_bin (xattr, "system.posix_acl_default", cs->daclxattr, - cs->daclcount * 8 + 4); + ret = dict_set_static_bin (xattr, POSIX_ACL_DEFAULT_XATTR, + cs->daclxattr, cs->daclcount * 8 + 4); ret = nfs_setxattr (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, xattr, 0, NULL, acl3_setacl_cbk, cs); @@ -470,12 +490,24 @@ acl3svc_setacl (rpcsvc_request_t *req) struct nfs3_fh fh; struct nfs3_fh *fhp = NULL; setaclargs setaclargs; - aclentry aclentry[NFS_ACL_MAX_ENTRIES]; - struct aclentry daclentry[NFS_ACL_MAX_ENTRIES]; - int *p = NULL, i = 0; + aclentry *aclentry = NULL; + struct aclentry *daclentry = NULL; + int i = 0; + struct posix_acl_xattr_header *bufheader = NULL; + struct posix_acl_xattr_entry *bufentry = NULL; if (!req) return ret; + aclentry = GF_CALLOC (NFS_ACL_MAX_ENTRIES, sizeof(*aclentry), + gf_nfs_mt_arr); + if (!aclentry) { + goto rpcerr; + } + daclentry = GF_CALLOC (NFS_ACL_MAX_ENTRIES, sizeof(*daclentry), + gf_nfs_mt_arr); + if (!daclentry) { + goto rpcerr; + } acl3_validate_nfs3_state (req, nfs3, stat, rpcerr, ret); nfs = nfs_state (nfs3->nfsx); @@ -506,19 +538,58 @@ acl3svc_setacl (rpcsvc_request_t *req) (cs->daclcount > NFS_ACL_MAX_ENTRIES)) goto acl3err; /* FIXME: use posix_acl_to_xattr() */ - p = (int *)cs->aclxattr; - *(*(int **)&p)++ = POSIX_ACL_XATTR_VERSION; + /* Populate xattr buffer for user ACL */ + bufheader = (struct posix_acl_xattr_header *)(cs->aclxattr); + bufheader->version = htole32(POSIX_ACL_VERSION); + bufentry = bufheader->entries; for (i = 0; i < cs->aclcount; i++) { - *(*(short **)&p)++ = aclentry[i].type; - *(*(short **)&p)++ = aclentry[i].perm; - *(*(int **)&p)++ = aclentry[i].uid; + int uaceuid; + const struct aclentry *uace = &aclentry[i]; + switch (uace->type) { + case POSIX_ACL_USER: + case POSIX_ACL_GROUP: + uaceuid = uace->uid; + break; + default: + uaceuid = POSIX_ACL_UNDEFINED_ID; + break; + } + bufentry->tag = htole16(uace->type); + bufentry->perm = htole16(uace->perm); + bufentry->id = htole32(uaceuid); + + bufentry++; } - p = (int *)cs->daclxattr; - *(*(int **)&p)++ = POSIX_ACL_XATTR_VERSION; + + /* Populate xattr buffer for Default ACL */ + bufheader = (struct posix_acl_xattr_header *)(cs->aclxattr); + bufheader->version = htole32(POSIX_ACL_VERSION); + bufentry = bufheader->entries; for (i = 0; i < cs->daclcount; i++) { - *(*(short **)&p)++ = daclentry[i].type; - *(*(short **)&p)++ = daclentry[i].perm; - *(*(int **)&p)++ = daclentry[i].uid; + int daceuid; + int dacetype; + const struct aclentry *dace = &daclentry[i]; + /* + * For "default ACL", NFSv3 handles the 'type' differently + * i.e. by logical OR'ing 'type' with NFS_ACL_DEFAULT. + * Which the backend File system does not understand and + * that needs to be masked OFF. + */ + dacetype = (dace->type & ~(NFS_ACL_DEFAULT)); + switch (dacetype) { + case POSIX_ACL_USER: + case POSIX_ACL_GROUP: + daceuid = dace->uid; + break; + default: + daceuid = POSIX_ACL_UNDEFINED_ID; + break; + } + bufentry->tag = htole16(dacetype); + bufentry->perm = htole16(dace->perm); + bufentry->id = htole32(daceuid); + + bufentry++; } @@ -532,19 +603,23 @@ acl3err: acl3svc_submit_reply (cs->req, (void *)&cs->args.setaclreply, (acl3_serializer)xdr_serialize_setaclreply); nfs3_call_state_wipe (cs); + GF_FREE(aclentry); + GF_FREE(daclentry); return 0; } rpcerr: if (ret < 0) nfs3_call_state_wipe (cs); - + if (aclentry) + GF_FREE (aclentry); + if (daclentry) + GF_FREE (daclentry); return ret; } - rpcsvc_actor_t acl3svc_actors[ACL3_PROC_COUNT] = { {"NULL", ACL3_NULL, acl3svc_null, NULL, 0}, {"GETACL", ACL3_GETACL, acl3svc_getacl, NULL, 0}, @@ -554,7 +629,7 @@ rpcsvc_actor_t acl3svc_actors[ACL3_PROC_COUNT] = { rpcsvc_program_t acl3prog = { .progname = "ACL3", .prognum = ACL_PROGRAM, - .progver = ACL_V3, + .progver = ACLV3_VERSION, .progport = GF_NFS3_PORT, .actors = acl3svc_actors, .numactors = ACL3_PROC_COUNT, @@ -569,6 +644,11 @@ acl3svc_init(xlator_t *nfsx) dict_t *options = NULL; int ret = -1; char *portstr = NULL; + static gf_boolean_t acl3_inited = _gf_false; + + /* Already inited */ + if (acl3_inited) + return &acl3prog; nfs = (struct nfs_state*)nfsx->private; @@ -614,13 +694,14 @@ acl3svc_init(xlator_t *nfsx) goto err; } - rpcsvc_create_listeners (nfs->rpcsvc, options, "ACL"); + ret = rpcsvc_create_listeners (nfs->rpcsvc, options, "ACL"); if (ret == -1) { gf_log (GF_ACL, GF_LOG_ERROR, "Unable to create listeners"); dict_unref (options); goto err; } + acl3_inited = _gf_true; return &acl3prog; err: return NULL; diff --git a/xlators/nfs/server/src/acl3.h b/xlators/nfs/server/src/acl3.h index b668723c8..e0e61281a 100644 --- a/xlators/nfs/server/src/acl3.h +++ b/xlators/nfs/server/src/acl3.h @@ -11,18 +11,19 @@ #ifndef _ACL3_H #define _ACL3_H +#include "glusterfs-acl.h" + #define GF_ACL3_PORT 38469 #define GF_ACL GF_NFS"-ACL" -#define ACL_PROGRAM 100227 -#define ACL_V3 3 - -#define ACL_USER_OBJ 0x1 -#define ACL_GROUP_OBJ 0x4 -#define ACL_OTHER_OBJ 0x20 +/* + * NFSv3, identifies the default ACL by NFS_ACL_DEFAULT. Gluster + * NFS needs to mask it OFF before sending it upto POSIX layer + * or File system layer. + */ +#define NFS_ACL_DEFAULT 0x1000 -#define POSIX_ACL_XATTR_VERSION 0x0002 -#define NFS_ACL_MAX_ENTRIES 1024 +#define NFS_ACL_MAX_ENTRIES 1024 rpcsvc_program_t * acl3svc_init(xlator_t *nfsx); diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c index 39d736e7d..b0824bf10 100644 --- a/xlators/nfs/server/src/mount3.c +++ b/xlators/nfs/server/src/mount3.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -39,26 +30,71 @@ #include "nfs-mem-types.h" #include "nfs.h" #include "common-utils.h" - +#include "store.h" #include <errno.h> #include <sys/socket.h> #include <sys/uio.h> + +#define IPv4_ADDR_SIZE 32 + +/* Macro to typecast the parameter to struct sockaddr_in + */ +#define SA(addr) ((struct sockaddr_in*)(addr)) + +/* Macro will mask the ip address with netmask. + */ +#define MASKED_IP(ipv4addr, netmask) \ + (ntohl(SA(ipv4addr)->sin_addr.s_addr) & (netmask)) + +/* Macro will compare two IP address after applying the mask + */ +#define COMPARE_IPv4_ADDRS(ip1, ip2, netmask) \ + ((MASKED_IP(ip1, netmask)) == (MASKED_IP(ip2, netmask))) + +/* This macro will assist in freeing up entire link list + * of host_auth_spec structure. + */ +#define FREE_HOSTSPEC(exp) do { \ + struct host_auth_spec *host= exp->hostspec; \ + while (NULL != host){ \ + struct host_auth_spec* temp = host; \ + host = host->next; \ + if (NULL != temp->host_addr) { \ + GF_FREE (temp->host_addr); \ + } \ + GF_FREE (temp); \ + } \ + exp->hostspec = NULL; \ + } while (0) + typedef ssize_t (*mnt3_serializer) (struct iovec outmsg, void *args); extern void * mount3udp_thread (void *argv); +static inline void +mnt3_export_free (struct mnt3_export *exp) +{ + if (!exp) + return; + + if (exp->exptype == MNT3_EXPTYPE_DIR) + FREE_HOSTSPEC (exp); + GF_FREE (exp->expname); + GF_FREE (exp); +} /* Generic reply function for MOUNTv3 specific replies. */ int mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc) { - struct iovec outmsg = {0, }; - struct iobuf *iob = NULL; - struct mount3_state *ms = NULL; - int ret = -1; + struct iovec outmsg = {0, }; + struct iobuf *iob = NULL; + struct mount3_state *ms = NULL; + int ret = -1; + ssize_t msglen = 0; struct iobref *iobref = NULL; if (!req) @@ -84,7 +120,12 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc) /* Use the given serializer to translate the give C structure in arg * to XDR format which will be written into the buffer in outmsg. */ - outmsg.iov_len = sfunc (outmsg, arg); + msglen = sfunc (outmsg, arg); + if (msglen < 0) { + gf_log (GF_MNT, GF_LOG_ERROR, "Failed to encode message"); + goto ret; + } + outmsg.iov_len = msglen; iobref = iobref_new (); if (iobref == NULL) { @@ -92,12 +133,14 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc) goto ret; } - iobref_add (iobref, iob); + ret = iobref_add (iobref, iob); + if (ret) { + gf_log (GF_MNT, GF_LOG_ERROR, "Failed to add iob to iobref"); + goto ret; + } /* Then, submit the message for transmission. */ ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref); - iobuf_unref (iob); - iobref_unref (iobref); if (ret == -1) { gf_log (GF_MNT, GF_LOG_ERROR, "Reply submission failed"); goto ret; @@ -105,6 +148,11 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc) ret = 0; ret: + if (NULL != iob) + iobuf_unref (iob); + if (NULL != iobref) + iobref_unref (iobref); + return ret; } @@ -188,14 +236,278 @@ mnt3svc_set_mountres3 (mountstat3 stat, struct nfs3_fh *fh, int *authflavor, return res; } +/* Read the rmtab from the store_handle and append (or not) the entries to the + * mountlist. + * + * Requires the store_handle to be locked. + */ +static int +__mount_read_rmtab (gf_store_handle_t *sh, struct list_head *mountlist, + gf_boolean_t append) +{ + int ret = 0; + unsigned int idx = 0; + struct mountentry *me = NULL, *tmp = NULL; + /* me->hostname is a char[MNTPATHLEN] */ + char key[MNTPATHLEN + 11]; + + GF_ASSERT (sh && mountlist); + + if (!gf_store_locked_local (sh)) { + gf_log (GF_MNT, GF_LOG_WARNING, "Not reading unlocked %s", + sh->path); + return -1; + } + + if (!append) { + list_for_each_entry_safe (me, tmp, mountlist, mlist) { + list_del (&me->mlist); + GF_FREE (me); + } + me = NULL; + } + + for (;;) { + char *value = NULL; + + if (me && append) { + /* do not add duplicates */ + list_for_each_entry (tmp, mountlist, mlist) { + if (!strcmp(tmp->hostname, me->hostname) && + !strcmp(tmp->exname, me->exname)) { + GF_FREE (me); + goto dont_add; + } + } + list_add_tail (&me->mlist, mountlist); + } else if (me) { + list_add_tail (&me->mlist, mountlist); + } + +dont_add: + me = GF_CALLOC (1, sizeof (*me), gf_nfs_mt_mountentry); + if (!me) { + gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory"); + ret = -1; + goto out; + } + + INIT_LIST_HEAD (&me->mlist); + + snprintf (key, 9 + MNTPATHLEN, "hostname-%d", idx); + ret = gf_store_retrieve_value (sh, key, &value); + if (ret) + break; + strncpy (me->hostname, value, MNTPATHLEN); + GF_FREE (value); + + snprintf (key, 11 + MNTPATHLEN, "mountpoint-%d", idx); + ret = gf_store_retrieve_value (sh, key, &value); + if (ret) + break; + strncpy (me->exname, value, MNTPATHLEN); + GF_FREE (value); + + idx++; + gf_log (GF_MNT, GF_LOG_TRACE, "Read entries %s:%s", me->hostname, me->exname); + } + gf_log (GF_MNT, GF_LOG_DEBUG, "Read %d entries from '%s'", idx, sh->path); + GF_FREE (me); +out: + return ret; +} + +/* Overwrite the contents of the rwtab with te in-memory client list. + * Fail gracefully if the stora_handle is not locked. + */ +static void +__mount_rewrite_rmtab(struct mount3_state *ms, gf_store_handle_t *sh) +{ + struct mountentry *me = NULL; + char key[16]; + int fd, ret; + unsigned int idx = 0; + + if (!gf_store_locked_local (sh)) { + gf_log (GF_MNT, GF_LOG_WARNING, "Not modifying unlocked %s", + sh->path); + return; + } + + fd = gf_store_mkstemp (sh); + if (fd == -1) { + gf_log (GF_MNT, GF_LOG_ERROR, "Failed to open %s", sh->path); + return; + } + + list_for_each_entry (me, &ms->mountlist, mlist) { + snprintf (key, 16, "hostname-%d", idx); + ret = gf_store_save_value (fd, key, me->hostname); + if (ret) + goto fail; + + snprintf (key, 16, "mountpoint-%d", idx); + ret = gf_store_save_value (fd, key, me->exname); + if (ret) + goto fail; + + idx++; + } + + gf_log (GF_MNT, GF_LOG_DEBUG, "Updated rmtab with %d entries", idx); + + close (fd); + if (gf_store_rename_tmppath (sh)) + gf_log (GF_MNT, GF_LOG_ERROR, "Failed to overwrite rwtab %s", + sh->path); + + return; + +fail: + gf_log (GF_MNT, GF_LOG_ERROR, "Failed to update %s", sh->path); + close (fd); + gf_store_unlink_tmppath (sh); +} + +/* Read the rmtab into a clean ms->mountlist. + */ +static void +mount_read_rmtab (struct mount3_state *ms) +{ + gf_store_handle_t *sh = NULL; + struct nfs_state *nfs = NULL; + int ret; + + nfs = (struct nfs_state *)ms->nfsx->private; + + ret = gf_store_handle_new (nfs->rmtab, &sh); + if (ret) { + gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'", + nfs->rmtab); + return; + } + + if (gf_store_lock (sh)) { + gf_log (GF_MNT, GF_LOG_WARNING, "Failed to lock '%s'", + nfs->rmtab); + goto out; + } + + __mount_read_rmtab (sh, &ms->mountlist, _gf_false); + gf_store_unlock (sh); + +out: + gf_store_handle_destroy (sh); +} + +/* Write the ms->mountlist to the rmtab. + * + * The rmtab could be empty, or it can exists and have been updated by a + * different storage server without our knowing. + * + * 1. takes the store_handle lock on the current rmtab + * - blocks if an other storage server rewrites the rmtab at the same time + * 2. [if new_rmtab] takes the store_handle lock on the new rmtab + * 3. reads/merges the entries from the current rmtab + * 4. [if new_rmtab] reads/merges the entries from the new rmtab + * 5. [if new_rmtab] writes the new rmtab + * 6. [if not new_rmtab] writes the current rmtab + * 7 [if new_rmtab] replaces nfs->rmtab to point to the new location + * 8. [if new_rmtab] releases the store_handle lock of the new rmtab + * 9. releases the store_handle lock of the old rmtab + */ +void +mount_rewrite_rmtab (struct mount3_state *ms, char *new_rmtab) +{ + gf_store_handle_t *sh = NULL, *nsh = NULL; + struct nfs_state *nfs = NULL; + int ret; + char *rmtab = NULL; + + nfs = (struct nfs_state *)ms->nfsx->private; + + ret = gf_store_handle_new (nfs->rmtab, &sh); + if (ret) { + gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'", + nfs->rmtab); + return; + } + + if (gf_store_lock (sh)) { + gf_log (GF_MNT, GF_LOG_WARNING, "Not rewriting '%s'", + nfs->rmtab); + goto free_sh; + } + + if (new_rmtab) { + ret = gf_store_handle_new (new_rmtab, &nsh); + if (ret) { + gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'", + new_rmtab); + goto unlock_sh; + } + + if (gf_store_lock (nsh)) { + gf_log (GF_MNT, GF_LOG_WARNING, "Not rewriting '%s'", + new_rmtab); + goto free_nsh; + } + } + /* always read the currently used rmtab */ + __mount_read_rmtab (sh, &ms->mountlist, _gf_true); + + if (new_rmtab) { + /* read the new rmtab and write changes to the new location */ + __mount_read_rmtab (nsh, &ms->mountlist, _gf_true); + __mount_rewrite_rmtab (ms, nsh); + + /* replace the nfs->rmtab reference to the new rmtab */ + rmtab = gf_strdup(new_rmtab); + if (rmtab == NULL) { + gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory, keeping " + "%s as rmtab", nfs->rmtab); + } else { + GF_FREE (nfs->rmtab); + nfs->rmtab = new_rmtab; + } + + gf_store_unlock (nsh); + } else { + /* rewrite the current (unchanged location) rmtab */ + __mount_rewrite_rmtab (ms, sh); + } + +free_nsh: + if (new_rmtab) + gf_store_handle_destroy (nsh); +unlock_sh: + gf_store_unlock (sh); +free_sh: + gf_store_handle_destroy (sh); +} + +/* Add a new NFS-client to the ms->mountlist and update the rmtab if we can. + * + * A NFS-client will only be removed from the ms->mountlist in case the + * NFS-client sends a unmount request. It is possible that a NFS-client + * crashed/rebooted had network loss or something else prevented the NFS-client + * to unmount cleanly. In this case, a duplicate entry would be added to the + * ms->mountlist, which is wrong and we should prevent. + * + * It is fully acceptible that the ms->mountlist is not 100% correct, this is a + * common issue for all(?) NFS-servers. + */ int mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req, char *expname) { struct mountentry *me = NULL; + struct mountentry *cur = NULL; int ret = -1; char *colon = NULL; + struct nfs_state *nfs = NULL; + gf_store_handle_t *sh = NULL; if ((!ms) || (!req) || (!expname)) return -1; @@ -205,14 +517,24 @@ mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req, if (!me) return -1; - strcpy (me->exname, expname); + nfs = (struct nfs_state *)ms->nfsx->private; + + ret = gf_store_handle_new (nfs->rmtab, &sh); + if (ret) { + gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'", + nfs->rmtab); + goto free_err; + } + + strncpy (me->exname, expname, MNTPATHLEN); + INIT_LIST_HEAD (&me->mlist); /* Must get the IP or hostname of the client so we * can map it into the mount entry. */ ret = rpcsvc_transport_peername (req->trans, me->hostname, MNTPATHLEN); if (ret == -1) - goto free_err; + goto free_err2; colon = strrchr (me->hostname, ':'); if (colon) { @@ -220,10 +542,37 @@ mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req, } LOCK (&ms->mountlock); { + /* in case locking fails, we just don't write the rmtab */ + if (gf_store_lock (sh)) { + gf_log (GF_MNT, GF_LOG_WARNING, "Failed to lock '%s'" + ", changes will not be written", nfs->rmtab); + } else { + __mount_read_rmtab (sh, &ms->mountlist, _gf_false); + } + + /* do not add duplicates */ + list_for_each_entry (cur, &ms->mountlist, mlist) { + if (!strcmp(cur->hostname, me->hostname) && + !strcmp(cur->exname, me->exname)) { + GF_FREE (me); + goto dont_add; + } + } list_add_tail (&me->mlist, &ms->mountlist); + + /* only write the rmtab in case it was locked */ + if (gf_store_locked_local (sh)) + __mount_rewrite_rmtab (ms, sh); } +dont_add: + if (gf_store_locked_local (sh)) + gf_store_unlock (sh); + UNLOCK (&ms->mountlock); +free_err2: + gf_store_handle_destroy (sh); + free_err: if (ret == -1) GF_FREE (me); @@ -242,6 +591,7 @@ __mnt3_get_volume_id (struct mount3_state *ms, xlator_t *mntxl, if ((!ms) || (!mntxl)) return ret; + LOCK (&ms->mountlock); list_for_each_entry (exp, &ms->exportlist, explist) { if (exp->vol == mntxl) { uuid_copy (volumeid, exp->volumeid); @@ -251,6 +601,7 @@ __mnt3_get_volume_id (struct mount3_state *ms, xlator_t *mntxl, } out: + UNLOCK (&ms->mountlock); return ret; } @@ -271,7 +622,7 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie, rpcsvc_t *svc = NULL; xlator_t *mntxl = NULL; uuid_t volumeid = {0, }; - char fhstr[1024]; + char fhstr[1024], *path = NULL; req = (rpcsvc_request_t *)frame->local; @@ -293,7 +644,15 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie, if (status != MNT3_OK) goto xmit_res; - mnt3svc_update_mountlist (ms, req, mntxl->name); + path = GF_CALLOC (PATH_MAX, sizeof (char), gf_nfs_mt_char); + if (!path) { + gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory"); + goto xmit_res; + } + + snprintf (path, PATH_MAX, "/%s", mntxl->name); + mnt3svc_update_mountlist (ms, req, path); + GF_FREE (path); if (gf_nfs_dvm_off (nfs_state (ms->nfsx))) { fh = nfs3_fh_build_indexed_root_fh (ms->nfsx->children, mntxl); goto xmit_res; @@ -303,7 +662,7 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie, fh = nfs3_fh_build_uuid_root_fh (volumeid); xmit_res: - nfs3_fh_to_str (&fh, fhstr); + nfs3_fh_to_str (&fh, fhstr, sizeof (fhstr)); gf_log (GF_MNT, GF_LOG_DEBUG, "MNT reply: fh %s, status: %d", fhstr, status); if (op_ret == 0) { @@ -467,8 +826,8 @@ mnt3_resolve_state_wipe (mnt3_resolve_t *mres) /* Sets up the component argument to contain the next component in the path and * sets up path as an absolute path starting from the next component. */ -char * -__setup_next_component (char *path, char *component) +static char * +setup_next_component (char *path, size_t plen, char *component, size_t clen) { char *comp = NULL; char *nextcomp = NULL; @@ -476,7 +835,7 @@ __setup_next_component (char *path, char *component) if ((!path) || (!component)) return NULL; - strcpy (component, path); + strncpy (component, path, clen); comp = index (component, (int)'/'); if (!comp) goto err; @@ -484,7 +843,7 @@ __setup_next_component (char *path, char *component) comp++; nextcomp = index (comp, (int)'/'); if (nextcomp) { - strcpy (path, nextcomp); + strncpy (path, nextcomp, plen); *nextcomp = '\0'; } else path[0] = '\0'; @@ -514,7 +873,9 @@ __mnt3_resolve_export_subdir_comp (mnt3_resolve_t *mres) if (!mres) return ret; - nextcomp = __setup_next_component (mres->remainingdir, dupsubdir); + nextcomp = setup_next_component (mres->remainingdir, + sizeof (mres->remainingdir), + dupsubdir, sizeof (dupsubdir)); if (!nextcomp) goto err; @@ -526,7 +887,7 @@ __mnt3_resolve_export_subdir_comp (mnt3_resolve_t *mres) if ((ret < 0) && (ret != -2)) { gf_log (GF_MNT, GF_LOG_ERROR, "Failed to resolve and create " "inode: parent gfid %s, entry %s", - uuid_utoa (mres->resolveloc.inode->gfid), nextcomp); + uuid_utoa (gfid), nextcomp); ret = -EFAULT; goto err; } @@ -554,6 +915,7 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, rpcsvc_t *svc = NULL; mountres3 res = {0, }; xlator_t *mntxl = NULL; + char *path = NULL; mres = frame->local; mntxl = (xlator_t *)cookie; @@ -571,15 +933,23 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (strlen (mres->remainingdir) <= 0) { op_ret = -1; mntstat = MNT3_OK; + path = GF_CALLOC (PATH_MAX, sizeof (char), gf_nfs_mt_char); + if (!path) { + gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation " + "failed"); + goto err; + } + snprintf (path, PATH_MAX, "/%s%s", mres->exp->vol->name, + mres->resolveloc.path); mnt3svc_update_mountlist (mres->mstate, mres->req, - mres->exp->expname); - goto err; + path); + GF_FREE (path); + } else { + mres->parentfh = fh; + op_ret = __mnt3_resolve_export_subdir_comp (mres); + if (op_ret < 0) + mntstat = mnt3svc_errno_to_mnterr (-op_ret); } - - mres->parentfh = fh; - op_ret = __mnt3_resolve_export_subdir_comp (mres); - if (op_ret < 0) - mntstat = mnt3svc_errno_to_mnterr (-op_ret); err: if (op_ret == -1) { gf_log (GF_MNT, GF_LOG_DEBUG, "Mount reply status: %d", @@ -622,7 +992,9 @@ __mnt3_resolve_subdir (mnt3_resolve_t *mres) if (!mres) return ret; - firstcomp = __setup_next_component (mres->remainingdir, dupsubdir); + firstcomp = setup_next_component (mres->remainingdir, + sizeof (mres->remainingdir), + dupsubdir, sizeof (dupsubdir)); if (!firstcomp) goto err; @@ -645,6 +1017,132 @@ err: } +/** + * This function will verify if the client is allowed to mount + * the directory or not. Client's IP address will be compared with + * allowed IP list or range present in mnt3_export structure. + * + * @param req - RPC request. This structure contains client's IP address. + * @param export - mnt3_export structure. Contains allowed IP list/range. + * + * @return 0 - on Success and -EACCES on failure. + */ +int +mnt3_verify_auth (rpcsvc_request_t *req, struct mnt3_export *export) +{ + int retvalue = -EACCES; + int ret = 0; + int shiftbits = 0; + uint32_t ipv4netmask = 0; + uint32_t routingprefix = 0; + struct host_auth_spec *host = NULL; + struct sockaddr_in *client_addr = NULL; + struct sockaddr_in *allowed_addr = NULL; + struct addrinfo *allowed_addrinfo = NULL; + + /* Sanity check */ + if ((NULL == req) || + (NULL == req->trans) || + (NULL == export) || + (NULL == export->hostspec)) { + gf_log (GF_MNT, GF_LOG_ERROR, "Invalid argument"); + return retvalue; + } + + host = export->hostspec; + + + /* Client's IP address. */ + client_addr = (struct sockaddr_in *)(&(req->trans->peerinfo.sockaddr)); + + /* Try to see if the client IP matches the allowed IP list.*/ + while (NULL != host){ + GF_ASSERT (host->host_addr); + + if (NULL != allowed_addrinfo) { + freeaddrinfo (allowed_addrinfo); + allowed_addrinfo = NULL; + } + + /* Get the addrinfo for the allowed host (host_addr). */ + ret = getaddrinfo (host->host_addr, + NULL, + NULL, + &allowed_addrinfo); + if (0 != ret){ + gf_log (GF_MNT, GF_LOG_ERROR, "getaddrinfo: %s\n", + gai_strerror (ret)); + host = host->next; + + /* Failed to get IP addrinfo. Continue to check other + * allowed IPs in the list. + */ + continue; + } + + allowed_addr = (struct sockaddr_in *)(allowed_addrinfo->ai_addr); + + if (NULL == allowed_addr) { + gf_log (GF_MNT, GF_LOG_ERROR, "Invalid structure"); + break; + } + + if (AF_INET == allowed_addr->sin_family){ + if (IPv4_ADDR_SIZE < host->routeprefix) { + gf_log (GF_MNT, GF_LOG_ERROR, "invalid IP " + "configured for export-dir AUTH"); + host = host->next; + continue; + } + + /* -1 means no route prefix is provided. In this case + * the IP should be an exact match. Which is same as + * providing a route prefix of IPv4_ADDR_SIZE. + */ + if (-1 == host->routeprefix) { + routingprefix = IPv4_ADDR_SIZE; + } else { + routingprefix = host->routeprefix; + } + + /* Create a mask from the routing prefix. User provided + * CIDR address is split into IP address (host_addr) and + * routing prefix (routeprefix). This CIDR address may + * denote a single, distinct interface address or the + * beginning address of an entire network. + * + * e.g. the IPv4 block 192.168.100.0/24 represents the + * 256 IPv4 addresses from 192.168.100.0 to + * 192.168.100.255. + * Therefore to check if an IP matches 192.168.100.0/24 + * we should mask the IP with FFFFFF00 and compare it + * with host address part of CIDR. + */ + shiftbits = IPv4_ADDR_SIZE - routingprefix; + ipv4netmask = 0xFFFFFFFFUL << shiftbits; + + /* Mask both the IPs and then check if they match + * or not. */ + if (COMPARE_IPv4_ADDRS (allowed_addr, + client_addr, + ipv4netmask)){ + retvalue = 0; + break; + } + } + + /* Client IP didn't match the allowed IP. + * Check with the next allowed IP.*/ + host = host->next; + } + + if (NULL != allowed_addrinfo) { + freeaddrinfo (allowed_addrinfo); + } + + return retvalue; +} + int mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms, struct mnt3_export *exp, char *subdir) @@ -656,6 +1154,16 @@ mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms, if ((!req) || (!ms) || (!exp) || (!subdir)) return ret; + /* Need to check AUTH */ + if (NULL != exp->hostspec) { + ret = mnt3_verify_auth (req, exp); + if (0 != ret) { + gf_log (GF_MNT,GF_LOG_ERROR, + "AUTH verification failed"); + return ret; + } + } + mres = GF_CALLOC (1, sizeof (mnt3_resolve_t), gf_nfs_mt_mnt3_resolve); if (!mres) { gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed"); @@ -665,7 +1173,7 @@ mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms, mres->exp = exp; mres->mstate = ms; mres->req = req; - strcpy (mres->remainingdir, subdir); + strncpy (mres->remainingdir, subdir, MNTPATHLEN); if (gf_nfs_dvm_off (nfs_state (ms->nfsx))) pfh = nfs3_fh_build_indexed_root_fh (mres->mstate->nfsx->children, mres->exp->vol); else @@ -740,6 +1248,7 @@ mnt3_mntpath_to_export (struct mount3_state *ms, char *dirpath) if ((!ms) || (!dirpath)) return NULL; + LOCK (&ms->mountlock); list_for_each_entry (exp, &ms->exportlist, explist) { /* Search for the an exact match with the volume */ @@ -753,6 +1262,7 @@ mnt3_mntpath_to_export (struct mount3_state *ms, char *dirpath) gf_log (GF_MNT, GF_LOG_DEBUG, "Export not found"); foundexp: + UNLOCK (&ms->mountlock); return found; } @@ -781,7 +1291,7 @@ mnt3_check_client_net (struct mount3_state *ms, rpcsvc_request_t *req, gai_strerror (ret)); } - ret = rpcsvc_auth_check (svc->options, targetxl->name, trans); + ret = rpcsvc_auth_check (svc, targetxl->name, trans); if (ret == RPCSVC_AUTH_REJECT) { gf_log (GF_MNT, GF_LOG_INFO, "Peer %s not allowed", peer); goto err; @@ -976,6 +1486,9 @@ __build_mountlist (struct mount3_state *ms, int *count) if ((!ms) || (!count)) return NULL; + /* read rmtab, other peers might have updated it */ + mount_read_rmtab(ms); + *count = 0; gf_log (GF_MNT, GF_LOG_DEBUG, "Building mount list:"); list_for_each_entry (me, &ms->mountlist, mlist) { @@ -986,6 +1499,8 @@ __build_mountlist (struct mount3_state *ms, int *count) " failed"); goto free_list; } + if (!first) + first = mlist; mlist->ml_directory = GF_CALLOC (namelen + 2, sizeof (char), gf_nfs_mt_char); @@ -995,8 +1510,7 @@ __build_mountlist (struct mount3_state *ms, int *count) goto free_list; } - strcpy (mlist->ml_directory, "/"); - strcat (mlist->ml_directory, me->exname); + strcpy (mlist->ml_directory, me->exname); namelen = strlen (me->hostname); mlist->ml_hostname = GF_CALLOC (namelen + 2, sizeof (char), @@ -1017,9 +1531,6 @@ __build_mountlist (struct mount3_state *ms, int *count) } else prev = mlist; - if (!first) - first = mlist; - (*count)++; } @@ -1096,67 +1607,71 @@ rpcerr: int -__mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname) +mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname) { struct mountentry *me = NULL; - char *exname = NULL; int ret = -1; + gf_store_handle_t *sh = NULL; + struct nfs_state *nfs = NULL; if ((!ms) || (!dirpath) || (!hostname)) return -1; - if (list_empty (&ms->mountlist)) - return 0; - - if (dirpath[0] == '/') - exname = dirpath+1; - else - exname = dirpath; + nfs = (struct nfs_state *)ms->nfsx->private; - list_for_each_entry (me, &ms->mountlist, mlist) { - if ((strcmp (me->exname, exname) == 0) && - (strcmp (me->hostname, hostname) == 0)) { - ret = 0; - break; - } + ret = gf_store_handle_new (nfs->rmtab, &sh); + if (ret) { + gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'", + nfs->rmtab); + return 0; } - /* Need this check here because at the end of the search me might still - * be pointing to the last entry, which may not be the one we're - * looking for. - */ - if (ret == -1) {/* Not found in list. */ - gf_log (GF_MNT, GF_LOG_DEBUG, "Export not found"); - goto ret; + ret = gf_store_lock (sh); + if (ret) { + goto out_free; } - if (!me) - goto ret; + LOCK (&ms->mountlock); + { + __mount_read_rmtab (sh, &ms->mountlist, _gf_false); + if (list_empty (&ms->mountlist)) { + ret = 0; + goto out_unlock; + } - gf_log (GF_MNT, GF_LOG_DEBUG, "Unmounting: dir %s, host: %s", - me->exname, me->hostname); - list_del (&me->mlist); - GF_FREE (me); - ret = 0; -ret: - return ret; -} + ret = -1; + list_for_each_entry (me, &ms->mountlist, mlist) { + if ((strcmp (me->exname, dirpath) == 0) && + (strcmp (me->hostname, hostname) == 0)) { + ret = 0; + break; + } + } + /* Need this check here because at the end of the search me + * might still be pointing to the last entry, which may not be + * the one we're looking for. + */ + if (ret == -1) {/* Not found in list. */ + gf_log (GF_MNT, GF_LOG_TRACE, "Export not found"); + goto out_unlock; + } + if (!me) + goto out_unlock; -int -mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname) -{ - int ret = -1; - if ((!ms) || (!dirpath) || (!hostname)) - return -1; + gf_log (GF_MNT, GF_LOG_DEBUG, "Unmounting: dir %s, host: %s", + me->exname, me->hostname); - LOCK (&ms->mountlock); - { - ret = __mnt3svc_umount (ms, dirpath, hostname); + list_del (&me->mlist); + GF_FREE (me); + __mount_rewrite_rmtab (ms, sh); } +out_unlock: UNLOCK (&ms->mountlock); - + gf_store_unlock (sh); +out_free: + gf_store_handle_destroy (sh); return ret; } @@ -1304,6 +1819,10 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms) return NULL; nfs = (struct nfs_state *)ms->nfsx->private; + if (!nfs) + return NULL; + + LOCK (&ms->mountlock); list_for_each_entry(ent, &ms->exportlist, explist) { /* If volume is not started yet, do not list it for tools like @@ -1319,7 +1838,8 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms) " failed"); goto free_list; } - + if (!first) + first = elist; elist->ex_dir = GF_CALLOC (namelen + 2, sizeof (char), gf_nfs_mt_char); if (!elist->ex_dir) { @@ -1327,16 +1847,10 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms) " failed"); goto free_list; } - strcpy (elist->ex_dir, ent->expname); addrstr = rpcsvc_volume_allowed (svc->options, ent->vol->name); - if (addrstr) - addrstr = gf_strdup (addrstr); - else - addrstr = gf_strdup ("No Access"); - elist->ex_groups = GF_CALLOC (1, sizeof (struct groupnode), gf_nfs_mt_groupnode); if (!elist->ex_groups) { @@ -1344,21 +1858,29 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms) " failed"); goto free_list; } + /*This check has to be done after checking + * elist->ex_groups allocation check to avoid resource leak; + */ + if (addrstr) + addrstr = gf_strdup (addrstr); + else + addrstr = gf_strdup ("No Access"); + if (!addrstr) { + goto free_list; + } elist->ex_groups->gr_name = addrstr; if (prev) { prev->ex_next = elist; prev = elist; } else prev = elist; - - if (!first) - first = elist; } ret = 0; free_list: + UNLOCK (&ms->mountlock); if (ret == -1) { xdr_free_exports_list (first); first = NULL; @@ -1464,12 +1986,13 @@ mount3udp_add_mountlist (char *host, dirpath *expname) while (*export == '/') export++; - strcpy (me->exname, export); - strcpy (me->hostname, host); + strncpy (me->exname, export, MNTPATHLEN); + strncpy (me->hostname, host, MNTPATHLEN); INIT_LIST_HEAD (&me->mlist); LOCK (&ms->mountlock); { list_add_tail (&me->mlist, &ms->mountlist); + mount_rewrite_rmtab(ms, NULL); } UNLOCK (&ms->mountlock); return 0; @@ -1485,11 +2008,155 @@ mount3udp_delete_mountlist (char *hostname, dirpath *expname) export = (char *)expname; while (*export == '/') export++; - __mnt3svc_umount (ms, export, hostname); + mnt3svc_umount (ms, export, hostname); return 0; } +/** + * This function will parse the hostip (IP addres, IP range, or hostname) + * and fill the host_auth_spec structure. + * + * @param hostspec - struct host_auth_spec + * @param hostip - IP address, IP range (CIDR format) or hostname + * + * @return 0 - on success and -1 on failure + */ +int +mnt3_export_fill_hostspec (struct host_auth_spec* hostspec, const char* hostip) +{ + char *ipdupstr = NULL; + char *savptr = NULL; + char *ip = NULL; + char *token = NULL; + int ret = -1; + + /* Create copy of the string so that the source won't change + */ + ipdupstr = gf_strdup (hostip); + if (NULL == ipdupstr) { + gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed"); + goto err; + } + + ip = strtok_r (ipdupstr, "/", &savptr); + hostspec->host_addr = gf_strdup (ip); + if (NULL == hostspec->host_addr) { + gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed"); + goto err; + } + + /* Check if the IP is in <IP address> / <Range> format. + * If yes, then strip the range and store it separately. + */ + token = strtok_r (NULL, "/", &savptr); + + if (NULL == token) { + hostspec->routeprefix = -1; + } else { + hostspec->routeprefix = atoi (token); + } + + // success + ret = 0; +err: + if (NULL != ipdupstr) { + GF_FREE (ipdupstr); + } + return ret; +} + + +/** + * This function will parse the AUTH parameter passed along with + * "export-dir" option. If AUTH parameter is present then it will be + * stripped from exportpath and stored in mnt3_export (exp) structure. + * + * @param exp - mnt3_export structure. Holds information needed for mount. + * @param exportpath - Value of "export-dir" key. Holds both export path + * and AUTH parameter for the path. + * exportpath format: <abspath>[(hostdesc[|hostspec|...])] + * + * @return This function will return 0 on success and -1 on failure. + */ +int +mnt3_export_parse_auth_param (struct mnt3_export* exp, char* exportpath) +{ + char *token = NULL; + char *savPtr = NULL; + char *hostip = NULL; + struct host_auth_spec *host = NULL; + int ret = 0; + + /* Using exportpath directly in strtok_r because we want + * to strip off AUTH parameter from exportpath. */ + token = strtok_r (exportpath, "(", &savPtr); + + /* Get the next token, which will be the AUTH parameter. */ + token = strtok_r (NULL, ")", &savPtr); + + if (NULL == token) { + /* If AUTH is not present then we should return success. */ + return 0; + } + + /* Free any previously allocated hostspec structure. */ + if (NULL != exp->hostspec) { + GF_FREE (exp->hostspec); + exp->hostspec = NULL; + } + + exp->hostspec = GF_CALLOC (1, + sizeof (*(exp->hostspec)), + gf_nfs_mt_auth_spec); + if (NULL == exp->hostspec){ + gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed"); + return -1; + } + + /* AUTH parameter can have multiple entries. For each entry + * a host_auth_spec structure is created. */ + host = exp->hostspec; + + hostip = strtok_r (token, "|", &savPtr); + + /* Parse all AUTH parameters separated by '|' */ + while (NULL != hostip){ + ret = mnt3_export_fill_hostspec (host, hostip); + if (0 != ret) { + gf_log(GF_MNT, GF_LOG_WARNING, + "Failed to parse hostspec: %s", hostip); + goto err; + } + + hostip = strtok_r (NULL, "|", &savPtr); + if (NULL == hostip) { + break; + } + + host->next = GF_CALLOC (1, sizeof (*(host)), + gf_nfs_mt_auth_spec); + if (NULL == host->next){ + gf_log (GF_MNT,GF_LOG_ERROR, + "Memory allocation failed"); + goto err; + } + host = host->next; + } + + /* In case of success return from here */ + return 0; +err: + /* In case of failure free up hostspec structure. */ + FREE_HOSTSPEC (exp); + + return -1; +} +/** + * exportpath will also have AUTH options (ip address, subnet address or + * hostname) mentioned. + * exportpath format: <abspath>[(hostdesc[|hostspec|...])] + */ struct mnt3_export * mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath, uuid_t volumeid) @@ -1507,6 +2174,20 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath, return NULL; } + if (NULL != exportpath) { + /* If exportpath is not NULL then we should check if AUTH + * parameter is present or not. If AUTH parameter is present + * then it will be stripped and stored in mnt3_export (exp) + * structure. + */ + if (0 != mnt3_export_parse_auth_param (exp, exportpath)){ + gf_log (GF_MNT, GF_LOG_ERROR, + "Failed to parse auth param"); + goto err; + } + } + + INIT_LIST_HEAD (&exp->explist); if (exportpath) alloclen = strlen (xl->name) + 2 + strlen (exportpath); @@ -1516,8 +2197,6 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath, exp->expname = GF_CALLOC (alloclen, sizeof (char), gf_nfs_mt_char); if (!exp->expname) { gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed"); - GF_FREE (exp); - exp = NULL; goto err; } @@ -1534,8 +2213,9 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath, ret = snprintf (exp->expname, alloclen, "/%s", xl->name); } if (ret < 0) { - gf_log (xl->name, GF_LOG_WARNING, - "failed to get the export name"); + gf_log (xl->name, GF_LOG_ERROR, + "Failed to set the export name"); + goto err; } /* Just copy without discrimination, we'll determine whether to * actually use it when a mount request comes in and a file handle @@ -1543,7 +2223,16 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath, */ uuid_copy (exp->volumeid, volumeid); exp->vol = xl; + + /* On success we should return from here*/ + return exp; err: + /* On failure free exp and it's members.*/ + if (NULL != exp) { + mnt3_export_free (exp); + exp = NULL; + } + return exp; } @@ -1704,8 +2393,11 @@ __mnt3_init_volume_export (struct mount3_state *ms, dict_t *opts) goto err; } - gf_string2boolean (optstr, &boolt); - ret = 0; + ret = gf_string2boolean (optstr, &boolt); + if (ret < 0) { + gf_log (GF_MNT, GF_LOG_ERROR, "Failed to convert" + " string to boolean"); + } err: if (boolt == _gf_false) { @@ -1744,8 +2436,11 @@ __mnt3_init_dir_export (struct mount3_state *ms, dict_t *opts) goto err; } - gf_string2boolean (optstr, &boolt); - ret = 0; + ret = gf_string2boolean (optstr, &boolt); + if (ret < 0) { + gf_log (GF_MNT, GF_LOG_ERROR, "Failed to convert" + " string to boolean"); + } err: if (boolt == _gf_false) { @@ -1843,12 +2538,12 @@ out: } rpcsvc_actor_t mnt3svc_actors[MOUNT3_PROC_COUNT] = { - {"NULL", MOUNT3_NULL, mnt3svc_null, NULL, 0}, - {"MNT", MOUNT3_MNT, mnt3svc_mnt, NULL, 0}, - {"DUMP", MOUNT3_DUMP, mnt3svc_dump, NULL, 0}, - {"UMNT", MOUNT3_UMNT, mnt3svc_umnt, NULL, 0}, - {"UMNTALL", MOUNT3_UMNTALL, mnt3svc_umntall, NULL, 0}, - {"EXPORT", MOUNT3_EXPORT, mnt3svc_export, NULL, 0} + {"NULL", MOUNT3_NULL, mnt3svc_null, NULL, 0, DRC_NA}, + {"MNT", MOUNT3_MNT, mnt3svc_mnt, NULL, 0, DRC_NA}, + {"DUMP", MOUNT3_DUMP, mnt3svc_dump, NULL, 0, DRC_NA}, + {"UMNT", MOUNT3_UMNT, mnt3svc_umnt, NULL, 0, DRC_NA}, + {"UMNTALL", MOUNT3_UMNTALL, mnt3svc_umntall, NULL, 0, DRC_NA}, + {"EXPORT", MOUNT3_EXPORT, mnt3svc_export, NULL, 0, DRC_NA} }; @@ -1922,7 +2617,7 @@ mnt3svc_init (xlator_t *nfsx) } } - rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name); + ret= rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name); if (ret == -1) { gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners"); dict_unref (options); @@ -1939,12 +2634,12 @@ err: rpcsvc_actor_t mnt1svc_actors[MOUNT1_PROC_COUNT] = { - {"NULL", MOUNT1_NULL, mnt3svc_null, NULL, 0}, - {{0, 0}, }, - {"DUMP", MOUNT1_DUMP, mnt3svc_dump, NULL, 0}, - {"UMNT", MOUNT1_UMNT, mnt3svc_umnt, NULL, 0}, - {{0, 0}, }, - {"EXPORT", MOUNT1_EXPORT, mnt3svc_export, NULL, 0} + {"NULL", MOUNT1_NULL, mnt3svc_null, NULL, 0, DRC_NA}, + {"MNT", MOUNT1_MNT, NULL, NULL, 0, DRC_NA }, + {"DUMP", MOUNT1_DUMP, mnt3svc_dump, NULL, 0, DRC_NA}, + {"UMNT", MOUNT1_UMNT, mnt3svc_umnt, NULL, 0, DRC_NA}, + {"UMNTALL", MOUNT1_UMNTALL, NULL, NULL, 0, DRC_NA}, + {"EXPORT", MOUNT1_EXPORT, mnt3svc_export, NULL, 0, DRC_NA} }; rpcsvc_program_t mnt1prog = { @@ -2010,7 +2705,7 @@ mnt1svc_init (xlator_t *nfsx) } } - rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name); + ret = rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name); if (ret == -1) { gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners"); dict_unref (options); @@ -2021,3 +2716,44 @@ mnt1svc_init (xlator_t *nfsx) err: return NULL; } + +int +mount_reconfigure_state (xlator_t *nfsx, dict_t *options) +{ + int ret = -1; + struct nfs_state *nfs = NULL; + struct mount3_state *ms = NULL; + struct mnt3_export *exp = NULL; + struct mnt3_export *texp = NULL; + + if ((!nfsx) || (!options)) + return (-1); + + nfs = (struct nfs_state *)nfs_state (nfsx); + if (!nfs) + return (-1); + + ms = nfs->mstate; + if (!ms) + return (-1); + + /* + * Free() up the old export list. mnt3_init_options() will + * rebuild the export list from scratch. Do it with locking + * to avoid unnecessary race conditions. + */ + LOCK (&ms->mountlock); + list_for_each_entry_safe (exp, texp, &ms->exportlist, explist) { + list_del (&exp->explist); + mnt3_export_free (exp); + } + ret = mnt3_init_options (ms, options); + UNLOCK (&ms->mountlock); + + if (ret < 0) { + gf_log (GF_MNT, GF_LOG_ERROR, "Options reconfigure failed"); + return (-1); + } + + return (0); +} diff --git a/xlators/nfs/server/src/mount3.h b/xlators/nfs/server/src/mount3.h index c0eae3644..7fc16ed57 100644 --- a/xlators/nfs/server/src/mount3.h +++ b/xlators/nfs/server/src/mount3.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _MOUNT3_H_ @@ -53,6 +44,12 @@ mnt1svc_init (xlator_t *nfsx); extern int mount_init_state (xlator_t *nfsx); +extern int +mount_reconfigure_state (xlator_t *nfsx, dict_t *options); + +void +mount_rewrite_rmtab (struct mount3_state *ms, char *new_rmtab); + /* Data structure used to store the list of mounts points currently * in use by NFS clients. */ @@ -68,6 +65,13 @@ struct mountentry { #define MNT3_EXPTYPE_VOLUME 1 #define MNT3_EXPTYPE_DIR 2 +/* Structure to hold export-dir AUTH parameter */ +struct host_auth_spec { + char *host_addr; /* Allowed IP or host name */ + int routeprefix; /* Routing prefix */ + struct host_auth_spec *next; /* Pointer to next AUTH struct */ +}; + struct mnt3_export { struct list_head explist; @@ -75,6 +79,11 @@ struct mnt3_export { * is exported or the subdirectory in the volume. */ char *expname; + /* + * IP address, hostname or subnets who are allowed to connect to expname + * subvolume or subdirectory + */ + struct host_auth_spec* hostspec; xlator_t *vol; int exptype; @@ -101,8 +110,8 @@ struct mount3_state { gf_lock_t mountlock; /* Set to 0 if exporting full volumes is disabled. On by default. */ - int export_volumes; - int export_dirs; + gf_boolean_t export_volumes; + gf_boolean_t export_dirs; }; #define gf_mnt3_export_dirs(mst) ((mst)->export_dirs) diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c index aa38b1cc4..fb59e282c 100644 --- a/xlators/nfs/server/src/mount3udp_svc.c +++ b/xlators/nfs/server/src/mount3udp_svc.c @@ -2,19 +2,10 @@ Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c index b3853ded5..f74396ee8 100644 --- a/xlators/nfs/server/src/nfs-common.c +++ b/xlators/nfs/server/src/nfs-common.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -94,7 +85,7 @@ nfs_mntpath_to_xlator (xlator_list_t *cl, char *path) if ((!cl) || (!path)) return NULL; - strcpy (volname, path); + strncpy (volname, path, MNTPATHLEN); pathlen = strlen (volname); gf_log (GF_NFS, GF_LOG_TRACE, "Subvolume search: %s", path); if (volname[0] == '/') diff --git a/xlators/nfs/server/src/nfs-common.h b/xlators/nfs/server/src/nfs-common.h index f74bb3187..2e97f1563 100644 --- a/xlators/nfs/server/src/nfs-common.h +++ b/xlators/nfs/server/src/nfs-common.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _NFS_COMMON_H_ diff --git a/xlators/nfs/server/src/nfs-fops.c b/xlators/nfs/server/src/nfs-fops.c index b6edc99c7..236b80c76 100644 --- a/xlators/nfs/server/src/nfs-fops.c +++ b/xlators/nfs/server/src/nfs-fops.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -191,6 +182,12 @@ nfs_create_frame (xlator_t *xl, nfs_user_t *nfu) frame = create_frame (xl, (call_pool_t *)xl->ctx->pool); if (!frame) goto err; + if (call_stack_alloc_groups (frame->root, nfu->ngrps) != 0) { + STACK_DESTROY (frame->root); + frame = NULL; + goto err; + } + frame->root->pid = NFS_PID; frame->root->uid = nfu->uid; frame->root->gid = nfu->gids[NFS_PRIMGID_IDX]; @@ -385,19 +382,6 @@ nfs_fop_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { struct nfs_fop_local *local = NULL; fop_lookup_cbk_t progcbk; - int32_t spb = 0; - - /* - * With native protocol, self-heal failures would be detected during - * open. NFS doesn't issue that open when revalidating cache, so we - * have to check for failures here instead. - */ - if (dict_get_int32(xattr, "split-brain", &spb) == 0) { - if (spb) { - op_ret = -1; - op_errno = EIO; - } - } if (op_ret == 0) { nfs_fix_generation(this,inode); diff --git a/xlators/nfs/server/src/nfs-fops.h b/xlators/nfs/server/src/nfs-fops.h index d846e14de..44e99c66b 100644 --- a/xlators/nfs/server/src/nfs-fops.h +++ b/xlators/nfs/server/src/nfs-fops.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _NFS_FOPS_H_ diff --git a/xlators/nfs/server/src/nfs-generics.c b/xlators/nfs/server/src/nfs-generics.c index 7f79bba9b..cb32b7f1b 100644 --- a/xlators/nfs/server/src/nfs-generics.c +++ b/xlators/nfs/server/src/nfs-generics.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H diff --git a/xlators/nfs/server/src/nfs-generics.h b/xlators/nfs/server/src/nfs-generics.h index 34e203a62..01876d68e 100644 --- a/xlators/nfs/server/src/nfs-generics.h +++ b/xlators/nfs/server/src/nfs-generics.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _NFS_GENERICS_H_ diff --git a/xlators/nfs/server/src/nfs-inodes.c b/xlators/nfs/server/src/nfs-inodes.c index 291152f85..63d5e8a19 100644 --- a/xlators/nfs/server/src/nfs-inodes.c +++ b/xlators/nfs/server/src/nfs-inodes.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -57,10 +48,10 @@ nfl_inodes_init (struct nfs_fop_local *nfl, inode_t *inode, inode_t *parent, nfl->newparent = inode_ref (newparent); if (name) - strcpy (nfl->path, name); + strncpy (nfl->path, name, NFS_NAME_MAX); if (newname) - strcpy (nfl->newpath, newname); + strncpy (nfl->newpath, newname, NFS_NAME_MAX); return; } diff --git a/xlators/nfs/server/src/nfs-inodes.h b/xlators/nfs/server/src/nfs-inodes.h index 7c962b339..ba7a57124 100644 --- a/xlators/nfs/server/src/nfs-inodes.h +++ b/xlators/nfs/server/src/nfs-inodes.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _NFS_INODES_H_ diff --git a/xlators/nfs/server/src/nfs-mem-types.h b/xlators/nfs/server/src/nfs-mem-types.h index 005598c1f..450b6f2fe 100644 --- a/xlators/nfs/server/src/nfs-mem-types.h +++ b/xlators/nfs/server/src/nfs-mem-types.h @@ -2,19 +2,10 @@ Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ @@ -54,6 +45,8 @@ enum gf_nfs_mem_types_ { gf_nfs_mt_nlm4_share, gf_nfs_mt_aux_gids, gf_nfs_mt_inode_ctx, + gf_nfs_mt_auth_spec, + gf_nfs_mt_arr, gf_nfs_mt_end }; #endif diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c index 1d59ad399..75c8fe44e 100644 --- a/xlators/nfs/server/src/nfs.c +++ b/xlators/nfs/server/src/nfs.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /* This is the primary translator source for NFS. @@ -43,10 +34,206 @@ #include "nlm4.h" #include "options.h" #include "acl3.h" +#include "rpc-drc.h" + +#define STRINGIFY(val) #val +#define TOSTRING(val) STRINGIFY(val) #define OPT_SERVER_AUX_GIDS "nfs.server-aux-gids" #define OPT_SERVER_GID_CACHE_TIMEOUT "nfs.server.aux-gid-timeout" +/* TODO: DATADIR should be based on configure's $(localstatedir) */ +#define DATADIR "/var/lib/glusterd" +#define NFS_DATADIR DATADIR "/nfs" + +/* Forward declaration */ +int nfs_add_initer (struct list_head *list, nfs_version_initer_t init); + +static int +nfs_init_version (xlator_t *this, nfs_version_initer_t init) +{ + int ret = -1; + struct nfs_initer_list *version = NULL; + struct nfs_initer_list *tmp = NULL; + rpcsvc_program_t *prog = NULL; + struct list_head *versions = NULL; + struct nfs_state *nfs = NULL; + gf_boolean_t found = _gf_false; + + if ((!this) || (!this->private) || (!init)) + return (-1); + + nfs = (struct nfs_state *)this->private; + + ret = nfs_add_initer (&nfs->versions, init); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, + "Failed to add protocol initializer"); + goto err; + } + + versions = &nfs->versions; + list_for_each_entry_safe (version, tmp, versions, list) { + prog = version->program; + if (version->init == init) { + prog = init(this); + if (!prog) { + ret = -1; + goto err; + } + version->program = prog; + found = _gf_true; + break; + } + } + + /* program not added */ + if (!found) { + gf_log (GF_NFS, GF_LOG_ERROR, + "Program: %s NOT found", prog->progname); + goto err; + } + + /* Check if nfs.port is configured */ + if (nfs->override_portnum) + prog->progport = nfs->override_portnum; + + gf_log (GF_NFS, GF_LOG_DEBUG, "Starting program: %s", prog->progname); + + ret = rpcsvc_program_register (nfs->rpcsvc, prog); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, "Program: %s init failed", + prog->progname); + goto err; + } + + /* Registration with portmapper is disabled, Nothing to do */ + if (!nfs->register_portmap) + goto err; + + ret = rpcsvc_program_register_portmap (prog, prog->progport); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, + "Program %s registration failed", + prog->progname); + goto err; + } + ret = 0; /* All well */ +err: + return ret; +} + +static int +nfs_deinit_version (struct nfs_state *nfs, nfs_version_initer_t init) +{ + int ret = -1; + struct nfs_initer_list *version = NULL; + struct nfs_initer_list *tmp = NULL; + rpcsvc_program_t *prog = NULL; + struct list_head *versions = NULL; + + if ((!nfs) || (!init)) + return (-1); + + versions = &nfs->versions; + list_for_each_entry_safe (version, tmp, versions, list) { + prog = version->program; + if (version->init == init) { + prog = version->program; + ret = rpcsvc_program_unregister (nfs->rpcsvc, prog); + if (ret != 0) + return (-1); + list_del (&version->list); + GF_FREE (version); + return (0); + } + } + + return (-1); +} + +static int +nfs_reconfigure_acl3 (xlator_t *this) +{ + struct nfs_state *nfs = NULL; + + if ((!this) || (!this->private)) + return (-1); + + nfs = (struct nfs_state *)this->private; + + /* ACL is enabled */ + if (nfs->enable_acl) + return nfs_init_version (this, acl3svc_init); + + /* ACL is disabled */ + return nfs_deinit_version (nfs, acl3svc_init); +} + +static int +nfs_reconfigure_nlm4 (xlator_t *this) +{ + struct nfs_state *nfs = NULL; + + if ((!this) || (!this->private)) + return (-1); + + nfs = (struct nfs_state *)this->private; + + /* NLM is enabled */ + if (nfs->enable_nlm) + return nfs_init_version (this, nlm4svc_init); + + /* NLM is disabled */ + return nfs_deinit_version (nfs, nlm4svc_init); +} + +static int +nfs_program_register_portmap_all (struct nfs_state *nfs) +{ + struct list_head *versions = NULL; + struct nfs_initer_list *version = NULL; + struct nfs_initer_list *tmp = NULL; + rpcsvc_program_t *prog = NULL; + + if (nfs == NULL) + return (-1); + + versions = &nfs->versions; + list_for_each_entry_safe (version, tmp, versions, list) { + prog = version->program; + if (prog == NULL) + continue; + if (nfs->override_portnum) + prog->progport = nfs->override_portnum; + (void) rpcsvc_program_register_portmap (prog, prog->progport); + } + + return (0); +} + +static int +nfs_program_unregister_portmap_all (struct nfs_state *nfs) +{ + struct list_head *versions = NULL; + struct nfs_initer_list *version = NULL; + struct nfs_initer_list *tmp = NULL; + rpcsvc_program_t *prog = NULL; + + if (nfs == NULL) + return (-1); + + versions = &nfs->versions; + list_for_each_entry_safe (version, tmp, versions, list) { + prog = version->program; + if (prog == NULL) + continue; + (void) rpcsvc_program_unregister_portmap (prog); + } + + return (0); +} + /* Every NFS version must call this function with the init function * for its particular version. */ @@ -123,7 +310,7 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this) ret = -1; goto err; } -// prog->actorxl = this; + version->program = prog; if (nfs->override_portnum) prog->progport = nfs->override_portnum; @@ -132,17 +319,21 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this) ret = rpcsvc_program_register (nfs->rpcsvc, prog); if (ret == -1) { - gf_log (GF_NFS, GF_LOG_ERROR, "Program init failed"); + gf_log (GF_NFS, GF_LOG_ERROR, "Program: %s init failed", + prog->progname); goto err; } - if (rpcsvc_register_portmap_enabled(nfs->rpcsvc)) { + if (nfs->register_portmap) { ret = rpcsvc_program_register_portmap (prog, prog->progport); if (ret == -1) { - gf_log (GF_NFS, GF_LOG_ERROR, "Program registration failed"); + gf_log (GF_NFS, GF_LOG_ERROR, + "Program %s registration failed", + prog->progname); goto err; } } + } ret = 0; @@ -159,22 +350,22 @@ nfs_add_all_initiators (struct nfs_state *nfs) /* Add the initializers for all versions. */ ret = nfs_add_initer (&nfs->versions, mnt3svc_init); if (ret == -1) { - gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol" - " initializer"); + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add " + "MOUNT3 protocol initializer"); goto ret; } ret = nfs_add_initer (&nfs->versions, mnt1svc_init); if (ret == -1) { - gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol" - " initializer"); + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add " + "MOUNT1 protocol initializer"); goto ret; } ret = nfs_add_initer (&nfs->versions, nfs3svc_init); if (ret == -1) { - gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol" - " initializer"); + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add " + "NFS3 protocol initializer"); goto ret; } @@ -187,11 +378,13 @@ nfs_add_all_initiators (struct nfs_state *nfs) } } - ret = nfs_add_initer (&nfs->versions, acl3svc_init); - if (ret == -1) { - gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol" - " initializer"); - goto ret; + if (nfs->enable_acl == _gf_true) { + ret = nfs_add_initer (&nfs->versions, acl3svc_init); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add " + "ACL protocol initializer"); + goto ret; + } } ret = 0; @@ -527,10 +720,11 @@ nfs_init_state (xlator_t *this) if (!this) return NULL; - if ((!this->children) || (!this->children->xlator)) { - gf_log (GF_NFS, GF_LOG_ERROR, "nfs must have at least one" - " child subvolume"); - return NULL; + if (!this->children) { + gf_log (GF_NFS, GF_LOG_INFO, + "NFS is manually disabled: Exiting"); + /* Nothing for nfs process to do, exit cleanly */ + kill (getpid (), SIGTERM); } nfs = GF_CALLOC (1, sizeof (*nfs), gf_nfs_mt_nfs_state); @@ -586,19 +780,17 @@ nfs_init_state (xlator_t *this) } nfs->enable_nlm = _gf_true; - if (!dict_get_str (this->options, "nfs.nlm", &optstr)) { - - ret = gf_string2boolean (optstr, &boolt); - if (ret < 0) { - gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse" - " bool string"); - goto free_foppool; - } + ret = dict_get_str_boolean (this->options, "nfs.nlm", _gf_true); + if (ret == _gf_false) { + gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually disabled"); + nfs->enable_nlm = _gf_false; + } - if (boolt == _gf_false) { - gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually disabled"); - nfs->enable_nlm = _gf_false; - } + nfs->enable_acl = _gf_true; + ret = dict_get_str_boolean (this->options, "nfs.acl", _gf_true); + if (ret == _gf_false) { + gf_log (GF_NFS, GF_LOG_INFO, "ACL is manually disabled"); + nfs->enable_acl = _gf_false; } nfs->enable_ino32 = 0; @@ -682,6 +874,15 @@ nfs_init_state (xlator_t *this) nfs->mount_udp = 1; } + nfs->rmtab = gf_strdup (NFS_DATADIR "/rmtab"); + if (dict_get(this->options, "nfs.mount-rmtab")) { + ret = dict_get_str (this->options, "nfs.mount-rmtab", &nfs->rmtab); + if (ret == -1) { + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse dict"); + goto free_foppool; + } + } + /* support both options rpc-auth.ports.insecure and * rpc-auth-allow-insecure for backward compatibility */ @@ -747,10 +948,10 @@ nfs_init_state (xlator_t *this) GF_OPTION_INIT (OPT_SERVER_GID_CACHE_TIMEOUT, nfs->server_aux_gids_max_age, uint32, free_foppool); - if (gid_cache_init(&nfs->gid_cache, nfs->server_aux_gids_max_age) < 0) { - gf_log(GF_NFS, GF_LOG_ERROR, "Failed to initialize group cache."); - goto free_foppool; - } + if (gid_cache_init(&nfs->gid_cache, nfs->server_aux_gids_max_age) < 0) { + gf_log(GF_NFS, GF_LOG_ERROR, "Failed to initialize group cache."); + goto free_foppool; + } if (stat("/sbin/rpc.statd", &stbuf) == -1) { gf_log (GF_NFS, GF_LOG_WARNING, "/sbin/rpc.statd not found. " @@ -765,6 +966,8 @@ nfs_init_state (xlator_t *this) goto free_foppool; } + nfs->register_portmap = rpcsvc_register_portmap_enabled (nfs->rpcsvc); + this->private = (void *)nfs; INIT_LIST_HEAD (&nfs->versions); nfs->generation = 1965; @@ -786,6 +989,243 @@ free_rpcsvc: return nfs; } +int +nfs_drc_init (xlator_t *this) +{ + int ret = -1; + rpcsvc_t *svc = NULL; + + svc = ((struct nfs_state *)(this->private))->rpcsvc; + if (!svc) + goto out; + + ret = rpcsvc_drc_init (svc, this->options); + + out: + return ret; +} + +int +nfs_reconfigure_state (xlator_t *this, dict_t *options) +{ + int ret = 0; + int keyindx = 0; + char *optstr = NULL; + gf_boolean_t optbool; + uint32_t optuint32; + struct nfs_state *nfs = NULL; + char *blacklist_keys[] = { + "nfs.port", + "nfs.transport-type", + "nfs.mem-factor", + NULL}; + + GF_VALIDATE_OR_GOTO (GF_NFS, this, out); + GF_VALIDATE_OR_GOTO (GF_NFS, this->private, out); + GF_VALIDATE_OR_GOTO (GF_NFS, options, out); + + nfs = (struct nfs_state *)this->private; + + /* Black listed options can't be reconfigured, they need + * NFS to be restarted. There are two cases 1. SET 2. UNSET. + * 1. SET */ + while (blacklist_keys[keyindx]) { + if (dict_get (options, blacklist_keys[keyindx])) { + gf_log (GF_NFS, GF_LOG_ERROR, + "Reconfiguring %s needs NFS restart", + blacklist_keys[keyindx]); + goto out; + } + keyindx ++; + } + + /* UNSET for nfs.mem-factor */ + if ((!dict_get (options, "nfs.mem-factor")) && + (nfs->memfactor != GF_NFS_DEFAULT_MEMFACTOR)) { + gf_log (GF_NFS, GF_LOG_INFO, + "Reconfiguring nfs.mem-factor needs NFS restart"); + goto out; + } + + /* UNSET for nfs.port */ + if ((!dict_get (options, "nfs.port")) && + (nfs->override_portnum)) { + gf_log (GF_NFS, GF_LOG_ERROR, + "Reconfiguring nfs.port needs NFS restart"); + goto out; + } + + /* reconfig nfs.mount-rmtab */ + optstr = NFS_DATADIR "/rmtab"; + if (dict_get (options, "nfs.mount-rmtab")) { + ret = dict_get_str (options, "nfs.mount-rmtab", &optstr); + if (ret < 0) { + gf_log (GF_NFS, GF_LOG_ERROR, "Failed to read " + "reconfigured option: nfs.mount-rmtab"); + goto out; + } + gf_path_strip_trailing_slashes (optstr); + } + if (strcmp (nfs->rmtab, optstr) != 0) { + mount_rewrite_rmtab (nfs->mstate, optstr); + gf_log (GF_NFS, GF_LOG_INFO, + "Reconfigured nfs.mount-rmtab path: %s", + nfs->rmtab); + } + + GF_OPTION_RECONF (OPT_SERVER_AUX_GIDS, optbool, + options, bool, out); + if (nfs->server_aux_gids != optbool) { + nfs->server_aux_gids = optbool; + gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured %s with value %d", + OPT_SERVER_AUX_GIDS, optbool); + } + + GF_OPTION_RECONF (OPT_SERVER_GID_CACHE_TIMEOUT, optuint32, + options, uint32, out); + if (nfs->server_aux_gids_max_age != optuint32) { + nfs->server_aux_gids_max_age = optuint32; + gid_cache_reconf (&nfs->gid_cache, optuint32); + gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured %s with value %d", + OPT_SERVER_GID_CACHE_TIMEOUT, optuint32); + } + + /* reconfig nfs.dynamic-volumes */ + ret = dict_get_str_boolean (options, "nfs.dynamic-volumes", + GF_NFS_DVM_OFF); + switch (ret) { + case GF_NFS_DVM_ON: + case GF_NFS_DVM_OFF: + optbool = ret; + break; + default: + optbool = GF_NFS_DVM_OFF; + break; + } + if (nfs->dynamicvolumes != optbool) { + nfs->dynamicvolumes = optbool; + gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured nfs.dynamic-volumes" + " with value %d", optbool); + } + + optbool = _gf_false; + if (dict_get (options, "nfs.enable-ino32")) { + ret = dict_get_str_boolean (options, "nfs.enable-ino32", + _gf_false); + if (ret < 0) { + gf_log (GF_NFS, GF_LOG_ERROR, + "Failed to read reconfigured option: " + "nfs.enable-ino32"); + goto out; + } + optbool = ret; + } + if (nfs->enable_ino32 != optbool) { + nfs->enable_ino32 = optbool; + gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured nfs.enable-ino32" + " with value %d", optbool); + } + + /* nfs.nlm is enabled by default */ + ret = dict_get_str_boolean (options, "nfs.nlm", _gf_true); + if (ret < 0) { + optbool = _gf_true; + } else { + optbool = ret; + } + if (nfs->enable_nlm != optbool) { + gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually %s", + (optbool ? "enabled":"disabled")); + nfs->enable_nlm = optbool; + nfs_reconfigure_nlm4 (this); + } + + /* nfs.acl is enabled by default */ + ret = dict_get_str_boolean (options, "nfs.acl", _gf_true); + if (ret < 0) { + optbool = _gf_true; + } else { + optbool = ret; + } + if (nfs->enable_acl != optbool) { + gf_log (GF_NFS, GF_LOG_INFO, "ACL is manually %s", + (optbool ? "enabled":"disabled")); + nfs->enable_acl = optbool; + nfs_reconfigure_acl3 (this); + } + + ret = 0; +out: + return ret; +} + + +/* + * reconfigure() for NFS server xlator. + */ +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = 0; + struct nfs_state *nfs = NULL; + gf_boolean_t regpmap = _gf_true; + + if ((!this) || (!this->private) || (!options)) + return (-1); + + nfs = (struct nfs_state *)this->private; + + /* Reconfigure nfs options */ + ret = nfs_reconfigure_state(this, options); + if (ret) { + gf_log (GF_NFS, GF_LOG_ERROR, + "nfs reconfigure state failed"); + return (-1); + } + + /* Reconfigure nfs3 options */ + ret = nfs3_reconfigure_state(this, options); + if (ret) { + gf_log (GF_NFS, GF_LOG_ERROR, + "nfs3 reconfigure state failed"); + return (-1); + } + + /* Reconfigure mount options */ + ret = mount_reconfigure_state(this, options); + if (ret) { + gf_log (GF_NFS, GF_LOG_ERROR, + "mount reconfigure state failed"); + return (-1); + } + + /* Reconfigure rpc layer */ + ret = rpcsvc_reconfigure_options (nfs->rpcsvc, options); + if (ret) { + gf_log (GF_NFS, GF_LOG_ERROR, + "rpcsvc reconfigure options failed"); + return (-1); + } + regpmap = rpcsvc_register_portmap_enabled(nfs->rpcsvc); + if (nfs->register_portmap != regpmap) { + nfs->register_portmap = regpmap; + if (regpmap) { + (void) nfs_program_register_portmap_all (nfs); + } else { + (void) nfs_program_unregister_portmap_all (nfs); + } + } + + /* Reconfigure drc */ + ret = rpcsvc_drc_reconfigure (nfs->rpcsvc, options); + if (ret) { + gf_log (GF_NFS, GF_LOG_ERROR, + "rpcsvc DRC reconfigure failed"); + return (-1); + } + + return (0); +} int init (xlator_t *this) { @@ -841,7 +1281,9 @@ init (xlator_t *this) { goto err; } - gf_log (GF_NFS, GF_LOG_INFO, "NFS service started"); + ret = nfs_drc_init (this); + if (ret == 0) + gf_log (GF_NFS, GF_LOG_INFO, "NFS service started"); err: return ret; @@ -996,7 +1438,22 @@ nlm_priv (xlator_t *this); int32_t nfs_priv (xlator_t *this) { - return nlm_priv (this); + int32_t ret = -1; + + /* DRC needs the global drc structure, xl is of no use to it. */ + ret = rpcsvc_drc_priv (((struct nfs_state *)(this->private))->rpcsvc->drc); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Statedump of DRC failed"); + goto out; + } + + ret = nlm_priv (this); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "Statedump of NLM failed"); + goto out; + } + out: + return ret; } @@ -1019,29 +1476,53 @@ struct xlator_dumpops dumpops = { struct volume_options options[] = { { .key = {"nfs3.read-size"}, .type = GF_OPTION_TYPE_SIZET, - .description = "Size in which the client should issue read requests" - " to the Gluster NFSv3 server. Must be a multiple of" - " 4KB." + .min = GF_NFS3_RTMIN, + .max = GF_NFS3_RTMAX, + .default_value = TOSTRING(GF_NFS3_RTPREF), + .description = "Size in which the client should issue read requests " + "to the Gluster NFSv3 server. Must be a multiple of " + "4KB (4096). Min and Max supported values are 4KB " + "(4096) and 1MB (1048576) respectively. If the " + "specified value is within the supported range but " + "not a multiple of 4096, it is rounded up to the " + "nearest multiple of 4096." }, { .key = {"nfs3.write-size"}, .type = GF_OPTION_TYPE_SIZET, - .description = "Size in which the client should issue write requests" - " to the Gluster NFSv3 server. Must be a multiple of" - " 4KB." + .min = GF_NFS3_WTMIN, + .max = GF_NFS3_WTMAX, + .default_value = TOSTRING(GF_NFS3_WTPREF), + .description = "Size in which the client should issue write requests " + "to the Gluster NFSv3 server. Must be a multiple of " + "1KB (1024). Min and Max supported values are " + "4KB (4096) and 1MB(1048576) respectively. If the " + "specified value is within the supported range but " + "not a multiple of 4096, it is rounded up to the " + "nearest multiple of 4096." }, { .key = {"nfs3.readdir-size"}, .type = GF_OPTION_TYPE_SIZET, + .min = GF_NFS3_DTMIN, + .max = GF_NFS3_DTMAX, + .default_value = TOSTRING(GF_NFS3_DTPREF), .description = "Size in which the client should issue directory " - " reading requests." + "reading requests to the Gluster NFSv3 server. Must " + "be a multiple of 1KB (1024). Min and Max supported " + "values are 4KB (4096) and 1MB (1048576) respectively." + "If the specified value is within the supported range " + "but not a multiple of 4096, it is rounded up to the " + "nearest multiple of 4096." }, { .key = {"nfs3.*.volume-access"}, .type = GF_OPTION_TYPE_STR, .value = {"read-only", "read-write"}, + .default_value = "read-write", .description = "Type of access desired for this subvolume: " " read-only, read-write(default)" }, { .key = {"nfs3.*.trusted-write"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", .description = "On an UNSTABLE write from client, return STABLE flag" " to force client to not send a COMMIT request. In " "some environments, combined with a replicated " @@ -1056,6 +1537,7 @@ struct volume_options options[] = { }, { .key = {"nfs3.*.trusted-sync"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", .description = "All writes and COMMIT requests are treated as async." " This implies that no write requests are guaranteed" " to be on server disks when the write reply is " @@ -1065,25 +1547,40 @@ struct volume_options options[] = { }, { .key = {"nfs3.*.export-dir"}, .type = GF_OPTION_TYPE_PATH, + .default_value = "", .description = "By default, all subvolumes of nfs are exported as " "individual exports. There are cases where a " "subdirectory or subdirectories in the volume need to " "be exported separately. This option can also be used " "in conjunction with nfs3.export-volumes option to " "restrict exports only to the subdirectories specified" - " through this option. Must be an absolute path." + " through this option. Must be an absolute path. Along" + " with path allowed list of IPs/hostname can be " + "associated with each subdirectory. If provided " + "connection will allowed only from these IPs. By " + "default connections from all IPs are allowed. " + "Format: <dir>[(hostspec[|hostspec|...])][,...]. Where" + " hostspec can be an IP address, hostname or an IP " + "range in CIDR notation. " + "e.g. /foo(192.168.1.0/24|host1|10.1.1.8),/host2." + " NOTE: Care must be taken while configuring this " + "option as invalid entries and/or unreachable DNS " + "servers can introduce unwanted delay in all the mount" + " calls." }, { .key = {"nfs3.export-dirs"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", .description = "By default, all subvolumes of nfs are exported as " "individual exports. There are cases where a " "subdirectory or subdirectories in the volume need to " "be exported separately. Enabling this option allows " "any directory on a volumes to be exported separately." - " Directory exports are enabled by default." + "Directory exports are enabled by default." }, { .key = {"nfs3.export-volumes"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", .description = "Enable or disable exporting whole volumes, instead " "if used in conjunction with nfs3.export-dir, can " "allow setting up only subdirectories as exports. On " @@ -1091,6 +1588,7 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.auth-unix"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", .description = "Disable or enable the AUTH_UNIX authentication type." "Must always be enabled for better interoperability." "However, can be disabled if needed. Enabled by" @@ -1098,12 +1596,14 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.auth-null"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", .description = "Disable or enable the AUTH_NULL authentication type." "Must always be enabled. This option is here only to" " avoid unrecognized option warnings" }, { .key = {"rpc-auth.auth-unix.*"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", .description = "Disable or enable the AUTH_UNIX authentication type " "for a particular exported volume overriding defaults" " and general setting for AUTH_UNIX scheme. Must " @@ -1113,6 +1613,7 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.auth-unix.*.allow"}, .type = GF_OPTION_TYPE_STR, + .default_value = "on", .description = "Disable or enable the AUTH_UNIX authentication type " "for a particular exported volume overriding defaults" " and general setting for AUTH_UNIX scheme. Must " @@ -1122,6 +1623,7 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.auth-null.*"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", .description = "Disable or enable the AUTH_NULL authentication type " "for a particular exported volume overriding defaults" " and general setting for AUTH_NULL. Must always be " @@ -1130,6 +1632,7 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.addr.allow"}, .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST, + .default_value = "all", .description = "Allow a comma separated list of addresses and/or" " hostnames to connect to the server. By default, all" " connections are allowed. This allows users to " @@ -1137,6 +1640,7 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.addr.reject"}, .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST, + .default_value = "none", .description = "Reject a comma separated list of addresses and/or" " hostnames from connecting to the server. By default," " all connections are allowed. This allows users to" @@ -1144,6 +1648,7 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.addr.*.allow"}, .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST, + .default_value = "all", .description = "Allow a comma separated list of addresses and/or" " hostnames to connect to the server. By default, all" " connections are allowed. This allows users to " @@ -1151,6 +1656,7 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.addr.*.reject"}, .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST, + .default_value = "none", .description = "Reject a comma separated list of addresses and/or" " hostnames from connecting to the server. By default," " all connections are allowed. This allows users to" @@ -1158,6 +1664,7 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.ports.insecure"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", .description = "Allow client connections from unprivileged ports. By " "default only privileged ports are allowed. This is a" "global setting in case insecure ports are to be " @@ -1165,31 +1672,35 @@ struct volume_options options[] = { }, { .key = {"rpc-auth.ports.*.insecure"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", .description = "Allow client connections from unprivileged ports. By " "default only privileged ports are allowed. Use this" " option to enable or disable insecure ports for " - "a specific subvolume and to override the global setting " - " set by the previous option." + "a specific subvolume and to override the global " + "setting set by the previous option." }, { .key = {"rpc-auth.addr.namelookup"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", .description = "Users have the option of turning on name lookup for" " incoming client connections using this option. Use this " "option to turn on name lookups during address-based " "authentication. Turning this on will enable you to" " use hostnames in nfs.rpc-auth-* filters. In some " "setups, the name server can take too long to reply to DNS " - "queries resulting in timeouts of mount requests. By default, " - " name lookup is off" + "queries resulting in timeouts of mount requests. By " + "default, name lookup is off" }, { .key = {"nfs.dynamic-volumes"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", .description = "Internal option set to tell gnfs to use a different" " scheme for encoding file handles when DVM is being" " used." }, { .key = {"nfs3.*.volume-id"}, .type = GF_OPTION_TYPE_STR, + .default_value = "", .description = "When nfs.dynamic-volumes is set, gnfs expects every " "subvolume to have this option set for it, so that " "gnfs can use this option to identify the volume. " @@ -1198,22 +1709,34 @@ struct volume_options options[] = { }, { .key = {"nfs.enable-ino32"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", .description = "For nfs clients or apps that do not support 64-bit " "inode numbers, use this option to make NFS return " - "32-bit inode numbers instead. Disabled by default, so " - "NFS returns 64-bit inode numbers." + "32-bit inode numbers instead. Disabled by default, so" + " NFS returns 64-bit inode numbers." }, { .key = {"rpc.register-with-portmap"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", .description = "For systems that need to run multiple nfs servers, " "only one registration is possible with " "portmap service. Use this option to turn off portmap " "registration for Gluster NFS. On by default" }, + { .key = {"rpc.outstanding-rpc-limit"}, + .type = GF_OPTION_TYPE_INT, + .min = RPCSVC_MIN_OUTSTANDING_RPC_LIMIT, + .max = RPCSVC_MAX_OUTSTANDING_RPC_LIMIT, + .default_value = TOSTRING(RPCSVC_DEFAULT_OUTSTANDING_RPC_LIMIT), + .description = "Parameter to throttle the number of incoming RPC " + "requests from a client. 0 means no limit (can " + "potentially run out of memory)" + }, { .key = {"nfs.port"}, .type = GF_OPTION_TYPE_INT, .min = 1, .max = 0xffff, + .default_value = TOSTRING(GF_NFS3_PORT), .description = "Use this option on systems that need Gluster NFS to " "be associated with a non-default port number." }, @@ -1221,6 +1744,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .min = 1, .max = 1024, + .default_value = TOSTRING(GF_NFS_DEFAULT_MEMFACTOR), .description = "Use this option to make NFS be faster on systems by " "using more memory. This option specifies a multiple " "that determines the total amount of memory used. " @@ -1231,12 +1755,14 @@ struct volume_options options[] = { }, { .key = {"nfs.*.disable"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", .description = "This option is used to start or stop NFS server" "for individual volume." }, { .key = {"nfs.nlm"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", .description = "This option, if set to 'off', disables NLM server " "by not registering the service with the portmapper." " Set it to 'on' to re-enable it. Default value: 'on'" @@ -1244,11 +1770,21 @@ struct volume_options options[] = { { .key = {"nfs.mount-udp"}, .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", .description = "set the option to 'on' to enable mountd on UDP. " "Required for some Solaris and AIX NFS clients. " "The need for enabling this option often depends " "on the usage of NLM." }, + { .key = {"nfs.mount-rmtab"}, + .type = GF_OPTION_TYPE_PATH, + .default_value = DATADIR "/rmtab", + .description = "Set the location of the cache file that is used to " + "list all the NFS-clients that have connected " + "through the MOUNT protocol. If this is on shared " + "storage, all GlusterFS servers will update and " + "output (with 'showmount') the same list." + }, { .key = {OPT_SERVER_AUX_GIDS}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", @@ -1267,7 +1803,23 @@ struct volume_options options[] = { .description = "Number of seconds to cache auxiliary-GID data, when " OPT_SERVER_AUX_GIDS " is set." }, - + { .key = {"nfs.acl"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option is used to control ACL support for NFS." + }, + { .key = {"nfs.drc"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "on", + .description = "Enable Duplicate Request Cache in gNFS server to " + "improve correctness of non-idempotent operations like " + "write, delete, link, et al" + }, + { .key = {"nfs.drc-size"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0x20000", + .description = "Sets the number of non-idempotent " + "requests to cache in drc" + }, { .key = {NULL} }, }; - diff --git a/xlators/nfs/server/src/nfs.h b/xlators/nfs/server/src/nfs.h index 7d5163dfe..00c7f8046 100644 --- a/xlators/nfs/server/src/nfs.h +++ b/xlators/nfs/server/src/nfs.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef __NFS_H__ @@ -45,7 +36,7 @@ #define GF_NFS_MAX_MEMFACTOR 30 #define GF_NFS_DVM_ON 1 -#define GF_NFS_DVM_OFF 2 +#define GF_NFS_DVM_OFF 0 /* This corresponds to the max 16 number of group IDs that are sent through an * RPC request. Since NFS is the only one going to set this, we can be safe @@ -86,12 +77,15 @@ struct nfs_state { unsigned int override_portnum; int allow_insecure; int enable_nlm; + int enable_acl; int mount_udp; + char *rmtab; struct rpc_clnt *rpc_clnt; gf_boolean_t server_aux_gids; uint32_t server_aux_gids_max_age; gid_cache_t gid_cache; uint32_t generation; + gf_boolean_t register_portmap; }; struct nfs_inode_ctx { diff --git a/xlators/nfs/server/src/nfs3-fh.c b/xlators/nfs/server/src/nfs3-fh.c index c4f59a622..e199c56dc 100644 --- a/xlators/nfs/server/src/nfs3-fh.c +++ b/xlators/nfs/server/src/nfs3-fh.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -120,17 +111,17 @@ nfs3_fh_is_root_fh (struct nfs3_fh *fh) void -nfs3_fh_to_str (struct nfs3_fh *fh, char *str) +nfs3_fh_to_str (struct nfs3_fh *fh, char *str, size_t len) { - char gfid[512]; - char exportid[512]; + char gfid[GF_UUID_BUF_SIZE]; + char exportid[GF_UUID_BUF_SIZE]; if ((!fh) || (!str)) return; - sprintf (str, "FH: exportid %s, gfid %s", - uuid_utoa_r (fh->exportid, exportid), - uuid_utoa_r (fh->gfid, gfid)); + snprintf (str, len, "FH: exportid %s, gfid %s", + uuid_utoa_r (fh->exportid, exportid), + uuid_utoa_r (fh->gfid, gfid)); } void diff --git a/xlators/nfs/server/src/nfs3-fh.h b/xlators/nfs/server/src/nfs3-fh.h index 23957d977..1049cdc96 100644 --- a/xlators/nfs/server/src/nfs3-fh.h +++ b/xlators/nfs/server/src/nfs3-fh.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _NFS_FH_H_ @@ -65,6 +56,11 @@ struct nfs3_fh { /* File/dir gfid. */ uuid_t gfid; + /* This structure must be exactly NFS3_FHSIZE (64) bytes long. + Having the structure shorter results in buffer overflows + during XDR decoding. + */ + unsigned char padding[NFS3_FHSIZE - GF_NFSFH_STATIC_SIZE]; } __attribute__((__packed__)); #define GF_NFS3FH_STATIC_INITIALIZER {{0},} @@ -92,7 +88,7 @@ extern void nfs3_log_fh (struct nfs3_fh *fh); extern void -nfs3_fh_to_str (struct nfs3_fh *fh, char *str); +nfs3_fh_to_str (struct nfs3_fh *fh, char *str, size_t len); extern int nfs3_fh_build_parent_fh (struct nfs3_fh *child, struct iatt *newstat, diff --git a/xlators/nfs/server/src/nfs3-helpers.c b/xlators/nfs/server/src/nfs3-helpers.c index f7b1bb0cd..9059fc341 100644 --- a/xlators/nfs/server/src/nfs3-helpers.c +++ b/xlators/nfs/server/src/nfs3-helpers.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -265,6 +256,20 @@ nfs3_errno_to_nfsstat3 (int errnum) return stat; } +/* + * Special case: If op_ret is -1, it's very unusual op_errno being + * 0 which means something came wrong from upper layer(s). If it + * happens by any means, then set NFS3 status to NFS3ERR_SERVERFAULT. + */ +inline nfsstat3 +nfs3_cbk_errno_status (int32_t op_ret, int32_t op_errno) +{ + if ((op_ret == -1) && (op_errno == 0)) { + return NFS3ERR_SERVERFAULT; + } + + return nfs3_errno_to_nfsstat3 (op_errno); +} void nfs3_fill_lookup3res_error (lookup3res *res, nfsstat3 stat, @@ -284,6 +289,9 @@ nfs3_stat_to_fattr3 (struct iatt *buf) { fattr3 fa = {0, }; + if (buf == NULL) + goto out; + if (IA_ISDIR (buf->ia_type)) fa.type = NF3DIR; else if (IA_ISREG (buf->ia_type)) @@ -353,6 +361,7 @@ nfs3_stat_to_fattr3 (struct iatt *buf) fa.mtime.seconds = buf->ia_mtime; fa.mtime.nseconds = buf->ia_mtime_nsec; +out: return fa; } @@ -496,7 +505,7 @@ nfs3_fill_fsinfo3res (struct nfs3_state *nfs3, fsinfo3res *res, resok.wtpref = nfs3->writesize; resok.wtmult = GF_NFS3_WTMULT; resok.dtpref = nfs3->readdirsize; - resok.maxfilesize = GF_NFS3_MAXFILE; + resok.maxfilesize = GF_NFS3_MAXFILESIZE; resok.time_delta = tdelta; resok.properties = GF_NFS3_FS_PROP; @@ -1607,13 +1616,14 @@ err: void nfs3_stat_to_errstr (uint32_t xid, char *op, nfsstat3 stat, int pstat, - char *errstr) + char *errstr, size_t len) { if ((!op) || (!errstr)) return; - sprintf (errstr, "XID: %x, %s: NFS: %d(%s), POSIX: %d(%s)", xid, op, - stat, nfsstat3_strerror (stat), pstat, strerror (pstat)); + snprintf (errstr, len, "XID: %x, %s: NFS: %d(%s), POSIX: %d(%s)", + xid, op,stat, nfsstat3_strerror (stat), pstat, + strerror (pstat)); } void @@ -1621,10 +1631,10 @@ nfs3_log_common_call (uint32_t xid, char *op, struct nfs3_fh *fh) { char fhstr[1024]; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; - nfs3_fh_to_str (fh, fhstr); + nfs3_fh_to_str (fh, fhstr, sizeof (fhstr)); gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s", xid, op, fhstr); } @@ -1636,9 +1646,9 @@ nfs3_log_fh_entry_call (uint32_t xid, char *op, struct nfs3_fh *fh, { char fhstr[1024]; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; - nfs3_fh_to_str (fh, fhstr); + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; + nfs3_fh_to_str (fh, fhstr, sizeof (fhstr)); gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s, name: %s", xid, op, fhstr, name); } @@ -1651,10 +1661,10 @@ nfs3_log_rename_call (uint32_t xid, struct nfs3_fh *src, char *sname, char sfhstr[1024]; char dfhstr[1024]; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; - nfs3_fh_to_str (src, sfhstr); - nfs3_fh_to_str (dst, dfhstr); + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; + nfs3_fh_to_str (src, sfhstr, sizeof (sfhstr)); + nfs3_fh_to_str (dst, dfhstr, sizeof (dfhstr)); gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, RENAME: args: Src: %s, " "name: %s, Dst: %s, name: %s", xid, sfhstr, sname, dfhstr, dname); @@ -1672,9 +1682,9 @@ nfs3_log_create_call (uint32_t xid, struct nfs3_fh *fh, char *name, char unchkd[] = "UNCHECKED"; char guarded[] = "GUARDED"; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; - nfs3_fh_to_str (fh, fhstr); + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; + nfs3_fh_to_str (fh, fhstr, sizeof (fhstr)); if (mode == EXCLUSIVE) modestr = exclmode; else if (mode == GUARDED) @@ -1697,9 +1707,9 @@ nfs3_log_mknod_call (uint32_t xid, struct nfs3_fh *fh, char *name, int type) char sock[] = "SOCK"; char fifo[] = "FIFO"; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; - nfs3_fh_to_str (fh, fhstr); + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; + nfs3_fh_to_str (fh, fhstr, sizeof (fhstr)); if (type == NF3CHR) modestr = chr; else if (type == NF3BLK) @@ -1720,9 +1730,9 @@ nfs3_log_symlink_call (uint32_t xid, struct nfs3_fh *fh, char *name, char *tgt) { char fhstr[1024]; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; - nfs3_fh_to_str (fh, fhstr); + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; + nfs3_fh_to_str (fh, fhstr, sizeof (fhstr)); gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, SYMLINK: args: %s, name: %s," " target: %s", xid, fhstr, name, tgt); } @@ -1735,10 +1745,10 @@ nfs3_log_link_call (uint32_t xid, struct nfs3_fh *fh, char *name, char dfhstr[1024]; char tfhstr[1024]; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; - nfs3_fh_to_str (fh, dfhstr); - nfs3_fh_to_str (tgt, tfhstr); + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; + nfs3_fh_to_str (fh, dfhstr, sizeof (dfhstr)); + nfs3_fh_to_str (tgt, tfhstr, sizeof (tfhstr)); gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, LINK: args: %s, name: %s," " target: %s", xid, dfhstr, name, tfhstr); } @@ -1750,9 +1760,9 @@ nfs3_log_rw_call (uint32_t xid, char *op, struct nfs3_fh *fh, offset3 offt, { char fhstr[1024]; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; - nfs3_fh_to_str (fh, fhstr); + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; + nfs3_fh_to_str (fh, fhstr, sizeof (fhstr)); if (stablewrite == -1) gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s, offset:" " %"PRIu64", count: %"PRIu32, xid, op, fhstr, offt, @@ -3383,11 +3393,11 @@ void nfs3_log_common_res (uint32_t xid, int op, nfsstat3 stat, int pstat) { char errstr[1024]; - int ll = nfs3_loglevel (op, stat); + int ll = nfs3_loglevel (op, stat); - if (THIS->ctx->log.loglevel < ll) - return; - nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr); + if (THIS->ctx->log.loglevel < ll) + return; + nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr, sizeof (errstr)); gf_log (GF_NFS3, ll, "%s", errstr); } @@ -3395,14 +3405,14 @@ void nfs3_log_readlink_res (uint32_t xid, nfsstat3 stat, int pstat, char *linkpath) { char errstr[1024]; - int ll = nfs3_loglevel (NFS3_READLINK, stat); + int ll = nfs3_loglevel (NFS3_READLINK, stat); - if (THIS->ctx->log.loglevel < ll) - return; + if (THIS->ctx->log.loglevel < ll) + return; - nfs3_stat_to_errstr (xid, "READLINK", stat, pstat, errstr); + nfs3_stat_to_errstr (xid, "READLINK", stat, pstat, errstr, sizeof (errstr)); gf_log (GF_NFS3, ll, "%s, target: %s", - errstr, linkpath); + errstr, linkpath); } @@ -3411,12 +3421,12 @@ nfs3_log_read_res (uint32_t xid, nfsstat3 stat, int pstat, count3 count, int is_eof, struct iovec *vec, int32_t veccount) { char errstr[1024]; - int ll = GF_LOG_DEBUG; + int ll = GF_LOG_DEBUG; ll = nfs3_loglevel (NFS3_READ, stat); - if (THIS->ctx->log.loglevel < ll) - return; - nfs3_stat_to_errstr (xid, "READ", stat, pstat, errstr); + if (THIS->ctx->log.loglevel < ll) + return; + nfs3_stat_to_errstr (xid, "READ", stat, pstat, errstr, sizeof (errstr)); if (vec) gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", is_eof:" " %d, vector: count: %d, len: %zd", errstr, count, @@ -3432,12 +3442,12 @@ nfs3_log_write_res (uint32_t xid, nfsstat3 stat, int pstat, count3 count, int stable, uint64_t wverf) { char errstr[1024]; - int ll = nfs3_loglevel (NFS3_WRITE, stat); + int ll = nfs3_loglevel (NFS3_WRITE, stat); - if (THIS->ctx->log.loglevel < ll) - return; + if (THIS->ctx->log.loglevel < ll) + return; - nfs3_stat_to_errstr (xid, "WRITE", stat, pstat, errstr); + nfs3_stat_to_errstr (xid, "WRITE", stat, pstat, errstr, sizeof (errstr)); gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", %s,wverf: %"PRIu64 , errstr, count, (stable == UNSTABLE)?"UNSTABLE":"STABLE", wverf); @@ -3450,12 +3460,12 @@ nfs3_log_newfh_res (uint32_t xid, int op, nfsstat3 stat, int pstat, { char errstr[1024]; char fhstr[1024]; - int ll = nfs3_loglevel (op, stat); + int ll = nfs3_loglevel (op, stat); - if (THIS->ctx->log.loglevel < ll) - return; - nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr); - nfs3_fh_to_str (newfh, fhstr); + if (THIS->ctx->log.loglevel < ll) + return; + nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr, sizeof (errstr)); + nfs3_fh_to_str (newfh, fhstr, sizeof (fhstr)); gf_log (GF_NFS3, nfs3_loglevel (op, stat), "%s, %s", errstr, fhstr); } @@ -3466,11 +3476,11 @@ nfs3_log_readdir_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf, count3 count, int is_eof) { char errstr[1024]; - int ll = nfs3_loglevel (NFS3_READDIR, stat); + int ll = nfs3_loglevel (NFS3_READDIR, stat); - if (THIS->ctx->log.loglevel < ll) - return; - nfs3_stat_to_errstr (xid, "READDIR", stat, pstat, errstr); + if (THIS->ctx->log.loglevel < ll) + return; + nfs3_stat_to_errstr (xid, "READDIR", stat, pstat, errstr, sizeof (errstr)); gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", cverf: %"PRIu64 ", is_eof: %d", errstr, count, cverf, is_eof); } @@ -3483,9 +3493,9 @@ nfs3_log_readdirp_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf, char errstr[1024]; int ll = nfs3_loglevel (NFS3_READDIRP, stat); - if (THIS->ctx->log.loglevel < ll) - return; - nfs3_stat_to_errstr (xid, "READDIRPLUS", stat, pstat, errstr); + if (THIS->ctx->log.loglevel < ll) + return; + nfs3_stat_to_errstr (xid, "READDIRPLUS", stat, pstat, errstr, sizeof (errstr)); gf_log (GF_NFS3, ll, "%s, dircount: %"PRIu32", maxcount: %" PRIu32", cverf: %"PRIu64", is_eof: %d", errstr, dircount, maxcount, cverf, is_eof); @@ -3498,9 +3508,9 @@ nfs3_log_commit_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t wverf) char errstr[1024]; int ll = nfs3_loglevel (NFS3_COMMIT, stat); - if (THIS->ctx->log.loglevel < ll) - return; - nfs3_stat_to_errstr (xid, "COMMIT", stat, pstat, errstr); + if (THIS->ctx->log.loglevel < ll) + return; + nfs3_stat_to_errstr (xid, "COMMIT", stat, pstat, errstr, sizeof (errstr)); gf_log (GF_NFS3, ll, "%s, wverf: %"PRIu64, errstr, wverf); } @@ -3511,10 +3521,10 @@ nfs3_log_readdir_call (uint32_t xid, struct nfs3_fh *fh, count3 dircount, { char fhstr[1024]; - if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) - return; + if (THIS->ctx->log.loglevel < GF_LOG_DEBUG) + return; - nfs3_fh_to_str (fh, fhstr); + nfs3_fh_to_str (fh, fhstr, sizeof (fhstr)); if (maxcount == 0) gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, READDIR: args: %s," diff --git a/xlators/nfs/server/src/nfs3-helpers.h b/xlators/nfs/server/src/nfs3-helpers.h index cc96051e1..4de1d5623 100644 --- a/xlators/nfs/server/src/nfs3-helpers.h +++ b/xlators/nfs/server/src/nfs3-helpers.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _NFS3_HELPER_H_ @@ -44,6 +35,9 @@ nfs3_extract_lookup_name (lookup3args *args); extern nfsstat3 nfs3_errno_to_nfsstat3 (int errnum); +extern nfsstat3 +nfs3_cbk_errno_status (int32_t, int32_t); + extern void nfs3_fill_lookup3res (lookup3res *res, nfsstat3 stat, struct nfs3_fh *newfh, struct iatt *stbuf, struct iatt *postparent, diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c index d119055eb..f914c3193 100644 --- a/xlators/nfs/server/src/nfs3.c +++ b/xlators/nfs/server/src/nfs3.c @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -226,20 +217,22 @@ out: uuid_unparse (handle->exportid, exportid); \ uuid_unparse (handle->gfid, gfid); \ trans = rpcsvc_request_transport (req); \ - gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to map " \ - "FH to vol: client=%s, exportid=%s, gfid=%s",\ - trans->peerinfo.identifier, exportid, \ - gfid); \ - gf_log (GF_NFS3, GF_LOG_ERROR, \ - "Stale nfs client %s must be trying to "\ - "connect to a deleted volume, please " \ - "unmount it.", trans->peerinfo.identifier);\ + GF_LOG_OCCASIONALLY (nfs3state->occ_logger, \ + GF_NFS3, GF_LOG_ERROR, "Failed to map " \ + "FH to vol: client=%s, exportid=%s, " \ + "gfid=%s", trans->peerinfo.identifier, \ + exportid, gfid); \ + GF_LOG_OCCASIONALLY (nfs3state->occ_logger, \ + GF_NFS3, GF_LOG_ERROR, "Stale nfs " \ + "client %s must be trying to connect to"\ + " a deleted volume, please unmount it.",\ + trans->peerinfo.identifier); \ status = NFS3ERR_STALE; \ goto label; \ } else { \ - gf_log (GF_NFS3, GF_LOG_TRACE, "FH to Volume: %s"\ - ,volume->name); \ - rpcsvc_request_set_private (req, volume); \ + gf_log (GF_NFS3, GF_LOG_TRACE, "FH to Volume:" \ + "%s", volume->name); \ + rpcsvc_request_set_private (req, volume); \ } \ } while (0); \ @@ -263,8 +256,10 @@ out: xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \ &cst->resolvefh); \ uuid_unparse (cst->resolvefh.gfid, gfid); \ - sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\ - xlatorp ? xlatorp->name : "ERR", gfid); \ + snprintf (buf, sizeof (buf), "(%s) %s : %s", \ + trans->peerinfo.identifier, \ + xlatorp ? xlatorp->name : "ERR", \ + gfid ); \ gf_log (GF_NFS3, GF_LOG_ERROR, "%s: %s", \ strerror(cst->resolve_errno), buf); \ nfstat = nfs3_errno_to_nfsstat3 (cst->resolve_errno);\ @@ -283,8 +278,10 @@ out: xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \ &cst->resolvefh); \ uuid_unparse (cst->resolvefh.gfid, gfid); \ - sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\ - xlatorp ? xlatorp->name : "ERR", gfid); \ + snprintf (buf, sizeof (buf), "(%s) %s : %s", \ + trans->peerinfo.identifier, \ + xlatorp ? xlatorp->name : "ERR", \ + gfid); \ gf_log (GF_NFS3, GF_LOG_ERROR, "%s: %s", \ strerror(cst->resolve_errno), buf); \ nfstat = nfs3_errno_to_nfsstat3 (cs->resolve_errno);\ @@ -552,23 +549,18 @@ nfs3svc_submit_reply (rpcsvc_request_t *req, void *arg, nfs3_serializer sfunc) iobref = iobref_new (); if (!iobref) { - iobuf_unref (iob); gf_log (GF_NFS3, GF_LOG_ERROR, "failed on iobref_new()"); goto ret; } - iobref_add (iobref, iob); + ret = iobref_add (iobref, iob); + if (ret) { + gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to add iob to iobref"); + goto ret; + } /* Then, submit the message for transmission. */ ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref); - - /* Now that we've done our job of handing the message to the RPC layer - * we can safely unref the iob in the hope that RPC layer must have - * ref'ed the iob on receiving into the txlist. - */ - iobuf_unref (iob); - iobref_unref (iobref); - if (ret == -1) { gf_log (GF_NFS3, GF_LOG_ERROR, "Reply submission failed"); goto ret; @@ -576,6 +568,14 @@ nfs3svc_submit_reply (rpcsvc_request_t *req, void *arg, nfs3_serializer sfunc) ret = 0; ret: + /* Now that we've done our job of handing the message to the RPC layer + * we can safely unref the iob in the hope that RPC layer must have + * ref'ed the iob on receiving into the txlist. + */ + if (NULL != iob) + iobuf_unref (iob); + if (NULL != iobref) + iobref_unref (iobref); return ret; } @@ -607,19 +607,14 @@ nfs3svc_submit_vector_reply (rpcsvc_request_t *req, void *arg, new_iobref = 1; } - iobref_add (iobref, iob); + ret = iobref_add (iobref, iob); + if (ret) { + gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to add iob to iobref"); + goto ret; + } /* Then, submit the message for transmission. */ ret = rpcsvc_submit_message (req, &outmsg, 1, payload, vcount, iobref); - - /* Now that we've done our job of handing the message to the RPC layer - * we can safely unref the iob in the hope that RPC layer must have - * ref'ed the iob on receiving into the txlist. - */ - iobuf_unref (iob); - if (new_iobref) - iobref_unref (iobref); - if (ret == -1) { gf_log (GF_NFS3, GF_LOG_ERROR, "Reply submission failed"); goto ret; @@ -627,6 +622,14 @@ nfs3svc_submit_vector_reply (rpcsvc_request_t *req, void *arg, ret = 0; ret: + /* Now that we've done our job of handing the message to the RPC layer + * we can safely unref the iob in the hope that RPC layer must have + * ref'ed the iob on receiving into the txlist. + */ + if (NULL != iob) + iobuf_unref (iob); + if (new_iobref) + iobref_unref (iobref); return ret; } @@ -701,7 +704,7 @@ nfs3svc_getattr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - status = nfs3_errno_to_nfsstat3 (op_errno); + status = nfs3_cbk_errno_status (op_ret, op_errno); } else { nfs_fix_generation(this,inode); @@ -731,7 +734,7 @@ nfs3svc_getattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - status = nfs3_errno_to_nfsstat3 (op_errno); + status = nfs3_cbk_errno_status (op_ret, op_errno); } nfs3_log_common_res (rpcsvc_request_xid (cs->req), NFS3_GETATTR, @@ -915,7 +918,7 @@ nfs3svc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -955,7 +958,7 @@ nfs3svc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -1011,7 +1014,7 @@ nfs3svc_setattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -1225,7 +1228,7 @@ nfs3svc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (op_errno == ENOENT ? GF_LOG_TRACE : GF_LOG_WARNING), "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - status = nfs3_errno_to_nfsstat3 (op_errno); + status = nfs3_cbk_errno_status (op_ret, op_errno); goto xmit_res; } @@ -1269,7 +1272,7 @@ nfs3svc_lookup_parentdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - status = nfs3_errno_to_nfsstat3 (op_errno); + status = nfs3_cbk_errno_status (op_ret, op_errno); goto xmit_res; } @@ -1311,7 +1314,11 @@ nfs3_lookup_parentdir_resume (void *carg) nfs3_call_state_t *cs = NULL; inode_t *parent = NULL; - GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err); + if (!carg) { + gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument," + " carg value NULL"); + return EINVAL; + } cs = (nfs3_call_state_t *)carg; nfs3_check_fh_resolve_status (cs, stat, nfs3err); @@ -1382,7 +1389,11 @@ nfs3_lookup_resume (void *carg) nfs3_call_state_t *cs = NULL; struct nfs3_fh newfh = {{0},}; - GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err); + if (!carg) { + gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument," + " carg value NULL"); + return EINVAL; + } cs = (nfs3_call_state_t *)carg; nfs3_check_fh_resolve_status (cs, stat, nfs3err); @@ -1519,7 +1530,7 @@ nfs3svc_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - status = nfs3_errno_to_nfsstat3 (op_errno); + status = nfs3_cbk_errno_status (op_ret, op_errno); } nfs3_log_common_res (rpcsvc_request_xid (cs->req), NFS3_ACCESS, status, op_errno); @@ -1537,7 +1548,11 @@ nfs3_access_resume (void *carg) nfs_user_t nfu = {0, }; nfs3_call_state_t *cs = NULL; - GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err); + if (!carg) { + gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument," + " carg value NULL"); + return EINVAL; + } cs = (nfs3_call_state_t *)carg; nfs3_check_fh_resolve_status (cs, stat, nfs3err); @@ -1655,7 +1670,7 @@ nfs3svc_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -1822,7 +1837,7 @@ nfs3svc_read_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto err; } else stat = NFS3_OK; @@ -2008,7 +2023,7 @@ nfs3svc_write_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); } else stat = NFS3_OK; @@ -2068,7 +2083,7 @@ nfs3svc_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto err; } @@ -2320,7 +2335,7 @@ nfs3svc_create_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -2353,7 +2368,7 @@ nfs3svc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -2458,7 +2473,7 @@ nfs3svc_create_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); ret = -op_errno; - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -2629,13 +2644,7 @@ nfs3svc_create (rpcsvc_request_t *req) } cval = (uint64_t *)args.how.createhow3_u.verf; - if (cval) - cverf = *cval; - else { - gf_log(GF_NFS3, GF_LOG_ERROR, - "Error getting createverf3 from args"); - goto rpcerr; - } + cverf = *cval; ret = nfs3_create (req, &dirfh, name, args.how.mode, &args.how.createhow3_u.obj_attributes, cverf); @@ -2679,7 +2688,7 @@ nfs3svc_mkdir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -2711,7 +2720,7 @@ nfs3svc_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -2889,7 +2898,7 @@ nfs3svc_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -3051,7 +3060,7 @@ nfs3svc_mknod_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -3083,7 +3092,7 @@ nfs3svc_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -3341,7 +3350,7 @@ nfs3svc_remove_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); } if (op_ret == 0) @@ -3507,7 +3516,7 @@ nfs3svc_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); } else { stat = NFS3_OK; } @@ -3662,7 +3671,7 @@ nfs3svc_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "%x: rename %s -> %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->oploc.path, cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -3867,7 +3876,7 @@ nfs3svc_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "%x: link %s <- %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->oploc.path, cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); } else stat = NFS3_OK; @@ -4077,7 +4086,7 @@ nfs3svc_readdir_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto nfs3err; } @@ -4132,7 +4141,7 @@ nfs3svc_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto err; } @@ -4380,16 +4389,8 @@ nfs3svc_readdir (rpcsvc_request_t *req) rpcsvc_request_seterr (req, GARBAGE_ARGS); goto rpcerr; } - cval = (uint64_t *) ra.cookieverf; - - if (cval) - verf = *cval; - else { - gf_log(GF_NFS3, GF_LOG_ERROR, - "Error getting cookieverf from readdir args"); - goto rpcerr; - } + verf = *cval; ret = nfs3_readdir (req, &fh, ra.cookie, verf, ra.count, 0); if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) { @@ -4420,16 +4421,8 @@ nfs3svc_readdirp (rpcsvc_request_t *req) rpcsvc_request_seterr (req, GARBAGE_ARGS); goto rpcerr; } - cval = (uint64_t *) ra.cookieverf; - - if (cval) - cverf = *cval; - else { - gf_log (GF_NFS3, GF_LOG_ERROR, - "Error getting cookieverf from readdirp args"); - goto rpcerr; - } + cverf = *cval; ret = nfs3_readdir (req, &fh, ra.cookie, cverf, ra.dircount, ra.maxcount); @@ -4472,7 +4465,7 @@ nfs3_fsstat_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); } else stat = NFS3_OK; @@ -4500,7 +4493,7 @@ nfs3_fsstat_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); ret = -op_errno; - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); goto err; } @@ -4659,7 +4652,7 @@ nfs3svc_fsinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - status = nfs3_errno_to_nfsstat3 (op_errno); + status = nfs3_cbk_errno_status (op_ret, op_errno); }else status = NFS3_OK; @@ -4801,7 +4794,7 @@ nfs3svc_pathconf_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); } else { /* If stat fop failed, we can still send the other components * in a pathconf reply. @@ -4945,7 +4938,7 @@ nfs3svc_commit_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (GF_NFS, GF_LOG_WARNING, "%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req), cs->resolvedloc.path, strerror (op_errno)); - stat = nfs3_errno_to_nfsstat3 (op_errno); + stat = nfs3_cbk_errno_status (op_ret, op_errno); } else stat = NFS3_OK; @@ -5106,28 +5099,28 @@ rpcerr: rpcsvc_actor_t nfs3svc_actors[NFS3_PROC_COUNT] = { - {"NULL", NFS3_NULL, nfs3svc_null, NULL, 0}, - {"GETATTR", NFS3_GETATTR, nfs3svc_getattr,NULL, 0}, - {"SETATTR", NFS3_SETATTR, nfs3svc_setattr,NULL, 0}, - {"LOOKUP", NFS3_LOOKUP, nfs3svc_lookup, NULL, 0}, - {"ACCESS", NFS3_ACCESS, nfs3svc_access, NULL, 0}, - {"READLINK", NFS3_READLINK, nfs3svc_readlink,NULL, 0}, - {"READ", NFS3_READ, nfs3svc_read, NULL, 0}, - {"WRITE", NFS3_WRITE, nfs3svc_write, nfs3svc_write_vecsizer, 0}, - {"CREATE", NFS3_CREATE, nfs3svc_create, NULL, 0}, - {"MKDIR", NFS3_MKDIR, nfs3svc_mkdir, NULL, 0}, - {"SYMLINK", NFS3_SYMLINK, nfs3svc_symlink,NULL, 0}, - {"MKNOD", NFS3_MKNOD, nfs3svc_mknod, NULL, 0}, - {"REMOVE", NFS3_REMOVE, nfs3svc_remove, NULL, 0}, - {"RMDIR", NFS3_RMDIR, nfs3svc_rmdir, NULL, 0}, - {"RENAME", NFS3_RENAME, nfs3svc_rename, NULL, 0}, - {"LINK", NFS3_LINK, nfs3svc_link, NULL, 0}, - {"READDIR", NFS3_READDIR, nfs3svc_readdir,NULL, 0}, - {"READDIRPLUS", NFS3_READDIRP, nfs3svc_readdirp,NULL, 0}, - {"FSSTAT", NFS3_FSSTAT, nfs3svc_fsstat, NULL, 0}, - {"FSINFO", NFS3_FSINFO, nfs3svc_fsinfo, NULL, 0}, - {"PATHCONF", NFS3_PATHCONF, nfs3svc_pathconf,NULL, 0}, - {"COMMIT", NFS3_COMMIT, nfs3svc_commit, NULL, 0} + {"NULL", NFS3_NULL, nfs3svc_null, NULL, 0, DRC_IDEMPOTENT}, + {"GETATTR", NFS3_GETATTR, nfs3svc_getattr, NULL, 0, DRC_IDEMPOTENT}, + {"SETATTR", NFS3_SETATTR, nfs3svc_setattr, NULL, 0, DRC_NON_IDEMPOTENT}, + {"LOOKUP", NFS3_LOOKUP, nfs3svc_lookup, NULL, 0, DRC_IDEMPOTENT}, + {"ACCESS", NFS3_ACCESS, nfs3svc_access, NULL, 0, DRC_IDEMPOTENT}, + {"READLINK", NFS3_READLINK, nfs3svc_readlink, NULL, 0, DRC_IDEMPOTENT}, + {"READ", NFS3_READ, nfs3svc_read, NULL, 0, DRC_IDEMPOTENT}, + {"WRITE", NFS3_WRITE, nfs3svc_write, nfs3svc_write_vecsizer, 0, DRC_NON_IDEMPOTENT}, + {"CREATE", NFS3_CREATE, nfs3svc_create, NULL, 0, DRC_NON_IDEMPOTENT}, + {"MKDIR", NFS3_MKDIR, nfs3svc_mkdir, NULL, 0, DRC_NON_IDEMPOTENT}, + {"SYMLINK", NFS3_SYMLINK, nfs3svc_symlink, NULL, 0, DRC_NON_IDEMPOTENT}, + {"MKNOD", NFS3_MKNOD, nfs3svc_mknod, NULL, 0, DRC_NON_IDEMPOTENT}, + {"REMOVE", NFS3_REMOVE, nfs3svc_remove, NULL, 0, DRC_NON_IDEMPOTENT}, + {"RMDIR", NFS3_RMDIR, nfs3svc_rmdir, NULL, 0, DRC_NON_IDEMPOTENT}, + {"RENAME", NFS3_RENAME, nfs3svc_rename, NULL, 0, DRC_NON_IDEMPOTENT}, + {"LINK", NFS3_LINK, nfs3svc_link, NULL, 0, DRC_NON_IDEMPOTENT}, + {"READDIR", NFS3_READDIR, nfs3svc_readdir, NULL, 0, DRC_IDEMPOTENT}, + {"READDIRPLUS", NFS3_READDIRP, nfs3svc_readdirp, NULL, 0, DRC_IDEMPOTENT}, + {"FSSTAT", NFS3_FSSTAT, nfs3svc_fsstat, NULL, 0, DRC_IDEMPOTENT}, + {"FSINFO", NFS3_FSINFO, nfs3svc_fsinfo, NULL, 0, DRC_IDEMPOTENT}, + {"PATHCONF", NFS3_PATHCONF, nfs3svc_pathconf, NULL, 0, DRC_IDEMPOTENT}, + {"COMMIT", NFS3_COMMIT, nfs3svc_commit, NULL, 0, DRC_IDEMPOTENT} }; @@ -5144,21 +5137,48 @@ rpcsvc_program_t nfs3prog = { .min_auth = AUTH_NULL, }; +/* + * This function rounds up the input value to multiple of 4096. Min and Max + * supported I/O size limits are 4KB (GF_NFS3_FILE_IO_SIZE_MIN) and + * 1MB (GF_NFS3_FILE_IO_SIZE_MAX). + */ +void +nfs3_iosize_roundup_4KB (uint64_t *ioszptr) +{ + uint64_t iosize; + uint64_t iopages; + + if (!ioszptr) + return; + + iosize = *ioszptr; + iopages = (iosize + GF_NFS3_IO_SIZE -1) >> GF_NFS3_IO_SHIFT; + iosize = (iopages * GF_NFS3_IO_SIZE); + + /* Double check - boundary conditions */ + if (iosize < GF_NFS3_FILE_IO_SIZE_MIN) { + iosize = GF_NFS3_FILE_IO_SIZE_MIN; + } else if (iosize > GF_NFS3_FILE_IO_SIZE_MAX) { + iosize = GF_NFS3_FILE_IO_SIZE_MAX; + } + + *ioszptr = iosize; +} int -nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx) +nfs3_init_options (struct nfs3_state *nfs3, dict_t *options) { int ret = -1; char *optstr = NULL; uint64_t size64 = 0; - if ((!nfs3) || (!nfsx)) + if ((!nfs3) || (!options)) return -1; /* nfs3.read-size */ nfs3->readsize = GF_NFS3_RTPREF; - if (dict_get (nfsx->options, "nfs3.read-size")) { - ret = dict_get_str (nfsx->options, "nfs3.read-size", &optstr); + if (dict_get (options, "nfs3.read-size")) { + ret = dict_get_str (options, "nfs3.read-size", &optstr); if (ret < 0) { gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read " " option: nfs3.read-size"); @@ -5167,19 +5187,21 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx) } ret = gf_string2bytesize (optstr, &size64); - nfs3->readsize = size64; if (ret == -1) { gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format" " option: nfs3.read-size"); ret = -1; goto err; } + + nfs3_iosize_roundup_4KB (&size64); + nfs3->readsize = size64; } /* nfs3.write-size */ nfs3->writesize = GF_NFS3_WTPREF; - if (dict_get (nfsx->options, "nfs3.write-size")) { - ret = dict_get_str (nfsx->options, "nfs3.write-size", &optstr); + if (dict_get (options, "nfs3.write-size")) { + ret = dict_get_str (options, "nfs3.write-size", &optstr); if (ret < 0) { gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read " " option: nfs3.write-size"); @@ -5188,19 +5210,21 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx) } ret = gf_string2bytesize (optstr, &size64); - nfs3->writesize = size64; if (ret == -1) { gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format" " option: nfs3.write-size"); ret = -1; goto err; } + + nfs3_iosize_roundup_4KB (&size64); + nfs3->writesize = size64; } /* nfs3.readdir.size */ nfs3->readdirsize = GF_NFS3_DTPREF; - if (dict_get (nfsx->options, "nfs3.readdir-size")) { - ret = dict_get_str (nfsx->options,"nfs3.readdir-size", &optstr); + if (dict_get (options, "nfs3.readdir-size")) { + ret = dict_get_str (options,"nfs3.readdir-size", &optstr); if (ret < 0) { gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read" " option: nfs3.readdir-size"); @@ -5209,15 +5233,16 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx) } ret = gf_string2bytesize (optstr, &size64); - nfs3->readdirsize = size64; if (ret == -1) { gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format" " option: nfs3.readdir-size"); ret = -1; goto err; } - } + nfs3_iosize_roundup_4KB (&size64); + nfs3->readdirsize = size64; + } /* We want to use the size of the biggest param for the io buffer size. */ @@ -5238,9 +5263,10 @@ err: return ret; } - int -nfs3_init_subvolume_options (struct nfs3_state *nfs3, struct nfs3_export *exp) +nfs3_init_subvolume_options (xlator_t *nfsx, + struct nfs3_export *exp, + dict_t *options) { int ret = -1; char *optstr = NULL; @@ -5248,14 +5274,20 @@ nfs3_init_subvolume_options (struct nfs3_state *nfs3, struct nfs3_export *exp) char *name = NULL; gf_boolean_t boolt = _gf_false; uuid_t volumeid = {0, }; - dict_t *options = NULL; - if ((!exp) || (!nfs3)) + if ((!nfsx) || (!exp)) return -1; - options = nfs3->nfsx->options; + /* For init, fetch options from xlator but for + * reconfigure, take the parameter */ + if (!options) + options = nfsx->options; + + if (!options) + return (-1); + uuid_clear (volumeid); - if (gf_nfs_dvm_off (nfs_state (nfs3->nfsx))) + if (gf_nfs_dvm_off (nfs_state (nfsx))) goto no_dvm; ret = snprintf (searchkey, 1024, "nfs3.%s.volume-id",exp->subvol->name); @@ -5421,7 +5453,7 @@ nfs3_init_subvolume (struct nfs3_state *nfs3, xlator_t *subvol) INIT_LIST_HEAD (&exp->explist); gf_log (GF_NFS3, GF_LOG_TRACE, "Initing state: %s", exp->subvol->name); - ret = nfs3_init_subvolume_options (nfs3, exp); + ret = nfs3_init_subvolume_options (nfs3->nfsx, exp, NULL); if (ret == -1) { gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to init subvol"); goto exp_free; @@ -5486,7 +5518,7 @@ nfs3_init_state (xlator_t *nfsx) } nfs = nfsx->private; - ret = nfs3_init_options (nfs3, nfsx); + ret = nfs3_init_options (nfs3, nfsx->options); if (ret == -1) { gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to init options"); goto ret; @@ -5518,7 +5550,7 @@ nfs3_init_state (xlator_t *nfsx) LOCK_INIT (&nfs3->fdlrulock); nfs3->fdcount = 0; - rpcsvc_create_listeners (nfs->rpcsvc, nfsx->options, nfsx->name); + ret = rpcsvc_create_listeners (nfs->rpcsvc, nfsx->options, nfsx->name); if (ret == -1) { gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners"); goto free_localpool; @@ -5560,4 +5592,39 @@ nfs3svc_init (xlator_t *nfsx) return &nfs3prog; } +int +nfs3_reconfigure_state (xlator_t *nfsx, dict_t *options) +{ + int ret = -1; + struct nfs3_export *exp = NULL; + struct nfs_state *nfs = NULL; + struct nfs3_state *nfs3 = NULL; + + if ((!nfsx) || (!nfsx->private) || (!options)) + goto out; + + nfs = (struct nfs_state *)nfsx->private; + nfs3 = nfs->nfs3state; + if (!nfs3) + goto out; + ret = nfs3_init_options (nfs3, options); + if (ret) { + gf_log (GF_NFS3, GF_LOG_ERROR, + "Failed to reconfigure options"); + goto out; + } + + list_for_each_entry (exp, &nfs3->exports, explist) { + ret = nfs3_init_subvolume_options (nfsx, exp, options); + if (ret) { + gf_log (GF_NFS3, GF_LOG_ERROR, + "Failed to reconfigure subvol options"); + goto out; + } + } + + ret = 0; +out: + return ret; +} diff --git a/xlators/nfs/server/src/nfs3.h b/xlators/nfs/server/src/nfs3.h index 111542bc6..0c35445a4 100644 --- a/xlators/nfs/server/src/nfs3.h +++ b/xlators/nfs/server/src/nfs3.h @@ -2,19 +2,10 @@ Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _NFS3_H_ @@ -48,16 +39,39 @@ /* Static values used for FSINFO -FIXME: This should be configurable */ -#define GF_NFS3_RTMAX (64 * GF_UNIT_KB) -#define GF_NFS3_RTPREF (64 * GF_UNIT_KB) -#define GF_NFS3_RTMULT (4 * GF_UNIT_KB) -#define GF_NFS3_WTMAX (64 * GF_UNIT_KB) -#define GF_NFS3_WTPREF (64 * GF_UNIT_KB) -#define GF_NFS3_WTMULT (4 * GF_UNIT_KB) -#define GF_NFS3_DTMIN (4 * GF_UNIT_KB) -#define GF_NFS3_DTPREF (64 * GF_UNIT_KB) -#define GF_NFS3_MAXFILE (1 * GF_UNIT_PB) + * To change the maximum rsize and wsize supported by the NFS client, adjust + * GF_NFS3_FILE_IO_SIZE_MAX. The Gluster NFS server defaults to 1MB(1048576) + * (same as kernel NFS server). For slower network, rsize/wsize can be trimmed + * to 16/32/64-KB. rsize and wsize can be tuned through nfs.read-size and + * nfs.write-size respectively. + * + * NB: For Kernel-NFS, NFS_MAX_FILE_IO_SIZE is 1048576U (1MB). + */ +#define GF_NFS3_FILE_IO_SIZE_MAX (1 * GF_UNIT_MB) /* 1048576 */ +#define GF_NFS3_FILE_IO_SIZE_MIN (4 * GF_UNIT_KB) /* 4096 */ + +#define GF_NFS3_FILE_IO_SIZE_DEF GF_NFS3_FILE_IO_SIZE_MAX + +#define GF_NFS3_RTMAX GF_NFS3_FILE_IO_SIZE_MAX +#define GF_NFS3_RTMIN GF_NFS3_FILE_IO_SIZE_MIN +#define GF_NFS3_RTPREF GF_NFS3_FILE_IO_SIZE_DEF +#define GF_NFS3_RTMULT GF_NFS3_FILE_IO_SIZE_MIN + +#define GF_NFS3_WTMAX GF_NFS3_FILE_IO_SIZE_MAX +#define GF_NFS3_WTMIN GF_NFS3_FILE_IO_SIZE_MIN +#define GF_NFS3_WTPREF GF_NFS3_FILE_IO_SIZE_DEF +#define GF_NFS3_WTMULT GF_NFS3_FILE_IO_SIZE_MIN + +/* This can be tuned through nfs.readdir-size */ +#define GF_NFS3_DTMAX GF_NFS3_FILE_IO_SIZE_MAX +#define GF_NFS3_DTMIN GF_NFS3_FILE_IO_SIZE_MIN +#define GF_NFS3_DTPREF GF_NFS3_FILE_IO_SIZE_DEF + +#define GF_NFS3_MAXFILESIZE (1 * GF_UNIT_PB) + +#define GF_NFS3_IO_SIZE 4096 /* 4-KB */ +#define GF_NFS3_IO_SHIFT 12 /* 2^12 = 4KB */ + /* FIXME: Handle time resolutions */ #define GF_NFS3_TIMEDELTA_SECS {1,0} #define GF_NFS3_TIMEDELTA_NSECS {0,1} @@ -120,20 +134,21 @@ typedef struct nfs3_state { uint64_t serverstart; /* NFSv3 Protocol configurables */ - size_t readsize; - size_t writesize; - size_t readdirsize; + uint64_t readsize; + uint64_t writesize; + uint64_t readdirsize; /* Size of the iobufs used, depends on the sizes of the three params * above. */ - size_t iobsize; + uint64_t iobsize; unsigned int memfactor; struct list_head fdlru; gf_lock_t fdlrulock; int fdcount; + uint32_t occ_logger; } nfs3_state_t; typedef enum nfs3_lookup_type { @@ -262,9 +277,9 @@ struct inode_op_queue { pthread_mutex_t qlock; }; - - - extern rpcsvc_program_t * nfs3svc_init (xlator_t *nfsx); + +extern int +nfs3_reconfigure_state (xlator_t *nfsx, dict_t *options); #endif diff --git a/xlators/nfs/server/src/nlm4.c b/xlators/nfs/server/src/nlm4.c index 595738b2c..5c5d87412 100644 --- a/xlators/nfs/server/src/nlm4.c +++ b/xlators/nfs/server/src/nlm4.c @@ -2,19 +2,10 @@ Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -149,8 +140,10 @@ nfs3_fh_to_xlator (struct nfs3_state *nfs3, struct nfs3_fh *fh); xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \ &cst->resolvefh); \ uuid_unparse (cst->resolvefh.gfid, gfid); \ - sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\ - xlatorp ? xlatorp->name : "ERR", gfid); \ + snprintf (buf, sizeof (buf), "(%s) %s : %s", \ + trans->peerinfo.identifier, \ + xlatorp ? xlatorp->name : "ERR", \ + gfid); \ gf_log (GF_NLM, GF_LOG_ERROR, "Unable to resolve FH"\ ": %s", buf); \ nfstat = nlm4_errno_to_nlm4stat (cst->resolve_errno);\ @@ -235,7 +228,7 @@ nlm_is_oh_same_lkowner (gf_lkowner_t *a, netobj *b) !memcmp (a->data, b->n_bytes, a->len)); } -nfsstat3 +nlm4_stats nlm4_errno_to_nlm4stat (int errnum) { nlm4_stats stat = nlm4_denied; @@ -434,10 +427,11 @@ ret: int nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc) { - struct iovec outmsg = {0, }; - struct iobuf *iob = NULL; - struct nfs3_state *nfs3 = NULL; - int ret = -1; + struct iovec outmsg = {0, }; + struct iobuf *iob = NULL; + struct nfs3_state *nfs3 = NULL; + int ret = -1; + ssize_t msglen = 0; struct iobref *iobref = NULL; if (!req) @@ -462,7 +456,12 @@ nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc) /* Use the given serializer to translate the give C structure in arg * to XDR format which will be written into the buffer in outmsg. */ - outmsg.iov_len = sfunc (outmsg, arg); + msglen = sfunc (outmsg, arg); + if (msglen < 0) { + gf_log (GF_NLM, GF_LOG_ERROR, "Failed to encode message"); + goto ret; + } + outmsg.iov_len = msglen; iobref = iobref_new (); if (iobref == NULL) { @@ -470,7 +469,11 @@ nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc) goto ret; } - iobref_add (iobref, iob); + ret = iobref_add (iobref, iob); + if (ret) { + gf_log (GF_NLM, GF_LOG_ERROR, "Failed to add iob to iobref"); + goto ret; + } /* Then, submit the message for transmission. */ ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref); @@ -956,7 +959,7 @@ nlm4_establish_callback (void *csarg) case AF_INET: inet_ntop (AF_INET, &sock_union.sin.sin_addr, peerip, INET6_ADDRSTRLEN+1); - inet_ntop (AF_INET, &(((struct sockaddr_in *)&cs->req->trans->myinfo.sockaddr)->sin_addr), + inet_ntop (AF_INET, &(((struct sockaddr_in *)&cs->trans->myinfo.sockaddr)->sin_addr), myip, INET6_ADDRSTRLEN + 1); break; @@ -1094,7 +1097,11 @@ nlm4svc_send_granted (nfs3_call_state_t *cs) goto ret; } - iobref_add (iobref, iobuf); + ret = iobref_add (iobref, iobuf); + if (ret) { + gf_log (GF_NLM, GF_LOG_ERROR, "Failed to add iob to iobref"); + goto ret; + } ret = rpc_clnt_submit (rpc_clnt, &nlm4clntprog, NLM4_GRANTED, nlm4svc_send_granted_cbk, &outmsg, 1, @@ -1814,7 +1821,7 @@ nlm4_add_share_to_inode (nlm_share_t *share) inode = share->inode; ret = inode_ctx_get (inode, this, &ctx); - if (ret || !head) { + if (ret == -1) { ictx = GF_CALLOC (1, sizeof (struct nfs_inode_ctx), gf_nfs_mt_inode_ctx); if (!ictx ) { @@ -2286,34 +2293,34 @@ nlm4svc_sm_notify (struct nlm_sm_status *status) rpcsvc_actor_t nlm4svc_actors[NLM4_PROC_COUNT] = { /* 0 */ - {"NULL", NLM4_NULL, nlm4svc_null, NULL}, - {"TEST", NLM4_TEST, nlm4svc_test, NULL}, - {"LOCK", NLM4_LOCK, nlm4svc_lock, NULL}, - {"CANCEL", NLM4_CANCEL, nlm4svc_cancel, NULL}, - {"UNLOCK", NLM4_UNLOCK, nlm4svc_unlock, NULL}, + {"NULL", NLM4_NULL, nlm4svc_null, NULL, 0, DRC_IDEMPOTENT}, + {"TEST", NLM4_TEST, nlm4svc_test, NULL, 0, DRC_IDEMPOTENT}, + {"LOCK", NLM4_LOCK, nlm4svc_lock, NULL, 0, DRC_NON_IDEMPOTENT}, + {"CANCEL", NLM4_CANCEL, nlm4svc_cancel, NULL, 0, DRC_NON_IDEMPOTENT}, + {"UNLOCK", NLM4_UNLOCK, nlm4svc_unlock, NULL, 0, DRC_NON_IDEMPOTENT}, /* 5 */ - {"GRANTED", NLM4_GRANTED, NULL, NULL}, - {"TEST", NLM4_TEST_MSG, NULL, NULL}, - {"LOCK", NLM4_LOCK_MSG, NULL, NULL}, - {"CANCEL", NLM4_CANCEL_MSG, NULL, NULL}, - {"UNLOCK", NLM4_UNLOCK_MSG, NULL, NULL}, + {"GRANTED", NLM4_GRANTED, NULL, NULL, 0, DRC_NA}, + {"TEST", NLM4_TEST_MSG, NULL, NULL, 0, DRC_NA}, + {"LOCK", NLM4_LOCK_MSG, NULL, NULL, 0, DRC_NA}, + {"CANCEL", NLM4_CANCEL_MSG, NULL, NULL, 0, DRC_NA}, + {"UNLOCK", NLM4_UNLOCK_MSG, NULL, NULL, 0, DRC_NA}, /* 10 */ - {"GRANTED", NLM4_GRANTED_MSG, NULL, NULL}, - {"TEST", NLM4_TEST_RES, NULL, NULL}, - {"LOCK", NLM4_LOCK_RES, NULL, NULL}, - {"CANCEL", NLM4_CANCEL_RES, NULL, NULL}, - {"UNLOCK", NLM4_UNLOCK_RES, NULL, NULL}, + {"GRANTED", NLM4_GRANTED_MSG, NULL, NULL, 0, DRC_NA}, + {"TEST", NLM4_TEST_RES, NULL, NULL, 0, DRC_NA}, + {"LOCK", NLM4_LOCK_RES, NULL, NULL, 0, DRC_NA}, + {"CANCEL", NLM4_CANCEL_RES, NULL, NULL, 0, DRC_NA}, + {"UNLOCK", NLM4_UNLOCK_RES, NULL, NULL, 0, DRC_NA}, /* 15 ; procedures 17,18,19 are not defined by nlm */ - {"GRANTED", NLM4_GRANTED_RES, NULL, NULL}, - {"SM_NOTIFY", NLM4_SM_NOTIFY, NULL, NULL}, - {"SEVENTEEN", NLM4_SEVENTEEN, NULL, NULL}, - {"EIGHTEEN", NLM4_EIGHTEEN, NULL, NULL}, - {"NINETEEN", NLM4_NINETEEN, NULL, NULL}, + {"GRANTED", NLM4_GRANTED_RES, NULL, NULL, 0, DRC_NA}, + {"SM_NOTIFY", NLM4_SM_NOTIFY, NULL, NULL, 0, DRC_NA}, + {"SEVENTEEN", NLM4_SEVENTEEN, NULL, NULL, 0, DRC_NA}, + {"EIGHTEEN", NLM4_EIGHTEEN, NULL, NULL, 0, DRC_NA}, + {"NINETEEN", NLM4_NINETEEN, NULL, NULL, 0, DRC_NA}, /* 20 */ - {"SHARE", NLM4_SHARE, nlm4svc_share, NULL}, - {"UNSHARE", NLM4_UNSHARE, nlm4svc_unshare, NULL}, - {"NM_LOCK", NLM4_NM_LOCK, nlm4svc_nm_lock, NULL}, - {"FREE_ALL", NLM4_FREE_ALL, nlm4svc_free_all, NULL}, + {"SHARE", NLM4_SHARE, nlm4svc_share, NULL, 0, DRC_NON_IDEMPOTENT}, + {"UNSHARE", NLM4_UNSHARE, nlm4svc_unshare, NULL, 0, DRC_NON_IDEMPOTENT}, + {"NM_LOCK", NLM4_NM_LOCK, nlm4svc_nm_lock, NULL, 0, DRC_NON_IDEMPOTENT}, + {"FREE_ALL", NLM4_FREE_ALL, nlm4svc_free_all, NULL, 0, DRC_IDEMPOTENT}, }; rpcsvc_program_t nlm4prog = { @@ -2349,9 +2356,14 @@ nlm4svc_init(xlator_t *nfsx) int ret = -1; char *portstr = NULL; pthread_t thr; - struct timeval timeout = {0,}; + struct timespec timeout = {0,}; FILE *pidfile = NULL; pid_t pid = -1; + static gf_boolean_t nlm4_inited = _gf_false; + + /* Already inited */ + if (nlm4_inited) + return &nlm4prog; nfs = (struct nfs_state*)nfsx->private; @@ -2397,7 +2409,7 @@ nlm4svc_init(xlator_t *nfsx) goto err; } - rpcsvc_create_listeners (nfs->rpcsvc, options, "NLM"); + ret = rpcsvc_create_listeners (nfs->rpcsvc, options, "NLM"); if (ret == -1) { gf_log (GF_NLM, GF_LOG_ERROR, "Unable to create listeners"); dict_unref (options); @@ -2453,7 +2465,10 @@ nlm4svc_init(xlator_t *nfsx) pthread_create (&thr, NULL, nsm_thread, (void*)NULL); timeout.tv_sec = nlm_grace_period; + timeout.tv_nsec = 0; + gf_timer_call_after (nfsx->ctx, timeout, nlm_grace_period_over, NULL); + nlm4_inited = _gf_true; return &nlm4prog; err: return NULL; diff --git a/xlators/nfs/server/src/nlm4.h b/xlators/nfs/server/src/nlm4.h index 4659915aa..9b5d54081 100644 --- a/xlators/nfs/server/src/nlm4.h +++ b/xlators/nfs/server/src/nlm4.h @@ -2,19 +2,10 @@ Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _NLM4_H_ diff --git a/xlators/nfs/server/src/nlmcbk_svc.c b/xlators/nfs/server/src/nlmcbk_svc.c index 5401dc39b..e1b588765 100644 --- a/xlators/nfs/server/src/nlmcbk_svc.c +++ b/xlators/nfs/server/src/nlmcbk_svc.c @@ -2,19 +2,10 @@ Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ /* @@ -92,10 +83,14 @@ void * nsm_thread (void *argv) { register SVCXPRT *transp; + int ret = 0; - pmap_unset (NLMCBK_PROGRAM, NLMCBK_V1); - - transp = svcudp_create(RPC_ANYSOCK); + ret = pmap_unset (NLMCBK_PROGRAM, NLMCBK_V1); + if (ret == 0) { + gf_log (GF_NLM, GF_LOG_ERROR, "pmap_unset failed"); + return NULL; + } + transp = svcudp_create(RPC_ANYSOCK); if (transp == NULL) { gf_log (GF_NLM, GF_LOG_ERROR, "cannot create udp service."); return NULL; diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am index f99e11829..a494190ba 100644 --- a/xlators/performance/Makefile.am +++ b/xlators/performance/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read md-cache open-behind +SUBDIRS = write-behind read-ahead readdir-ahead io-threads io-cache symlink-cache quick-read md-cache open-behind CLEANFILES = diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c index 30dc14a9c..201777b38 100644 --- a/xlators/performance/io-cache/src/io-cache.c +++ b/xlators/performance/io-cache/src/io-cache.c @@ -316,9 +316,11 @@ ioc_forget (xlator_t *this, inode_t *inode) static int32_t ioc_invalidate(xlator_t *this, inode_t *inode) { + uint64_t ioc_addr = 0; ioc_inode_t *ioc_inode = NULL; - inode_ctx_get(inode, this, (uint64_t *) &ioc_inode); + inode_ctx_get(inode, this, (uint64_t *) &ioc_addr); + ioc_inode = (void *) ioc_addr; if (ioc_inode) ioc_inode_flush(ioc_inode); @@ -1422,6 +1424,58 @@ ioc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, return 0; } +static int32_t +ioc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata); + return 0; +} + +static int32_t +ioc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + uint64_t ioc_inode = 0; + + inode_ctx_get (fd->inode, this, &ioc_inode); + + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata); + return 0; +} + +static int32_t +ioc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + STACK_UNWIND_STRICT(zerofill, frame, op_ret, + op_errno, pre, post, xdata); + return 0; +} + +static int32_t +ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + uint64_t ioc_inode = 0; + + inode_ctx_get (fd->inode, this, &ioc_inode); + + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND(frame, ioc_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata); + return 0; +} + + int32_t ioc_get_priority_list (const char *opt_str, struct list_head *first) { @@ -1885,11 +1939,11 @@ int ioc_inode_dump (xlator_t *this, inode_t *inode) { - char *path = NULL; + char *path = NULL; int ret = -1; char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, }; uint64_t tmp_ioc_inode = 0; - ioc_inode_t *ioc_inode = NULL; + ioc_inode_t *ioc_inode = NULL; gf_boolean_t section_added = _gf_false; char uuid_str[64] = {0,}; @@ -1903,9 +1957,6 @@ ioc_inode_dump (xlator_t *this, inode_t *inode) if (ioc_inode == NULL) goto out; - gf_proc_dump_add_section (key_prefix); - section_added = _gf_true; - /* Similar to ioc_page_dump function its better to use * pthread_mutex_trylock and not to use gf_log in statedump * to avoid deadlocks. @@ -1913,24 +1964,30 @@ ioc_inode_dump (xlator_t *this, inode_t *inode) ret = pthread_mutex_trylock (&ioc_inode->inode_lock); if (ret) goto out; - else + { - gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight); + if (uuid_is_null (ioc_inode->inode->gfid)) + goto unlock; + + gf_proc_dump_add_section (key_prefix); + section_added = _gf_true; - //inode_path takes blocking lock on the itable. __inode_path (ioc_inode->inode, NULL, &path); + gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight); + if (path) { gf_proc_dump_write ("path", "%s", path); GF_FREE (path); } + gf_proc_dump_write ("uuid", "%s", uuid_utoa_r (ioc_inode->inode->gfid, uuid_str)); __ioc_cache_dump (ioc_inode, key_prefix); __ioc_inode_waitq_dump (ioc_inode, key_prefix); - - pthread_mutex_unlock (&ioc_inode->inode_lock); } +unlock: + pthread_mutex_unlock (&ioc_inode->inode_lock); out: if (ret && ioc_inode) { @@ -2044,6 +2101,8 @@ struct xlator_fops fops = { .mknod = ioc_mknod, .readdirp = ioc_readdirp, + .discard = ioc_discard, + .zerofill = ioc_zerofill, }; diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c index 226c091f1..bbcf4ed26 100644 --- a/xlators/performance/io-threads/src/io-threads.c +++ b/xlators/performance/io-threads/src/io-threads.c @@ -307,6 +307,9 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) case GF_FOP_XATTROP: case GF_FOP_FXATTROP: case GF_FOP_RCHECKSUM: + case GF_FOP_FALLOCATE: + case GF_FOP_DISCARD: + case GF_FOP_ZEROFILL: pri = IOT_PRI_LO; break; @@ -321,9 +324,9 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub) break; } out: - ret = do_iot_schedule (this->private, stub, pri); gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop", gf_fop_list[stub->fop], iot_get_pri_meaning (pri)); + ret = do_iot_schedule (this->private, stub, pri); return ret; } @@ -2406,6 +2409,155 @@ out: return 0; } +int +iot_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno, preop, postop, + xdata); + return 0; +} + + +int +iot_fallocate_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + STACK_WIND (frame, iot_fallocate_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fallocate, fd, mode, offset, len, + xdata); + return 0; +} + + +int +iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + call_stub_t *stub = NULL; + int ret = -1; + + stub = fop_fallocate_stub(frame, iot_fallocate_wrapper, fd, mode, offset, + len, xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create fallocate stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (fallocate, frame, -1, -ret, NULL, NULL, + NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; +} + +int +iot_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, preop, postop, + xdata); + return 0; +} + + +int +iot_discard_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + STACK_WIND (frame, iot_discard_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata); + return 0; +} + + +int +iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + call_stub_t *stub = NULL; + int ret = -1; + + stub = fop_discard_stub(frame, iot_discard_wrapper, fd, offset, len, + xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create discard stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (discard, frame, -1, -ret, NULL, NULL, + NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; +} + +int +iot_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *preop, struct iatt *postop, dict_t *xdata) +{ + STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, preop, postop, + xdata); + return 0; +} + +int +iot_zerofill_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata) +{ + STACK_WIND (frame, iot_zerofill_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->zerofill, fd, offset, len, xdata); + return 0; +} + +int +iot_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + call_stub_t *stub = NULL; + int ret = -1; + + stub = fop_zerofill_stub(frame, iot_zerofill_wrapper, fd, + offset, len, xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot create zerofill stub" + "(out of memory)"); + ret = -ENOMEM; + goto out; + } + + ret = iot_schedule (frame, this, stub); + +out: + if (ret < 0) { + STACK_UNWIND_STRICT (zerofill, frame, -1, -ret, NULL, NULL, + NULL); + if (stub != NULL) { + call_stub_destroy (stub); + } + } + return 0; +} + int __iot_workers_scale (iot_conf_t *conf) @@ -2432,7 +2584,7 @@ __iot_workers_scale (iot_conf_t *conf) while (diff) { diff --; - ret = pthread_create (&thread, &conf->w_attr, iot_worker, conf); + ret = gf_thread_create (&thread, &conf->w_attr, iot_worker, conf); if (ret == 0) { conf->curr_count++; gf_log (conf->this->name, GF_LOG_DEBUG, @@ -2736,6 +2888,9 @@ struct xlator_fops fops = { .xattrop = iot_xattrop, .fxattrop = iot_fxattrop, .rchecksum = iot_rchecksum, + .fallocate = iot_fallocate, + .discard = iot_discard, + .zerofill = iot_zerofill, }; struct xlator_cbks cbks; diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c index 24924200d..84c363ad9 100644 --- a/xlators/performance/md-cache/src/md-cache.c +++ b/xlators/performance/md-cache/src/md-cache.c @@ -18,6 +18,7 @@ #include "dict.h" #include "xlator.h" #include "md-cache-mem-types.h" +#include "glusterfs-acl.h" #include <assert.h> #include <sys/time.h> @@ -42,12 +43,12 @@ static struct mdc_key { int check; } mdc_keys[] = { { - .name = "system.posix_acl_access", + .name = POSIX_ACL_ACCESS_XATTR, .load = 0, .check = 1, }, { - .name = "system.posix_acl_default", + .name = POSIX_ACL_DEFAULT_XATTR, .load = 0, .check = 1, }, @@ -132,6 +133,7 @@ struct mdc_local { loc_t loc2; fd_t *fd; char *linkname; + char *key; dict_t *xattr; }; @@ -174,7 +176,7 @@ __mdc_inode_ctx_set (xlator_t *this, inode_t *inode, struct md_cache *mdc) uint64_t mdc_int = 0; mdc_int = (long) mdc; - ret = __inode_ctx_set2 (inode, this, &mdc_int, 0); + ret = __inode_ctx_set (inode, this, &mdc_int); return ret; } @@ -229,6 +231,8 @@ mdc_local_wipe (xlator_t *this, mdc_local_t *local) GF_FREE (local->linkname); + GF_FREE (local->key); + if (local->xattr) dict_unref (local->xattr); @@ -585,6 +589,31 @@ out: int +mdc_inode_xatt_unset (xlator_t *this, inode_t *inode, char *name) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + mdc = mdc_inode_prep (this, inode); + if (!mdc) + goto out; + + if (!name) + goto out; + + LOCK (&mdc->lock); + { + dict_del (mdc->xattr, name); + } + UNLOCK (&mdc->lock); + + ret = 0; +out: + return ret; +} + + +int mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict) { int ret = -1; @@ -616,6 +645,46 @@ out: } +int +mdc_inode_iatt_invalidate (xlator_t *this, inode_t *inode) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get (this, inode, &mdc) != 0) + goto out; + + LOCK (&mdc->lock); + { + mdc->ia_time = 0; + } + UNLOCK (&mdc->lock); + +out: + return ret; +} + + +int +mdc_inode_xatt_invalidate (xlator_t *this, inode_t *inode) +{ + int ret = -1; + struct md_cache *mdc = NULL; + + if (mdc_inode_ctx_get (this, inode, &mdc) != 0) + goto out; + + LOCK (&mdc->lock); + { + mdc->xa_time = 0; + } + UNLOCK (&mdc->lock); + +out: + return ret; +} + + void mdc_load_reqs (xlator_t *this, dict_t *dict) { @@ -731,6 +800,13 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, if (!local) goto uncached; + if (!loc->name) + /* A nameless discovery is dangerous to cache. We + perform nameless lookup with the intention of + re-establishing an inode "properly" + */ + goto uncached; + loc_copy (&local->loc, loc); ret = mdc_inode_iatt_get (this, loc->inode, &stbuf); @@ -1579,6 +1655,8 @@ mdc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, mdc_inode_xatt_update (this, local->loc.inode, local->xattr); + mdc_inode_iatt_invalidate (this, local->loc.inode); + out: MDC_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); @@ -1620,6 +1698,7 @@ mdc_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, mdc_inode_xatt_update (this, local->fd->inode, local->xattr); + mdc_inode_iatt_invalidate (this, local->fd->inode); out: MDC_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata); @@ -1689,7 +1768,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, if (ret != 0) goto uncached; - if (!xattr || dict_get (xattr, (char *)key)) { + if (!xattr || !dict_get (xattr, (char *)key)) { ret = -1; op_errno = ENODATA; } @@ -1751,7 +1830,7 @@ mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key, if (ret != 0) goto uncached; - if (!xattr || dict_get (xattr, (char *)key)) { + if (!xattr || !dict_get (xattr, (char *)key)) { ret = -1; op_errno = ENODATA; } @@ -1767,6 +1846,97 @@ uncached: return 0; } +int +mdc_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->key) + mdc_inode_xatt_unset (this, local->loc.inode, local->key); + else + mdc_inode_xatt_invalidate (this, local->loc.inode); + + mdc_inode_iatt_invalidate (this, local->loc.inode); +out: + MDC_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + + +int +mdc_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + loc_copy (&local->loc, loc); + + local->key = gf_strdup (name); + + STACK_WIND (frame, mdc_removexattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr, + loc, name, xdata); + return 0; +} + + +int +mdc_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + if (local->key) + mdc_inode_xatt_unset (this, local->fd->inode, local->key); + else + mdc_inode_xatt_invalidate (this, local->fd->inode); + + mdc_inode_iatt_invalidate (this, local->fd->inode); +out: + MDC_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata); + + return 0; +} + + +int +mdc_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = mdc_local_get (frame); + + local->fd = fd_ref (fd); + + local->key = gf_strdup (name); + + STACK_WIND (frame, mdc_fremovexattr_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr, + fd, name, xdata); + return 0; +} + int mdc_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -1849,6 +2019,123 @@ mdc_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, return 0; } +int +mdc_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + mdc_local_t *local; + + local = mdc_local_get(frame); + local->fd = fd_ref(fd); + + STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len, + xdata); + + return 0; +} + +int +mdc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + mdc_local_t *local; + + local = mdc_local_get(frame); + local->fd = fd_ref(fd); + + STACK_WIND(frame, mdc_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, fd, offset, len, + xdata); + + return 0; +} + +int +mdc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + mdc_local_t *local = NULL; + + local = frame->local; + + if (op_ret != 0) + goto out; + + if (!local) + goto out; + + mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf); + +out: + MDC_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + + return 0; +} + +int mdc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + mdc_local_t *local; + + local = mdc_local_get(frame); + local->fd = fd_ref(fd); + + STACK_WIND(frame, mdc_zerofill_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->zerofill, fd, offset, len, + xdata); + + return 0; +} + int mdc_forget (xlator_t *this, inode_t *inode) @@ -1976,8 +2263,13 @@ struct xlator_fops fops = { .fsetxattr = mdc_fsetxattr, .getxattr = mdc_getxattr, .fgetxattr = mdc_fgetxattr, + .removexattr = mdc_removexattr, + .fremovexattr= mdc_fremovexattr, .readdirp = mdc_readdirp, - .readdir = mdc_readdir + .readdir = mdc_readdir, + .fallocate = mdc_fallocate, + .discard = mdc_discard, + .zerofill = mdc_zerofill, }; diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c index b405b94cd..7e5b57278 100644 --- a/xlators/performance/open-behind/src/open-behind.c +++ b/xlators/performance/open-behind/src/open-behind.c @@ -681,6 +681,63 @@ err: return 0; } +int +ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + call_stub_t *stub; + + stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode, + offset, len, xdata); + if (!stub) + goto err; + + open_and_resume(this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +int +ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + call_stub_t *stub; + + stub = fop_discard_stub(frame, default_discard_resume, fd, offset, len, + xdata); + if (!stub) + goto err; + + open_and_resume(this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + +int +ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + call_stub_t *stub; + + stub = fop_zerofill_stub(frame, default_zerofill_resume, fd, + offset, len, xdata); + if (!stub) + goto err; + + open_and_resume(this, fd, stub); + + return 0; +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; +} + int ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, @@ -697,6 +754,8 @@ ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, fd = fd_lookup (loc->inode, 0); open_and_resume (this, fd, stub); + if (fd) + fd_unref (fd); return 0; err: @@ -721,6 +780,8 @@ ob_rename (call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst, fd = fd_lookup (dst->inode, 0); open_and_resume (this, fd, stub); + if (fd) + fd_unref (fd); return 0; err: @@ -903,6 +964,9 @@ struct xlator_fops fops = { .fentrylk = ob_fentrylk, .fxattrop = ob_fxattrop, .fsetattr = ob_fsetattr, + .fallocate = ob_fallocate, + .discard = ob_discard, + .zerofill = ob_zerofill, .unlink = ob_unlink, .rename = ob_rename, .lk = ob_lk, diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c index 522fa52b8..069ab1f1a 100644 --- a/xlators/performance/read-ahead/src/read-ahead.c +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -942,6 +942,106 @@ unwind: return 0; } +int +ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT (frame); + + STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + +static int +ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_discard_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT (frame); + + STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + +static int +ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + uint64_t tmp_file = 0; + int32_t op_errno = EINVAL; + + GF_ASSERT (frame); + GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind); + GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind); + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + + flush_region(frame, file, offset, len, 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_zerofill_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->zerofill, fd, + offset, len, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} int ra_priv_dump (xlator_t *this) @@ -1123,6 +1223,8 @@ struct xlator_fops fops = { .truncate = ra_truncate, .ftruncate = ra_ftruncate, .fstat = ra_fstat, + .discard = ra_discard, + .zerofill = ra_zerofill, }; struct xlator_cbks cbks = { diff --git a/xlators/performance/readdir-ahead/Makefile.am b/xlators/performance/readdir-ahead/Makefile.am new file mode 100644 index 000000000..a985f42a8 --- /dev/null +++ b/xlators/performance/readdir-ahead/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am new file mode 100644 index 000000000..cdabd1428 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = readdir-ahead.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +readdir_ahead_la_LDFLAGS = -module -avoidversion + +readdir_ahead_la_SOURCES = readdir-ahead.c +readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = readdir-ahead.h readdir-ahead-mem-types.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h new file mode 100644 index 000000000..39e2c5369 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef __RDA_MEM_TYPES_H__ +#define __RDA_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_rda_mem_types_ { + gf_rda_mt_rda_local = gf_common_mt_end + 1, + gf_rda_mt_rda_fd_ctx, + gf_rda_mt_rda_priv, + gf_rda_mt_end +}; + +#endif diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.c b/xlators/performance/readdir-ahead/src/readdir-ahead.c new file mode 100644 index 000000000..53e6756f0 --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead.c @@ -0,0 +1,560 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +/* + * performance/readdir-ahead preloads a local buffer with directory entries + * on opendir. The optimization involves using maximum sized gluster rpc + * requests (128k) to minimize overhead of smaller client requests. + * + * For example, fuse currently supports a maximum readdir buffer of 4k + * (regardless of the filesystem client's buffer size). readdir-ahead should + * effectively convert these smaller requests into fewer, larger sized requests + * for simple, sequential workloads (i.e., ls). + * + * The translator is currently designed to handle the simple, sequential case + * only. If a non-sequential directory read occurs, readdir-ahead disables + * preloads on the directory. + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "call-stub.h" +#include "readdir-ahead.h" +#include "readdir-ahead-mem-types.h" +#include "defaults.h" + +static int rda_fill_fd(call_frame_t *, xlator_t *, fd_t *); + +/* + * Get (or create) the fd context for storing prepopulated directory + * entries. + */ +static struct +rda_fd_ctx *get_rda_fd_ctx(fd_t *fd, xlator_t *this) +{ + uint64_t val; + struct rda_fd_ctx *ctx; + + LOCK(&fd->lock); + + if (__fd_ctx_get(fd, this, &val) < 0) { + ctx = GF_CALLOC(1, sizeof(struct rda_fd_ctx), + gf_rda_mt_rda_fd_ctx); + if (!ctx) + goto out; + + LOCK_INIT(&ctx->lock); + INIT_LIST_HEAD(&ctx->entries.list); + ctx->state = RDA_FD_NEW; + /* ctx offset values initialized to 0 */ + + if (__fd_ctx_set(fd, this, (uint64_t) ctx) < 0) { + GF_FREE(ctx); + ctx = NULL; + goto out; + } + } else { + ctx = (struct rda_fd_ctx *) val; + } +out: + UNLOCK(&fd->lock); + return ctx; +} + +/* + * Reset the tracking state of the context. + */ +static void +rda_reset_ctx(struct rda_fd_ctx *ctx) +{ + ctx->state = RDA_FD_NEW; + ctx->cur_offset = 0; + ctx->cur_size = 0; + ctx->next_offset = 0; + gf_dirent_free(&ctx->entries); +} + +/* + * Check whether we can handle a request. Offset verification is done by the + * caller, so we only check whether the preload buffer has completion status + * (including an error) or has some data to return. + */ +static gf_boolean_t +rda_can_serve_readdirp(struct rda_fd_ctx *ctx, size_t request_size) +{ + if ((ctx->state & RDA_FD_EOD) || + (ctx->state & RDA_FD_ERROR) || + (!(ctx->state & RDA_FD_PLUGGED) && (ctx->cur_size > 0))) + return _gf_true; + + return _gf_false; +} + +/* + * Serve a request from the fd dentry list based on the size of the request + * buffer. ctx must be locked. + */ +static int32_t +__rda_serve_readdirp(xlator_t *this, gf_dirent_t *entries, size_t request_size, + struct rda_fd_ctx *ctx) +{ + gf_dirent_t *dirent, *tmp; + size_t dirent_size, size = 0; + int32_t count = 0; + struct rda_priv *priv = this->private; + + list_for_each_entry_safe(dirent, tmp, &ctx->entries.list, list) { + dirent_size = gf_dirent_size(dirent->d_name); + if (size + dirent_size > request_size) + break; + + size += dirent_size; + list_del_init(&dirent->list); + ctx->cur_size -= dirent_size; + + list_add_tail(&dirent->list, &entries->list); + ctx->cur_offset = dirent->d_off; + count++; + } + + if (ctx->cur_size <= priv->rda_low_wmark) + ctx->state |= RDA_FD_PLUGGED; + + return count; +} + +static int32_t +rda_readdirp_stub(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) +{ + gf_dirent_t entries; + int32_t ret; + struct rda_fd_ctx *ctx; + int op_errno = 0; + + ctx = get_rda_fd_ctx(fd, this); + INIT_LIST_HEAD(&entries.list); + ret = __rda_serve_readdirp(this, &entries, size, ctx); + + if (!ret && (ctx->state & RDA_FD_ERROR)) { + ret = -1; + op_errno = ctx->op_errno; + ctx->state &= ~RDA_FD_ERROR; + + /* + * the preload has stopped running in the event of an error, so + * pass all future requests along + */ + ctx->state |= RDA_FD_BYPASS; + } + + STACK_UNWIND_STRICT(readdirp, frame, ret, op_errno, &entries, xdata); + gf_dirent_free(&entries); + + return 0; +} + +static int32_t +rda_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *xdata) +{ + struct rda_fd_ctx *ctx; + call_stub_t *stub; + int fill = 0; + + ctx = get_rda_fd_ctx(fd, this); + if (!ctx) + goto err; + + if (ctx->state & RDA_FD_BYPASS) + goto bypass; + + LOCK(&ctx->lock); + + /* recheck now that we have the lock */ + if (ctx->state & RDA_FD_BYPASS) { + UNLOCK(&ctx->lock); + goto bypass; + } + + /* + * If a new read comes in at offset 0 and the buffer has been + * completed, reset the context and kickstart the filler again. + */ + if (!off && (ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)) { + rda_reset_ctx(ctx); + fill = 1; + } + + /* + * If a readdir occurs at an unexpected offset or we already have a + * request pending, admit defeat and just get out of the way. + */ + if (off != ctx->cur_offset || ctx->stub) { + ctx->state |= RDA_FD_BYPASS; + UNLOCK(&ctx->lock); + goto bypass; + } + + stub = fop_readdirp_stub(frame, rda_readdirp_stub, fd, size, off, xdata); + if (!stub) { + UNLOCK(&ctx->lock); + goto err; + } + + /* + * If we haven't bypassed the preload, this means we can either serve + * the request out of the preload or the request that enables us to do + * so is in flight... + */ + if (rda_can_serve_readdirp(ctx, size)) + call_resume(stub); + else + ctx->stub = stub; + + UNLOCK(&ctx->lock); + + if (fill) + rda_fill_fd(frame, this, fd); + + return 0; + +bypass: + STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata); + return 0; + +err: + STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +static int32_t +rda_fill_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) +{ + gf_dirent_t *dirent, *tmp; + struct rda_local *local = frame->local; + struct rda_fd_ctx *ctx = local->ctx; + struct rda_priv *priv = this->private; + int fill = 1; + + LOCK(&ctx->lock); + + /* Verify that the preload buffer is still pending on this data. */ + if (ctx->next_offset != local->offset) { + gf_log(this->name, GF_LOG_ERROR, + "Out of sequence directory preload."); + ctx->state |= (RDA_FD_BYPASS|RDA_FD_ERROR); + ctx->op_errno = EUCLEAN; + + goto out; + } + + if (entries) { + list_for_each_entry_safe(dirent, tmp, &entries->list, list) { + list_del_init(&dirent->list); + /* must preserve entry order */ + list_add_tail(&dirent->list, &ctx->entries.list); + + ctx->cur_size += gf_dirent_size(dirent->d_name); + ctx->next_offset = dirent->d_off; + } + } + + if (ctx->cur_size >= priv->rda_high_wmark) + ctx->state &= ~RDA_FD_PLUGGED; + + if (!op_ret) { + /* we've hit eod */ + ctx->state &= ~RDA_FD_RUNNING; + ctx->state |= RDA_FD_EOD; + } else if (op_ret == -1) { + /* kill the preload and pend the error */ + ctx->state &= ~RDA_FD_RUNNING; + ctx->state |= RDA_FD_ERROR; + ctx->op_errno = op_errno; + } + + /* + * NOTE: The strict bypass logic in readdirp() means a pending request + * is always based on ctx->cur_offset. + */ + if (ctx->stub && + rda_can_serve_readdirp(ctx, ctx->stub->args.size)) { + call_resume(ctx->stub); + ctx->stub = NULL; + } + +out: + /* + * If we have been marked for bypass and have no pending stub, clear the + * run state so we stop preloading the context with entries. + */ + if ((ctx->state & RDA_FD_BYPASS) && !ctx->stub) + ctx->state &= ~RDA_FD_RUNNING; + + if (!(ctx->state & RDA_FD_RUNNING)) { + fill = 0; + STACK_DESTROY(ctx->fill_frame->root); + ctx->fill_frame = NULL; + } + + UNLOCK(&ctx->lock); + + if (fill) + rda_fill_fd(frame, this, local->fd); + + return 0; +} + +/* + * Start prepopulating the fd context with directory entries. + */ +static int +rda_fill_fd(call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + call_frame_t *nframe = NULL; + struct rda_local *local = NULL; + struct rda_fd_ctx *ctx; + off_t offset; + struct rda_priv *priv = this->private; + + ctx = get_rda_fd_ctx(fd, this); + if (!ctx) + goto err; + + LOCK(&ctx->lock); + + if (ctx->state & RDA_FD_NEW) { + ctx->state &= ~RDA_FD_NEW; + ctx->state |= RDA_FD_RUNNING; + if (priv->rda_low_wmark) + ctx->state |= RDA_FD_PLUGGED; + } + + offset = ctx->next_offset; + + if (!ctx->fill_frame) { + nframe = copy_frame(frame); + if (!nframe) { + UNLOCK(&ctx->lock); + goto err; + } + + local = mem_get0(this->local_pool); + if (!local) { + UNLOCK(&ctx->lock); + goto err; + } + + local->ctx = ctx; + local->fd = fd; + nframe->local = local; + + ctx->fill_frame = nframe; + } else { + nframe = ctx->fill_frame; + local = nframe->local; + } + + local->offset = offset; + + UNLOCK(&ctx->lock); + + STACK_WIND(nframe, rda_fill_fd_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, priv->rda_req_size, + offset, NULL); + + return 0; + +err: + if (nframe) + FRAME_DESTROY(nframe); + + return -1; +} + +static int32_t +rda_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +{ + if (!op_ret) + rda_fill_fd(frame, this, fd); + + STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata); + return 0; +} + +static int32_t +rda_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) +{ + STACK_WIND(frame, rda_opendir_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, loc, fd, xdata); + return 0; +} + +static int32_t +rda_releasedir(xlator_t *this, fd_t *fd) +{ + uint64_t val; + struct rda_fd_ctx *ctx; + + if (fd_ctx_del(fd, this, &val) < 0) + return -1; + + ctx = (struct rda_fd_ctx *) val; + if (!ctx) + return 0; + + rda_reset_ctx(ctx); + + if (ctx->fill_frame) + STACK_DESTROY(ctx->fill_frame->root); + + if (ctx->stub) + gf_log(this->name, GF_LOG_ERROR, + "released a directory with a pending stub"); + + GF_FREE(ctx); + return 0; +} + +int32_t +mem_acct_init(xlator_t *this) +{ + int ret = -1; + + if (!this) + goto out; + + ret = xlator_mem_acct_init(this, gf_rda_mt_end + 1); + + if (ret != 0) + gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + +out: + return ret; +} + +int +reconfigure(xlator_t *this, dict_t *options) +{ + struct rda_priv *priv = this->private; + + GF_OPTION_RECONF("rda-request-size", priv->rda_req_size, options, + uint32, err); + GF_OPTION_RECONF("rda-low-wmark", priv->rda_low_wmark, options, size, + err); + GF_OPTION_RECONF("rda-high-wmark", priv->rda_high_wmark, options, size, + err); + + return 0; +err: + return -1; +} + +int +init(xlator_t *this) +{ + struct rda_priv *priv = NULL; + + GF_VALIDATE_OR_GOTO("readdir-ahead", this, err); + + if (!this->children || this->children->next) { + gf_log(this->name, GF_LOG_ERROR, + "FATAL: readdir-ahead not configured with exactly one" + " child"); + goto err; + } + + if (!this->parents) { + gf_log(this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + priv = GF_CALLOC(1, sizeof(struct rda_priv), gf_rda_mt_rda_priv); + if (!priv) + goto err; + this->private = priv; + + this->local_pool = mem_pool_new(struct rda_local, 32); + if (!this->local_pool) + goto err; + + GF_OPTION_INIT("rda-request-size", priv->rda_req_size, uint32, err); + GF_OPTION_INIT("rda-low-wmark", priv->rda_low_wmark, size, err); + GF_OPTION_INIT("rda-high-wmark", priv->rda_high_wmark, size, err); + + return 0; + +err: + if (this->local_pool) + mem_pool_destroy(this->local_pool); + if (priv) + GF_FREE(priv); + + return -1; +} + + +void +fini(xlator_t *this) +{ + GF_VALIDATE_OR_GOTO ("readdir-ahead", this, out); + + GF_FREE(this->private); + +out: + return; +} + +struct xlator_fops fops = { + .opendir = rda_opendir, + .readdirp = rda_readdirp, +}; + +struct xlator_cbks cbks = { + .releasedir = rda_releasedir, +}; + +struct volume_options options[] = { + { .key = {"rda-request-size"}, + .type = GF_OPTION_TYPE_INT, + .min = 4096, + .max = 131072, + .default_value = "131072", + .description = "readdir-ahead request size", + }, + { .key = {"rda-low-wmark"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 10 * GF_UNIT_MB, + .default_value = "4096", + .description = "the value under which we plug", + }, + { .key = {"rda-high-wmark"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 0, + .max = 100 * GF_UNIT_MB, + .default_value = "131072", + .description = "the value over which we unplug", + }, + { .key = {NULL} }, +}; + diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.h b/xlators/performance/readdir-ahead/src/readdir-ahead.h new file mode 100644 index 000000000..e48786dae --- /dev/null +++ b/xlators/performance/readdir-ahead/src/readdir-ahead.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef __READDIR_AHEAD_H +#define __READDIR_AHEAD_H + +/* state flags */ +#define RDA_FD_NEW (1 << 0) +#define RDA_FD_RUNNING (1 << 1) +#define RDA_FD_EOD (1 << 2) +#define RDA_FD_ERROR (1 << 3) +#define RDA_FD_BYPASS (1 << 4) +#define RDA_FD_PLUGGED (1 << 5) + +struct rda_fd_ctx { + off_t cur_offset; /* current head of the ctx */ + size_t cur_size; /* current size of the preload */ + off_t next_offset; /* tail of the ctx */ + uint32_t state; + gf_lock_t lock; + gf_dirent_t entries; + call_frame_t *fill_frame; + call_stub_t *stub; + int op_errno; +}; + +struct rda_local { + struct rda_fd_ctx *ctx; + fd_t *fd; + off_t offset; +}; + +struct rda_priv { + uint32_t rda_req_size; + uint64_t rda_low_wmark; + uint64_t rda_high_wmark; +}; + +#endif /* __READDIR_AHEAD_H */ diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index 6a2fcc40f..95c5921c6 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -213,11 +213,7 @@ wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno) int32_t tmp = 0; if (fd_ctx_get (fd, this, &value) == 0) { - if (value != EBADF) { - fd_ctx_set (fd, this, EBADF); - } - - if (op_errno != NULL) { + if (op_errno) { tmp = value; *op_errno = tmp; } @@ -749,7 +745,7 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } while (0) -void +int wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) { struct iovec vector[MAX_VECTOR_COUNT]; @@ -799,22 +795,23 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head) head->stub->args.flags, head->stub->args.iobref, NULL); - return; + return 0; err: if (!fderr) { /* frame creation failure */ - wb_fulfill_err (head, ENOMEM); + fderr = ENOMEM; + wb_fulfill_err (head, fderr); } wb_head_done (head); - return; + return fderr; } #define NEXT_HEAD(head, req) do { \ if (head) \ - wb_fulfill_head (wb_inode, head); \ + ret |= wb_fulfill_head (wb_inode, head); \ head = req; \ expected_offset = req->stub->args.offset + \ req->write_size; \ @@ -823,7 +820,7 @@ err: } while (0) -void +int wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities) { wb_request_t *req = NULL; @@ -833,6 +830,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities) off_t expected_offset = 0; size_t curr_aggregate = 0; size_t vector_count = 0; + int ret = 0; conf = wb_inode->this->private; @@ -876,8 +874,9 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities) } if (head) - wb_fulfill_head (wb_inode, head); - return; + ret |= wb_fulfill_head (wb_inode, head); + + return ret; } @@ -1162,27 +1161,35 @@ wb_process_queue (wb_inode_t *wb_inode) list_head_t tasks = {0, }; list_head_t lies = {0, }; list_head_t liabilities = {0, }; + int retry = 0; INIT_LIST_HEAD (&tasks); INIT_LIST_HEAD (&lies); INIT_LIST_HEAD (&liabilities); - LOCK (&wb_inode->lock); - { - __wb_preprocess_winds (wb_inode); + do { + LOCK (&wb_inode->lock); + { + __wb_preprocess_winds (wb_inode); - __wb_pick_winds (wb_inode, &tasks, &liabilities); + __wb_pick_winds (wb_inode, &tasks, &liabilities); - __wb_pick_unwinds (wb_inode, &lies); + __wb_pick_unwinds (wb_inode, &lies); - } - UNLOCK (&wb_inode->lock); + } + UNLOCK (&wb_inode->lock); - wb_do_unwinds (wb_inode, &lies); + wb_do_unwinds (wb_inode, &lies); - wb_do_winds (wb_inode, &tasks); + wb_do_winds (wb_inode, &tasks); - wb_fulfill (wb_inode, &liabilities); + /* fd might've been marked bad due to previous errors. + * Since, caller of wb_process_queue might be the last fop on + * inode, make sure we keep processing request queue, till there + * are no requests left. + */ + retry = wb_fulfill (wb_inode, &liabilities); + } while (retry); return; } @@ -1413,7 +1420,7 @@ wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) if (!wb_enqueue (wb_inode, stub)) goto unwind; - wb_process_queue (wb_inode); + wb_process_queue (wb_inode); return 0; diff --git a/xlators/playground/Makefile.am b/xlators/playground/Makefile.am new file mode 100644 index 000000000..e7de6b31a --- /dev/null +++ b/xlators/playground/Makefile.am @@ -0,0 +1,2 @@ +SUBDIRS = template +CLEANFILES = diff --git a/xlators/playground/template/Makefile.am b/xlators/playground/template/Makefile.am new file mode 100644 index 000000000..f26892443 --- /dev/null +++ b/xlators/playground/template/Makefile.am @@ -0,0 +1,2 @@ +SUBDIRS = src + diff --git a/xlators/playground/template/src/Makefile.am b/xlators/playground/template/src/Makefile.am new file mode 100644 index 000000000..21f1c5f6b --- /dev/null +++ b/xlators/playground/template/src/Makefile.am @@ -0,0 +1,16 @@ +xlator_LTLIBRARIES = template.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/features + +template_la_LDFLAGS = -module -avoid-version + +template_la_SOURCES = template.c +template_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = template.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src + +AM_CFLAGS = -Wall $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/playground/template/src/template.c b/xlators/playground/template/src/template.c new file mode 100644 index 000000000..37a7794a0 --- /dev/null +++ b/xlators/playground/template/src/template.c @@ -0,0 +1,49 @@ +/* + Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "template.h" + +int32_t +init (xlator_t *this) +{ + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "not configured with exactly one child. exiting"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + return 0; +} + +void +fini (xlator_t *this) +{ + return; +} + +struct xlator_fops fops = { +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {NULL} }, +}; diff --git a/xlators/playground/template/src/template.h b/xlators/playground/template/src/template.h new file mode 100644 index 000000000..d6aced501 --- /dev/null +++ b/xlators/playground/template/src/template.h @@ -0,0 +1,24 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef __TEMPLATE_H__ +#define __TEMPLATE_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" + +#endif /* __TEMPLATE_H__ */ diff --git a/xlators/protocol/auth/addr/src/addr.c b/xlators/protocol/auth/addr/src/addr.c index 199fc6db0..64e8d0fc6 100644 --- a/xlators/protocol/auth/addr/src/addr.c +++ b/xlators/protocol/auth/addr/src/addr.c @@ -2,19 +2,10 @@ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ diff --git a/xlators/protocol/auth/login/src/login.c b/xlators/protocol/auth/login/src/login.c index 702a876ac..c2f0bf0d0 100644 --- a/xlators/protocol/auth/login/src/login.c +++ b/xlators/protocol/auth/login/src/login.c @@ -2,19 +2,10 @@ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c index adf53f15e..5668fea53 100644 --- a/xlators/protocol/client/src/client-handshake.c +++ b/xlators/protocol/client/src/client-handshake.c @@ -53,7 +53,7 @@ rpc_client_ping_timer_expired (void *data) rpc_clnt_connection_t *conn = NULL; int disconnect = 0; int transport_activity = 0; - struct timeval timeout = {0, }; + struct timespec timeout = {0, }; struct timeval current = {0, }; struct rpc_clnt *clnt = NULL; xlator_t *this = NULL; @@ -101,7 +101,7 @@ rpc_client_ping_timer_expired (void *data) "ping timer expired but transport activity " "detected - not bailing transport"); timeout.tv_sec = conf->opt.ping_timeout; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; conn->ping_timer = gf_timer_call_after (this->ctx, timeout, @@ -140,7 +140,7 @@ client_start_ping (void *data) clnt_conf_t *conf = NULL; rpc_clnt_connection_t *conn = NULL; int32_t ret = -1; - struct timeval timeout = {0, }; + struct timespec timeout = {0, }; call_frame_t *frame = NULL; int frame_count = 0; @@ -196,7 +196,7 @@ client_start_ping (void *data) } timeout.tv_sec = conf->opt.ping_timeout; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; conn->ping_timer = gf_timer_call_after (this->ctx, timeout, @@ -241,7 +241,7 @@ client_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, { xlator_t *this = NULL; rpc_clnt_connection_t *conn = NULL; - struct timeval timeout = {0, }; + struct timespec timeout = {0, }; call_frame_t *frame = NULL; clnt_conf_t *conf = NULL; @@ -281,7 +281,7 @@ client_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, timeout.tv_sec = conf->opt.ping_timeout; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; gf_timer_call_cancel (this->ctx, conn->ping_timer); diff --git a/xlators/protocol/client/src/client-rpc-fops.c b/xlators/protocol/client/src/client-rpc-fops.c index 71ca3fbd2..6355450c3 100644 --- a/xlators/protocol/client/src/client-rpc-fops.c +++ b/xlators/protocol/client/src/client-rpc-fops.c @@ -1936,6 +1936,172 @@ out: return 0; } +int +client3_3_fallocate_cbk (struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +{ + call_frame_t *frame = NULL; + gfs3_fallocate_rsp rsp = {0,}; + struct iatt prestat = {0,}; + struct iatt poststat = {0,}; + int ret = 0; + xlator_t *this = NULL; + dict_t *xdata = NULL; + + + this = THIS; + + frame = myframe; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; + rsp.op_errno = ENOTCONN; + goto out; + } + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gfs3_fallocate_rsp); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed"); + rsp.op_ret = -1; + rsp.op_errno = EINVAL; + goto out; + } + + if (-1 != rsp.op_ret) { + gf_stat_to_iatt (&rsp.statpre, &prestat); + gf_stat_to_iatt (&rsp.statpost, &poststat); + } + + GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val), + (rsp.xdata.xdata_len), ret, + rsp.op_errno, out); + +out: + if (rsp.op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s", + strerror (gf_error_to_errno (rsp.op_errno))); + } + CLIENT_STACK_UNWIND (fallocate, frame, rsp.op_ret, + gf_error_to_errno (rsp.op_errno), &prestat, + &poststat, xdata); + + free (rsp.xdata.xdata_val); + + if (xdata) + dict_unref (xdata); + + return 0; +} + +int +client3_3_discard_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +{ + call_frame_t *frame = NULL; + gfs3_discard_rsp rsp = {0,}; + struct iatt prestat = {0,}; + struct iatt poststat = {0,}; + int ret = 0; + xlator_t *this = NULL; + dict_t *xdata = NULL; + + this = THIS; + + frame = myframe; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; + rsp.op_errno = ENOTCONN; + goto out; + } + ret = xdr_to_generic(*iov, &rsp, (xdrproc_t) xdr_gfs3_discard_rsp); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed"); + rsp.op_ret = -1; + rsp.op_errno = EINVAL; + goto out; + } + + if (-1 != rsp.op_ret) { + gf_stat_to_iatt (&rsp.statpre, &prestat); + gf_stat_to_iatt (&rsp.statpost, &poststat); + } + + GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val), + (rsp.xdata.xdata_len), ret, + rsp.op_errno, out); + +out: + if (rsp.op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s", + strerror (gf_error_to_errno (rsp.op_errno))); + } + CLIENT_STACK_UNWIND (discard, frame, rsp.op_ret, + gf_error_to_errno (rsp.op_errno), &prestat, + &poststat, xdata); + + free (rsp.xdata.xdata_val); + + if (xdata) + dict_unref (xdata); + + return 0; +} + +int +client3_3_zerofill_cbk(struct rpc_req *req, struct iovec *iov, int count, + void *myframe) +{ + call_frame_t *frame = NULL; + gfs3_zerofill_rsp rsp = {0,}; + struct iatt prestat = {0,}; + struct iatt poststat = {0,}; + int ret = 0; + xlator_t *this = NULL; + dict_t *xdata = NULL; + + this = THIS; + + frame = myframe; + + if (-1 == req->rpc_status) { + rsp.op_ret = -1; + rsp.op_errno = ENOTCONN; + goto out; + } + ret = xdr_to_generic(*iov, &rsp, (xdrproc_t) xdr_gfs3_zerofill_rsp); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed"); + rsp.op_ret = -1; + rsp.op_errno = EINVAL; + goto out; + } + + if (-1 != rsp.op_ret) { + gf_stat_to_iatt (&rsp.statpre, &prestat); + gf_stat_to_iatt (&rsp.statpost, &poststat); + } + + GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val), + (rsp.xdata.xdata_len), ret, + rsp.op_errno, out); + +out: + if (rsp.op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "remote operation failed: %s", + strerror (gf_error_to_errno (rsp.op_errno))); + } + CLIENT_STACK_UNWIND (zerofill, frame, rsp.op_ret, + gf_error_to_errno (rsp.op_errno), &prestat, + &poststat, xdata); + + free (rsp.xdata.xdata_val); + + if (xdata) + dict_unref (xdata); + + return 0; +} int client3_3_setattr_cbk (struct rpc_req *req, struct iovec *iov, int count, @@ -5786,7 +5952,140 @@ unwind: return 0; } +int32_t +client3_3_fallocate(call_frame_t *frame, xlator_t *this, void *data) +{ + clnt_args_t *args = NULL; + int64_t remote_fd = -1; + clnt_conf_t *conf = NULL; + gfs3_fallocate_req req = {{0},}; + int op_errno = ESTALE; + int ret = 0; + + if (!frame || !this || !data) + goto unwind; + + args = data; + conf = this->private; + + CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD, + remote_fd, op_errno, unwind); + + req.fd = remote_fd; + req.flags = args->flags; + req.offset = args->offset; + req.size = args->size; + memcpy(req.gfid, args->fd->inode->gfid, 16); + GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val), + req.xdata.xdata_len, op_errno, unwind); + + ret = client_submit_request (this, &req, frame, conf->fops, + GFS3_OP_FALLOCATE, + client3_3_fallocate_cbk, NULL, + NULL, 0, NULL, 0, + NULL, (xdrproc_t)xdr_gfs3_fallocate_req); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, "failed to send the fop"); + } + + GF_FREE (req.xdata.xdata_val); + + return 0; +unwind: + CLIENT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + GF_FREE (req.xdata.xdata_val); + + return 0; +} + +int32_t +client3_3_discard(call_frame_t *frame, xlator_t *this, void *data) +{ + clnt_args_t *args = NULL; + int64_t remote_fd = -1; + clnt_conf_t *conf = NULL; + gfs3_discard_req req = {{0},}; + int op_errno = ESTALE; + int ret = 0; + + if (!frame || !this || !data) + goto unwind; + + args = data; + conf = this->private; + + CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD, + remote_fd, op_errno, unwind); + + req.fd = remote_fd; + req.offset = args->offset; + req.size = args->size; + memcpy(req.gfid, args->fd->inode->gfid, 16); + + GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val), + req.xdata.xdata_len, op_errno, unwind); + + ret = client_submit_request(this, &req, frame, conf->fops, + GFS3_OP_DISCARD, client3_3_discard_cbk, + NULL, NULL, 0, NULL, 0, NULL, + (xdrproc_t) xdr_gfs3_discard_req); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "failed to send the fop"); + + GF_FREE (req.xdata.xdata_val); + + return 0; +unwind: + CLIENT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + GF_FREE (req.xdata.xdata_val); + + return 0; +} + +int32_t +client3_3_zerofill(call_frame_t *frame, xlator_t *this, void *data) +{ + clnt_args_t *args = NULL; + int64_t remote_fd = -1; + clnt_conf_t *conf = NULL; + gfs3_zerofill_req req = {{0},}; + int op_errno = ESTALE; + int ret = 0; + + if (!frame || !this || !data) + goto unwind; + + args = data; + conf = this->private; + + CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD, + remote_fd, op_errno, unwind); + + req.fd = remote_fd; + req.offset = args->offset; + req.size = args->size; + memcpy(req.gfid, args->fd->inode->gfid, 16); + + GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val), + req.xdata.xdata_len, op_errno, unwind); + + ret = client_submit_request(this, &req, frame, conf->fops, + GFS3_OP_ZEROFILL, client3_3_zerofill_cbk, + NULL, NULL, 0, NULL, 0, NULL, + (xdrproc_t) xdr_gfs3_zerofill_req); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "failed to send the fop"); + + GF_FREE (req.xdata.xdata_val); + + return 0; +unwind: + CLIENT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + GF_FREE (req.xdata.xdata_val); + + return 0; +} /* Table Specific to FOPS */ @@ -5833,6 +6132,9 @@ rpc_clnt_procedure_t clnt3_3_fop_actors[GF_FOP_MAXVALUE] = { [GF_FOP_SETATTR] = { "SETATTR", client3_3_setattr }, [GF_FOP_FSETATTR] = { "FSETATTR", client3_3_fsetattr }, [GF_FOP_READDIRP] = { "READDIRP", client3_3_readdirp }, + [GF_FOP_FALLOCATE] = { "FALLOCATE", client3_3_fallocate }, + [GF_FOP_DISCARD] = { "DISCARD", client3_3_discard }, + [GF_FOP_ZEROFILL] = { "ZEROFILL", client3_3_zerofill}, [GF_FOP_RELEASE] = { "RELEASE", client3_3_release }, [GF_FOP_RELEASEDIR] = { "RELEASEDIR", client3_3_releasedir }, [GF_FOP_GETSPEC] = { "GETSPEC", client3_getspec }, @@ -5885,6 +6187,10 @@ char *clnt3_3_fop_names[GFS3_OP_MAXVALUE] = { [GFS3_OP_RELEASE] = "RELEASE", [GFS3_OP_RELEASEDIR] = "RELEASEDIR", [GFS3_OP_FREMOVEXATTR] = "FREMOVEXATTR", + [GFS3_OP_FALLOCATE] = "FALLOCATE", + [GFS3_OP_DISCARD] = "DISCARD", + [GFS3_OP_ZEROFILL] = "ZEROFILL", + }; rpc_clnt_prog_t clnt3_3_fop_prog = { diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index e24282e28..1f7d13ea4 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -130,7 +130,7 @@ client_register_grace_timer (xlator_t *this, clnt_conf_t *conf) conf->grace_timer = gf_timer_call_after (this->ctx, - conf->grace_tv, + conf->grace_ts, client_grace_timeout, conf->rpc); } @@ -1961,6 +1961,110 @@ out: return 0; } +int32_t +client_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + int ret = -1; + clnt_conf_t *conf = NULL; + rpc_clnt_procedure_t *proc = NULL; + clnt_args_t args = {0,}; + + conf = this->private; + if (!conf || !conf->fops) + goto out; + + args.fd = fd; + args.flags = mode; + args.offset = offset; + args.size = len; + args.xdata = xdata; + + proc = &conf->fops->proctable[GF_FOP_FALLOCATE]; + if (!proc) { + gf_log (this->name, GF_LOG_ERROR, + "rpc procedure not found for %s", + gf_fop_list[GF_FOP_FALLOCATE]); + goto out; + } + if (proc->fn) + ret = proc->fn (frame, this, &args); +out: + if (ret) + STACK_UNWIND_STRICT (fallocate, frame, -1, ENOTCONN, NULL, NULL, NULL); + + return 0; +} + +int32_t +client_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int ret = -1; + clnt_conf_t *conf = NULL; + rpc_clnt_procedure_t *proc = NULL; + clnt_args_t args = {0,}; + + conf = this->private; + if (!conf || !conf->fops) + goto out; + + args.fd = fd; + args.offset = offset; + args.size = len; + args.xdata = xdata; + + proc = &conf->fops->proctable[GF_FOP_DISCARD]; + if (!proc) { + gf_log (this->name, GF_LOG_ERROR, + "rpc procedure not found for %s", + gf_fop_list[GF_FOP_DISCARD]); + goto out; + } + if (proc->fn) + ret = proc->fn (frame, this, &args); +out: + if (ret) + STACK_UNWIND_STRICT(discard, frame, -1, ENOTCONN, NULL, NULL, NULL); + + return 0; +} + +int32_t +client_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int ret = -1; + clnt_conf_t *conf = NULL; + rpc_clnt_procedure_t *proc = NULL; + clnt_args_t args = {0,}; + + conf = this->private; + if (!conf || !conf->fops) + goto out; + + args.fd = fd; + args.offset = offset; + args.size = len; + args.xdata = xdata; + + proc = &conf->fops->proctable[GF_FOP_ZEROFILL]; + if (!proc) { + gf_log (this->name, GF_LOG_ERROR, + "rpc procedure not found for %s", + gf_fop_list[GF_FOP_ZEROFILL]); + goto out; + } + if (proc->fn) + ret = proc->fn (frame, this, &args); +out: + if (ret) + STACK_UNWIND_STRICT(zerofill, frame, -1, ENOTCONN, + NULL, NULL, NULL); + + return 0; +} + int32_t client_getspec (call_frame_t *frame, xlator_t *this, const char *key, @@ -2346,14 +2450,14 @@ client_init_grace_timer (xlator_t *this, dict_t *options, ret = dict_get_int32 (options, "grace-timeout", &grace_timeout); if (!ret) - conf->grace_tv.tv_sec = grace_timeout; + conf->grace_ts.tv_sec = grace_timeout; else - conf->grace_tv.tv_sec = 10; + conf->grace_ts.tv_sec = 10; - conf->grace_tv.tv_usec = 0; + conf->grace_ts.tv_nsec = 0; gf_log (this->name, GF_LOG_DEBUG, "Client grace timeout " - "value = %"PRIu64, conf->grace_tv.tv_sec); + "value = %"PRIu64, conf->grace_ts.tv_sec); ret = 0; out: @@ -2679,6 +2783,9 @@ struct xlator_fops fops = { .fxattrop = client_fxattrop, .setattr = client_setattr, .fsetattr = client_fsetattr, + .fallocate = client_fallocate, + .discard = client_discard, + .zerofill = client_zerofill, .getspec = client_getspec, }; diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h index 37ba264ce..afab2d74f 100644 --- a/xlators/protocol/client/src/client.h +++ b/xlators/protocol/client/src/client.h @@ -104,7 +104,7 @@ typedef struct clnt_conf { uint16_t lk_version; /* this variable is used to distinguish client-server transaction while performing lock healing */ - struct timeval grace_tv; + struct timespec grace_ts; gf_timer_t *grace_timer; gf_boolean_t grace_timer_needed; /* The state of this flag will be used to decide whether diff --git a/xlators/protocol/server/src/Makefile.am b/xlators/protocol/server/src/Makefile.am index 25d6706cc..6a18bf025 100644 --- a/xlators/protocol/server/src/Makefile.am +++ b/xlators/protocol/server/src/Makefile.am @@ -16,9 +16,9 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) \ -I$(top_srcdir)/libglusterfs/src \ -DCONFDIR=\"$(sysconfdir)/glusterfs\" \ -DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" \ - -I$(top_srcdir)/xlators/protocol/lib/src \ - -I$(top_srcdir)/rpc/rpc-lib/src/ \ - -I$(top_srcdir)/rpc/xdr/src/ + -I$(top_srcdir)/xlators/protocol/lib/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/rpc/xdr/src AM_CFLAGS = -Wall $(GF_CFLAGS) \ -DDATADIR=\"$(localstatedir)\" diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c index ed3d75860..d4941011d 100644 --- a/xlators/protocol/server/src/server-handshake.c +++ b/xlators/protocol/server/src/server-handshake.c @@ -94,9 +94,9 @@ _volfile_update_checksum (xlator_t *this, char *key, uint32_t checksum) if (temp_volfile->checksum != checksum) { gf_log (this->name, GF_LOG_INFO, - "the volume file got modified between earlier access " - "and now, this may lead to inconsistency between " - "clients, advised to remount client"); + "the volume file was modified between a prior access " + "and now. This may lead to inconsistency between " + "clients, you are advised to remount client"); temp_volfile->checksum = checksum; } @@ -109,10 +109,10 @@ static size_t getspec_build_volfile_path (xlator_t *this, const char *key, char *path, size_t path_len) { - int ret = -1; + char *filename = NULL; + server_conf_t *conf = NULL; + int ret = -1; int free_filename = 0; - char *filename = NULL; - server_conf_t *conf = NULL; char data_key[256] = {0,}; conf = this->private; @@ -329,14 +329,15 @@ server_setvolume (rpcsvc_request_t *req) { gf_setvolume_req args = {{0,},}; gf_setvolume_rsp rsp = {0,}; - server_connection_t *conn = NULL; + client_t *client = NULL; + server_ctx_t *serv_ctx = NULL; server_conf_t *conf = NULL; peer_info_t *peerinfo = NULL; dict_t *reply = NULL; dict_t *config_params = NULL; dict_t *params = NULL; char *name = NULL; - char *process_uuid = NULL; + char *client_uid = NULL; char *clnt_version = NULL; xlator_t *xl = NULL; char *msg = NULL; @@ -393,7 +394,7 @@ server_setvolume (rpcsvc_request_t *req) params->extra_free = buf; buf = NULL; - ret = dict_get_str (params, "process-uuid", &process_uuid); + ret = dict_get_str (params, "process-uuid", &client_uid); if (ret < 0) { ret = dict_set_str (reply, "ERROR", "UUID not specified"); @@ -420,25 +421,32 @@ server_setvolume (rpcsvc_request_t *req) goto fail; } - conn = server_connection_get (this, process_uuid); - if (!conn) { + client = gf_client_get (this, &req->cred, client_uid); + if (client == NULL) { op_ret = -1; op_errno = ENOMEM; goto fail; } - gf_log (this->name, GF_LOG_DEBUG, "Connected to %s", conn->id); - cancelled = server_cancel_conn_timer (this, conn); - if (cancelled)//Do connection_put on behalf of grace-timer-handler. - server_connection_put (this, conn, NULL); - if (conn->lk_version != 0 && - conn->lk_version != lk_version) { - (void) server_connection_cleanup (this, conn, + gf_log (this->name, GF_LOG_DEBUG, "Connected to %s", client->client_uid); + cancelled = server_cancel_grace_timer (this, client); + if (cancelled)//Do gf_client_put on behalf of grace-timer-handler. + gf_client_put (client, NULL); + + serv_ctx = server_ctx_get (client, client->this); + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed"); + goto fail; + } + + if (serv_ctx->lk_version != 0 && + serv_ctx->lk_version != lk_version) { + (void) server_connection_cleanup (this, client, INTERNAL_LOCKS | POSIX_LOCKS); } - if (req->trans->xl_private != conn) - req->trans->xl_private = conn; + if (req->trans->xl_private != client) + req->trans->xl_private = client; ret = dict_get_int32 (params, "fops-version", &fop_version); if (ret < 0) { @@ -563,10 +571,10 @@ server_setvolume (rpcsvc_request_t *req) gf_log (this->name, GF_LOG_INFO, "accepted client from %s (version: %s)", - conn->id, + client->client_uid, (clnt_version) ? clnt_version : "old"); op_ret = 0; - conn->bound_xl = xl; + client->bound_xl = xl; ret = dict_set_str (reply, "ERROR", "Success"); if (ret < 0) gf_log (this->name, GF_LOG_DEBUG, @@ -574,7 +582,7 @@ server_setvolume (rpcsvc_request_t *req) } else { gf_log (this->name, GF_LOG_ERROR, "Cannot authenticate client from %s %s", - conn->id, + client->client_uid, (clnt_version) ? clnt_version : "old"); op_ret = -1; @@ -586,7 +594,7 @@ server_setvolume (rpcsvc_request_t *req) goto fail; } - if (conn->bound_xl == NULL) { + if (client->bound_xl == NULL) { ret = dict_set_str (reply, "ERROR", "Check volfile and handshake " "options in protocol/client"); @@ -599,20 +607,21 @@ server_setvolume (rpcsvc_request_t *req) goto fail; } - if ((conn->bound_xl != NULL) && + if ((client->bound_xl != NULL) && (ret >= 0) && - (conn->bound_xl->itable == NULL)) { + (client->bound_xl->itable == NULL)) { /* create inode table for this bound_xl, if one doesn't already exist */ gf_log (this->name, GF_LOG_TRACE, "creating inode table with lru_limit=%"PRId32", " "xlator=%s", conf->inode_lru_limit, - conn->bound_xl->name); + client->bound_xl->name); /* TODO: what is this ? */ - conn->bound_xl->itable = inode_table_new (conf->inode_lru_limit, - conn->bound_xl); + client->bound_xl->itable = + inode_table_new (conf->inode_lru_limit, + client->bound_xl); } ret = dict_set_str (reply, "process-uuid", @@ -621,8 +630,7 @@ server_setvolume (rpcsvc_request_t *req) gf_log (this->name, GF_LOG_DEBUG, "failed to set 'process-uuid'"); - ret = dict_set_uint32 (reply, "clnt-lk-version", - conn->lk_version); + ret = dict_set_uint32 (reply, "clnt-lk-version", serv_ctx->lk_version); if (ret) gf_log (this->name, GF_LOG_WARNING, "failed to set 'clnt-lk-version'"); @@ -664,15 +672,15 @@ fail: * list of connections the server is maintaining and might segfault * during statedump when bound_xl of the connection is accessed. */ - if (op_ret && conn && !xl) { + if (op_ret && !xl) { /* We would have set the xl_private of the transport to the * @conn. But if we have put the connection i.e shutting down * the connection, then we should set xl_private to NULL as it * would be pointing to a freed memory and would segfault when * accessed upon getting DISCONNECT. */ - if (server_connection_put (this, conn, NULL) == NULL) - req->trans->xl_private = NULL; + gf_client_put (client, NULL); + req->trans->xl_private = NULL; } server_submit_reply (NULL, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_setvolume_rsp); @@ -709,12 +717,13 @@ server_ping (rpcsvc_request_t *req) int server_set_lk_version (rpcsvc_request_t *req) { - int op_ret = -1; - int op_errno = EINVAL; - gf_set_lk_ver_req args = {0, }; - gf_set_lk_ver_rsp rsp = {0,}; - server_connection_t *conn = NULL; - xlator_t *this = NULL; + int op_ret = -1; + int op_errno = EINVAL; + gf_set_lk_ver_req args = {0,}; + gf_set_lk_ver_rsp rsp = {0,}; + client_t *client = NULL; + server_ctx_t *serv_ctx = NULL; + xlator_t *this = NULL; this = req->svc->mydata; //TODO: Decide on an appropriate errno for the error-path @@ -730,9 +739,15 @@ server_set_lk_version (rpcsvc_request_t *req) goto fail; } - conn = server_connection_get (this, args.uid); - conn->lk_version = args.lk_ver; - server_connection_put (this, conn, NULL); + client = gf_client_get (this, &req->cred, args.uid); + serv_ctx = server_ctx_get (client, client->this); + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed"); + goto fail; + } + + serv_ctx->lk_version = args.lk_ver; + gf_client_put (client, NULL); rsp.lk_ver = args.lk_ver; @@ -749,11 +764,11 @@ fail: } rpcsvc_actor_t gluster_handshake_actors[] = { - [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, server_null, NULL, 0}, - [GF_HNDSK_SETVOLUME] = {"SETVOLUME", GF_HNDSK_SETVOLUME, server_setvolume, NULL, 0}, - [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0}, - [GF_HNDSK_PING] = {"PING", GF_HNDSK_PING, server_ping, NULL, 0}, - [GF_HNDSK_SET_LK_VER] = {"SET_LK_VER", GF_HNDSK_SET_LK_VER, server_set_lk_version, NULL, 0}, + [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, server_null, NULL, 0, DRC_NA}, + [GF_HNDSK_SETVOLUME] = {"SETVOLUME", GF_HNDSK_SETVOLUME, server_setvolume, NULL, 0, DRC_NA}, + [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA}, + [GF_HNDSK_PING] = {"PING", GF_HNDSK_PING, server_ping, NULL, 0, DRC_NA}, + [GF_HNDSK_SET_LK_VER] = {"SET_LK_VER", GF_HNDSK_SET_LK_VER, server_set_lk_version, NULL, 0, DRC_NA}, }; diff --git a/xlators/protocol/server/src/server-helpers.c b/xlators/protocol/server/src/server-helpers.c index 246a5ec6c..f0b040c74 100644 --- a/xlators/protocol/server/src/server-helpers.c +++ b/xlators/protocol/server/src/server-helpers.c @@ -26,6 +26,9 @@ server_decode_groups (call_frame_t *frame, rpcsvc_request_t *req) GF_VALIDATE_OR_GOTO ("server", frame, out); GF_VALIDATE_OR_GOTO ("server", req, out); + if (call_stack_alloc_groups (frame->root, req->auxgidcount) != 0) + return -1; + frame->root->ngrps = req->auxgidcount; if (frame->root->ngrps == 0) return 0; @@ -39,6 +42,7 @@ out: return 0; } + void server_loc_wipe (loc_t *loc) { @@ -70,11 +74,6 @@ server_resolve_wipe (server_resolve_t *resolve) void free_state (server_state_t *state) { - if (state->conn) { - //xprt_svc_unref (state->conn); - state->conn = NULL; - } - if (state->xprt) { rpc_transport_unref (state->xprt); state->xprt = NULL; @@ -123,280 +122,26 @@ free_state (server_state_t *state) } -int -gf_add_locker (server_connection_t *conn, const char *volume, - loc_t *loc, fd_t *fd, pid_t pid, gf_lkowner_t *owner, - glusterfs_fop_t type) -{ - int32_t ret = -1; - struct _locker *new = NULL; - struct _lock_table *table = NULL; - - GF_VALIDATE_OR_GOTO ("server", volume, out); - - new = GF_CALLOC (1, sizeof (struct _locker), gf_server_mt_locker_t); - if (new == NULL) { - goto out; - } - INIT_LIST_HEAD (&new->lockers); - - new->volume = gf_strdup (volume); - - if (fd == NULL) { - loc_copy (&new->loc, loc); - } else { - new->fd = fd_ref (fd); - } - - new->pid = pid; - new->owner = *owner; - - pthread_mutex_lock (&conn->lock); - { - table = conn->ltable; - if (type == GF_FOP_ENTRYLK) - list_add_tail (&new->lockers, &table->entrylk_lockers); - else - list_add_tail (&new->lockers, &table->inodelk_lockers); - } - pthread_mutex_unlock (&conn->lock); -out: - return ret; -} - - -int -gf_del_locker (server_connection_t *conn, const char *volume, - loc_t *loc, fd_t *fd, gf_lkowner_t *owner, - glusterfs_fop_t type) -{ - struct _locker *locker = NULL; - struct _locker *tmp = NULL; - int32_t ret = -1; - struct list_head *head = NULL; - struct _lock_table *table = NULL; - int found = 0; - - GF_VALIDATE_OR_GOTO ("server", volume, out); - - pthread_mutex_lock (&conn->lock); - { - table = conn->ltable; - if (type == GF_FOP_ENTRYLK) { - head = &table->entrylk_lockers; - } else { - head = &table->inodelk_lockers; - } - - list_for_each_entry_safe (locker, tmp, head, lockers) { - if (!is_same_lkowner (&locker->owner, owner) || - strcmp (locker->volume, volume)) - continue; - - if (locker->fd && fd && (locker->fd == fd)) - found = 1; - else if (locker->loc.inode && loc && - (locker->loc.inode == loc->inode)) - found = 1; - if (found) { - list_del_init (&locker->lockers); - break; - } - } - if (!found) - locker = NULL; - } - pthread_mutex_unlock (&conn->lock); - - if (locker) { - if (locker->fd) - fd_unref (locker->fd); - else - loc_wipe (&locker->loc); - - GF_FREE (locker->volume); - GF_FREE (locker); - } - - ret = 0; -out: - return ret; -} - -static struct _lock_table * -gf_lock_table_new (void) -{ - struct _lock_table *new = NULL; - - new = GF_CALLOC (1, sizeof (struct _lock_table), gf_server_mt_lock_table_t); - if (new == NULL) { - goto out; - } - INIT_LIST_HEAD (&new->entrylk_lockers); - INIT_LIST_HEAD (&new->inodelk_lockers); -out: - return new; -} - -static int -server_nop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - int ret = -1; - server_state_t *state = NULL; - - GF_VALIDATE_OR_GOTO ("server", frame, out); - GF_VALIDATE_OR_GOTO ("server", cookie, out); - GF_VALIDATE_OR_GOTO ("server", this, out); - - if (frame->root->trans) - server_conn_unref (frame->root->trans); - state = CALL_STATE(frame); - - if (state) - free_state (state); - STACK_DESTROY (frame->root); - - ret = 0; -out: - return ret; -} - -int -do_lock_table_cleanup (xlator_t *this, server_connection_t *conn, - call_frame_t *frame, struct _lock_table *ltable) -{ - struct list_head inodelk_lockers, entrylk_lockers; - call_frame_t *tmp_frame = NULL; - struct gf_flock flock = {0, }; - xlator_t *bound_xl = NULL; - struct _locker *locker = NULL, *tmp = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("server", this, out); - GF_VALIDATE_OR_GOTO ("server", conn, out); - GF_VALIDATE_OR_GOTO ("server", frame, out); - GF_VALIDATE_OR_GOTO ("server", ltable, out); - - bound_xl = conn->bound_xl; - INIT_LIST_HEAD (&inodelk_lockers); - INIT_LIST_HEAD (&entrylk_lockers); - - list_splice_init (<able->inodelk_lockers, - &inodelk_lockers); - - list_splice_init (<able->entrylk_lockers, &entrylk_lockers); - GF_FREE (ltable); - - flock.l_type = F_UNLCK; - flock.l_start = 0; - flock.l_len = 0; - list_for_each_entry_safe (locker, - tmp, &inodelk_lockers, lockers) { - tmp_frame = copy_frame (frame); - if (tmp_frame == NULL) { - goto out; - } - /* - lock owner = 0 is a special case that tells posix-locks - to release all locks from this transport - */ - tmp_frame->root->pid = 0; - tmp_frame->root->trans = server_conn_ref (conn); - memset (&tmp_frame->root->lk_owner, 0, sizeof (gf_lkowner_t)); - - if (locker->fd) { - gf_log (this->name, GF_LOG_DEBUG, "finodelk " - "released on inode with gfid %s", - uuid_utoa (locker->fd->inode->gfid)); - - STACK_WIND (tmp_frame, server_nop_cbk, bound_xl, - bound_xl->fops->finodelk, - locker->volume, - locker->fd, F_SETLK, &flock, NULL); - fd_unref (locker->fd); - } else { - gf_log (this->name, GF_LOG_DEBUG, "inodelk released " - "on %s", locker->loc.path); - - STACK_WIND (tmp_frame, server_nop_cbk, bound_xl, - bound_xl->fops->inodelk, - locker->volume, - &(locker->loc), F_SETLK, &flock, NULL); - loc_wipe (&locker->loc); - } - - GF_FREE (locker->volume); - - list_del_init (&locker->lockers); - GF_FREE (locker); - } - - tmp = NULL; - locker = NULL; - list_for_each_entry_safe (locker, tmp, &entrylk_lockers, lockers) { - tmp_frame = copy_frame (frame); - - tmp_frame->root->pid = 0; - tmp_frame->root->trans = server_conn_ref (conn); - memset (&tmp_frame->root->lk_owner, 0, sizeof (gf_lkowner_t)); - - if (locker->fd) { - GF_ASSERT (locker->fd->inode); - - gf_log (this->name, GF_LOG_DEBUG, "fentrylk " - "released on inode with gfid %s", - uuid_utoa (locker->fd->inode->gfid)); - - STACK_WIND (tmp_frame, server_nop_cbk, bound_xl, - bound_xl->fops->fentrylk, - locker->volume, - locker->fd, NULL, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - fd_unref (locker->fd); - } else { - gf_log (this->name, GF_LOG_DEBUG, "entrylk released " - "on %s", locker->loc.path); - - STACK_WIND (tmp_frame, server_nop_cbk, bound_xl, - bound_xl->fops->entrylk, - locker->volume, - &(locker->loc), NULL, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - loc_wipe (&locker->loc); - } - - GF_FREE (locker->volume); - - list_del_init (&locker->lockers); - GF_FREE (locker); - } - ret = 0; - -out: - return ret; -} - - static int server_connection_cleanup_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - int32_t ret = -1; - fd_t *fd = NULL; + int32_t ret = -1; + fd_t *fd = NULL; + client_t *client = NULL; GF_VALIDATE_OR_GOTO ("server", this, out); GF_VALIDATE_OR_GOTO ("server", cookie, out); GF_VALIDATE_OR_GOTO ("server", frame, out); fd = frame->local; + client = frame->root->client; fd_unref (fd); frame->local = NULL; - if (frame->root->trans) - server_conn_unref (frame->root->trans); + gf_client_unref (client); STACK_DESTROY (frame->root); ret = 0; @@ -405,9 +150,8 @@ out: } -int -do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame, - fdentry_t *fdentries, int fd_count) +static int +do_fd_cleanup (xlator_t *this, client_t* client, fdentry_t *fdentries, int fd_count) { fd_t *fd = NULL; int i = 0, ret = -1; @@ -416,16 +160,14 @@ do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame, char *path = NULL; GF_VALIDATE_OR_GOTO ("server", this, out); - GF_VALIDATE_OR_GOTO ("server", conn, out); - GF_VALIDATE_OR_GOTO ("server", frame, out); GF_VALIDATE_OR_GOTO ("server", fdentries, out); - bound_xl = conn->bound_xl; + bound_xl = client->bound_xl; for (i = 0;i < fd_count; i++) { fd = fdentries[i].fd; if (fd != NULL) { - tmp_frame = copy_frame (frame); + tmp_frame = create_frame (this, this->ctx->pool); if (tmp_frame == NULL) { goto out; } @@ -435,20 +177,20 @@ do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame, ret = inode_path (fd->inode, NULL, &path); if (ret > 0) { - gf_log (this->name, GF_LOG_INFO, "fd cleanup on " - "%s", path); + gf_log (this->name, GF_LOG_INFO, + "fd cleanup on %s", path); GF_FREE (path); } else { - gf_log (this->name, GF_LOG_INFO, "fd cleanup on" - " inode with gfid %s", + gf_log (this->name, GF_LOG_INFO, + "fd cleanup on inode with gfid %s", uuid_utoa (fd->inode->gfid)); } tmp_frame->local = fd; tmp_frame->root->pid = 0; - tmp_frame->root->trans = server_conn_ref (conn); + gf_client_ref (client); memset (&tmp_frame->root->lk_owner, 0, sizeof (gf_lkowner_t)); @@ -465,312 +207,72 @@ out: return ret; } -int -do_connection_cleanup (xlator_t *this, server_connection_t *conn, - struct _lock_table *ltable, fdentry_t *fdentries, int fd_count) -{ - int ret = 0; - int saved_ret = 0; - call_frame_t *frame = NULL; - server_state_t *state = NULL; - - GF_VALIDATE_OR_GOTO ("server", this, out); - GF_VALIDATE_OR_GOTO ("server", conn, out); - - if (!ltable && !fdentries) - goto out; - - frame = create_frame (this, this->ctx->pool); - if (frame == NULL) { - goto out; - } - - if (ltable) - saved_ret = do_lock_table_cleanup (this, conn, frame, ltable); - - if (fdentries != NULL) { - ret = do_fd_cleanup (this, conn, frame, fdentries, fd_count); - } - - state = CALL_STATE (frame); - GF_FREE (state); - - STACK_DESTROY (frame->root); - - if (saved_ret || ret) { - ret = -1; - } - -out: - return ret; -} int -server_connection_cleanup (xlator_t *this, server_connection_t *conn, +server_connection_cleanup (xlator_t *this, client_t *client, int32_t flags) { - struct _lock_table *ltable = NULL; - fdentry_t *fdentries = NULL; - uint32_t fd_count = 0; - int ret = 0; + server_ctx_t *serv_ctx = NULL; + fdentry_t *fdentries = NULL; + uint32_t fd_count = 0; + int cd_ret = 0; + int ret = 0; GF_VALIDATE_OR_GOTO (this->name, this, out); - GF_VALIDATE_OR_GOTO (this->name, conn, out); + GF_VALIDATE_OR_GOTO (this->name, client, out); GF_VALIDATE_OR_GOTO (this->name, flags, out); - pthread_mutex_lock (&conn->lock); - { - if (conn->ltable && (flags & INTERNAL_LOCKS)) { - ltable = conn->ltable; - conn->ltable = gf_lock_table_new (); - } - - if (conn->fdtable && (flags & POSIX_LOCKS)) - fdentries = gf_fd_fdtable_get_all_fds (conn->fdtable, - &fd_count); - } - pthread_mutex_unlock (&conn->lock); - - if (conn->bound_xl) - ret = do_connection_cleanup (this, conn, ltable, - fdentries, fd_count); + serv_ctx = server_ctx_get (client, client->this); -out: - return ret; -} - -void -server_log_conn_destroy (server_connection_t *conn) -{ - int i = 0; - char *rsp_failures_msg = NULL; - char *free_ptr = NULL; - char *msg = NULL; - char *failed_to_rsp = ""; - char *sep1 = " - "; - char *sep2 = ", "; - int msg_len = 0; - - for (i = GF_FOP_NULL + 1; i < GF_FOP_MAXVALUE; i++) { - msg_len += strlen (gf_fop_list[i]); - msg_len += 20; //Max len of uint64_t is 20 - //Separators for fop-string, count - msg_len += strlen (sep1) + strlen (sep2); - } - - rsp_failures_msg = GF_CALLOC (msg_len + 1, 1, gf_common_mt_char); - if (rsp_failures_msg == NULL) { - rsp_failures_msg = ""; + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed"); goto out; - } else { - free_ptr = rsp_failures_msg; - } - - msg = rsp_failures_msg; - for (i = GF_FOP_NULL + 1; i < GF_FOP_MAXVALUE; i++) { - if (!conn->rsp_failure_fops[i]) - continue; - //Note: Please make sure the size is calculated appropriately - //if you plan to change the format string. - msg += sprintf (msg, "%s%s%"PRIu64"%s", gf_fop_list[i], sep1, - conn->rsp_failure_fops[i], sep2); - } - if (rsp_failures_msg[0]) { - failed_to_rsp = " - Failed to respond to following operations:"; - //Remove last comma - rsp_failures_msg[strlen (rsp_failures_msg) - 2] = '\0'; - } -out: - gf_log (conn->this->name, GF_LOG_INFO, "destroyed connection of " - "%s %s %s", conn->id, failed_to_rsp, rsp_failures_msg); - GF_FREE (free_ptr); -} - -int -server_connection_destroy (xlator_t *this, server_connection_t *conn) -{ - xlator_t *bound_xl = NULL; - int32_t ret = -1; - struct list_head inodelk_lockers; - struct list_head entrylk_lockers; - struct _lock_table *ltable = NULL; - fdtable_t *fdtable = NULL; - - GF_VALIDATE_OR_GOTO ("server", this, out); - GF_VALIDATE_OR_GOTO ("server", conn, out); - - bound_xl = (xlator_t *) (conn->bound_xl); - - if (bound_xl) { - pthread_mutex_lock (&(conn->lock)); - { - if (conn->ltable) { - ltable = conn->ltable; - conn->ltable = NULL; - } - if (conn->fdtable) { - fdtable = conn->fdtable; - conn->fdtable = NULL; - } - } - pthread_mutex_unlock (&conn->lock); - - INIT_LIST_HEAD (&inodelk_lockers); - INIT_LIST_HEAD (&entrylk_lockers); - - if (ltable) { - list_splice_init (<able->inodelk_lockers, - &inodelk_lockers); - - list_splice_init (<able->entrylk_lockers, - &entrylk_lockers); - GF_FREE (ltable); - } - - GF_ASSERT (list_empty (&inodelk_lockers)); - GF_ASSERT (list_empty (&entrylk_lockers)); - - if (fdtable) - gf_fd_fdtable_destroy (fdtable); } - server_log_conn_destroy (conn); - - pthread_mutex_destroy (&conn->lock); - GF_FREE (conn->id); - GF_FREE (conn); - ret = 0; -out: - return ret; -} - -server_connection_t* -server_conn_unref (server_connection_t *conn) -{ - server_connection_t *todel = NULL; - xlator_t *this = NULL; - - pthread_mutex_lock (&conn->lock); + LOCK (&serv_ctx->fdtable_lock); { - conn->ref--; - - if (!conn->ref) { - todel = conn; - } + if (serv_ctx->fdtable && (flags & POSIX_LOCKS)) + fdentries = gf_fd_fdtable_get_all_fds (serv_ctx->fdtable, + &fd_count); } - pthread_mutex_unlock (&conn->lock); + UNLOCK (&serv_ctx->fdtable_lock); - if (todel) { - this = THIS; - server_connection_destroy (this, todel); - conn = NULL; - } - return conn; -} + if (client->bound_xl == NULL) + goto out; -server_connection_t* -server_conn_ref (server_connection_t *conn) -{ - pthread_mutex_lock (&conn->lock); - { - conn->ref++; + if (flags & INTERNAL_LOCKS) { + cd_ret = gf_client_disconnect (client); } - pthread_mutex_unlock (&conn->lock); - return conn; -} + if (fdentries != NULL) + ret = do_fd_cleanup (this, client, fdentries, fd_count); + else + gf_log (this->name, GF_LOG_INFO, "no fdentries to clean"); -server_connection_t * -server_connection_get (xlator_t *this, const char *id) -{ - server_connection_t *conn = NULL; - server_connection_t *trav = NULL; - server_conf_t *conf = NULL; - - GF_VALIDATE_OR_GOTO ("server", this, out); - GF_VALIDATE_OR_GOTO ("server", id, out); - - conf = this->private; - - pthread_mutex_lock (&conf->mutex); - { - list_for_each_entry (trav, &conf->conns, list) { - if (!strcmp (trav->id, id)) { - conn = trav; - conn->bind_ref++; - goto unlock; - } - } - - conn = (void *) GF_CALLOC (1, sizeof (*conn), - gf_server_mt_conn_t); - if (!conn) - goto unlock; - - conn->id = gf_strdup (id); - /*'0' denotes uninitialised lock state*/ - conn->lk_version = 0; - conn->fdtable = gf_fd_fdtable_alloc (); - conn->ltable = gf_lock_table_new (); - conn->this = this; - conn->bind_ref = 1; - conn->ref = 1;//when bind_ref becomes 0 it calls conn_unref - pthread_mutex_init (&conn->lock, NULL); - list_add (&conn->list, &conf->conns); + if (cd_ret || ret) + ret = -1; - } -unlock: - pthread_mutex_unlock (&conf->mutex); out: - return conn; + return ret; } -server_connection_t* -server_connection_put (xlator_t *this, server_connection_t *conn, - gf_boolean_t *detached) -{ - server_conf_t *conf = NULL; - gf_boolean_t unref = _gf_false; - - if (detached) - *detached = _gf_false; - conf = this->private; - pthread_mutex_lock (&conf->mutex); - { - conn->bind_ref--; - if (!conn->bind_ref) { - list_del_init (&conn->list); - unref = _gf_true; - } - } - pthread_mutex_unlock (&conf->mutex); - if (unref) { - gf_log (this->name, GF_LOG_INFO, "Shutting down connection %s", - conn->id); - if (detached) - *detached = _gf_true; - server_conn_unref (conn); - conn = NULL; - } - return conn; -} static call_frame_t * server_alloc_frame (rpcsvc_request_t *req) { - call_frame_t *frame = NULL; - server_state_t *state = NULL; - server_connection_t *conn = NULL; + call_frame_t *frame = NULL; + server_state_t *state = NULL; + client_t *client = NULL; GF_VALIDATE_OR_GOTO ("server", req, out); GF_VALIDATE_OR_GOTO ("server", req->trans, out); GF_VALIDATE_OR_GOTO ("server", req->svc, out); GF_VALIDATE_OR_GOTO ("server", req->svc->ctx, out); - conn = (server_connection_t *)req->trans->xl_private; - GF_VALIDATE_OR_GOTO ("server", conn, out); + client = req->trans->xl_private; + GF_VALIDATE_OR_GOTO ("server", client, out); - frame = create_frame (conn->this, req->svc->ctx->pool); + frame = create_frame (client->this, req->svc->ctx->pool); if (!frame) goto out; @@ -778,45 +280,46 @@ server_alloc_frame (rpcsvc_request_t *req) if (!state) goto out; - if (conn->bound_xl) - state->itable = conn->bound_xl->itable; + if (client->bound_xl) + state->itable = client->bound_xl->itable; state->xprt = rpc_transport_ref (req->trans); - state->conn = conn; - state->resolve.fd_no = -1; state->resolve2.fd_no = -1; + frame->root->client = client; frame->root->state = state; /* which socket */ frame->root->unique = 0; /* which call */ - frame->this = conn->this; + frame->this = client->this; out: return frame; } - call_frame_t * get_frame_from_request (rpcsvc_request_t *req) { - call_frame_t *frame = NULL; + call_frame_t *frame = NULL; + client_t *client = NULL; GF_VALIDATE_OR_GOTO ("server", req, out); + client = req->trans->xl_private; + frame = server_alloc_frame (req); if (!frame) goto out; frame->root->op = req->procnum; - frame->root->type = req->type; frame->root->unique = req->xid; frame->root->uid = req->uid; frame->root->gid = req->gid; frame->root->pid = req->pid; - frame->root->trans = server_conn_ref (req->trans->xl_private); + gf_client_ref (client); + frame->root->client = client; frame->root->lk_owner = req->lk_owner; server_decode_groups (frame, req); @@ -901,84 +404,6 @@ out: return ret; } -void -put_server_conn_state (xlator_t *this, rpc_transport_t *xprt) -{ - GF_VALIDATE_OR_GOTO ("server", this, out); - GF_VALIDATE_OR_GOTO ("server", xprt, out); - - xprt->xl_private = NULL; -out: - return; -} - -server_connection_t * -get_server_conn_state (xlator_t *this, rpc_transport_t *xprt) -{ - GF_VALIDATE_OR_GOTO ("server", this, out); - GF_VALIDATE_OR_GOTO ("server", xprt, out); - - return (server_connection_t *)xprt->xl_private; -out: - return NULL; -} - -server_connection_t * -create_server_conn_state (xlator_t *this, rpc_transport_t *xprt) -{ - server_connection_t *conn = NULL; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("server", this, out); - GF_VALIDATE_OR_GOTO ("server", xprt, out); - - conn = GF_CALLOC (1, sizeof (*conn), gf_server_mt_conn_t); - if (!conn) - goto out; - - pthread_mutex_init (&conn->lock, NULL); - - conn->fdtable = gf_fd_fdtable_alloc (); - if (!conn->fdtable) - goto out; - - conn->ltable = gf_lock_table_new (); - if (!conn->ltable) - goto out; - - conn->this = this; - - xprt->xl_private = conn; - - ret = 0; -out: - if (ret) - destroy_server_conn_state (conn); - - return conn; -} - -void -destroy_server_conn_state (server_connection_t *conn) -{ - GF_VALIDATE_OR_GOTO ("server", conn, out); - - if (conn->ltable) { - /* TODO */ - //FREE (conn->ltable); - ; - } - - if (conn->fdtable) - gf_fd_fdtable_destroy (conn->fdtable); - - pthread_mutex_destroy (&conn->lock); - - GF_FREE (conn); -out: - return; -} - void print_caller (char *str, int size, call_frame_t *frame) @@ -1105,12 +530,15 @@ server_print_params (char *str, int size, server_state_t *state) filled += snprintf (str + filled, size - filled, "volume=%s,", state->volume); +/* FIXME snprintf (str + filled, size - filled, - "bound_xl=%s}", state->conn->bound_xl->name); + "bound_xl=%s}", state->client->bound_xl->name); +*/ out: return; } + int server_resolve_is_empty (server_resolve_t *resolve) { @@ -1126,6 +554,7 @@ server_resolve_is_empty (server_resolve_t *resolve) return 1; } + void server_print_reply (call_frame_t *frame, int op_ret, int op_errno) { @@ -1171,16 +600,16 @@ out: void server_print_request (call_frame_t *frame) { - server_conf_t *conf = NULL; - xlator_t *this = NULL; + server_conf_t *conf = NULL; + xlator_t *this = NULL; server_state_t *state = NULL; + char *op = "UNKNOWN"; char resolve_vars[256]; char resolve2_vars[256]; char loc_vars[256]; char loc2_vars[256]; char other_vars[512]; char caller[512]; - char *op = "UNKNOWN"; GF_VALIDATE_OR_GOTO ("server", frame, out); @@ -1231,13 +660,14 @@ out: return; } + int serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp) { gf_dirent_t *entry = NULL; - gfs3_dirplist *trav = NULL; - gfs3_dirplist *prev = NULL; - int ret = -1; + gfs3_dirplist *trav = NULL; + gfs3_dirplist *prev = NULL; + int ret = -1; GF_VALIDATE_OR_GOTO ("server", entries, out); GF_VALIDATE_OR_GOTO ("server", rsp, out); @@ -1305,10 +735,10 @@ out: int serialize_rsp_dirent (gf_dirent_t *entries, gfs3_readdir_rsp *rsp) { - gf_dirent_t *entry = NULL; - gfs3_dirlist *trav = NULL; - gfs3_dirlist *prev = NULL; - int ret = -1; + gf_dirent_t *entry = NULL; + gfs3_dirlist *trav = NULL; + gfs3_dirlist *prev = NULL; + int ret = -1; GF_VALIDATE_OR_GOTO ("server", entries, out); GF_VALIDATE_OR_GOTO ("server", rsp, out); @@ -1335,11 +765,12 @@ out: return ret; } + int readdir_rsp_cleanup (gfs3_readdir_rsp *rsp) { - gfs3_dirlist *prev = NULL; - gfs3_dirlist *trav = NULL; + gfs3_dirlist *prev = NULL; + gfs3_dirlist *trav = NULL; trav = rsp->reply; prev = trav; @@ -1352,6 +783,7 @@ readdir_rsp_cleanup (gfs3_readdir_rsp *rsp) return 0; } + int readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp) { @@ -1370,6 +802,7 @@ readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp) return 0; } + int gf_server_check_getxattr_cmd (call_frame_t *frame, const char *key) { @@ -1398,13 +831,14 @@ gf_server_check_getxattr_cmd (call_frame_t *frame, const char *key) return 0; } + int gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict) { - server_conf_t *conf = NULL; - rpc_transport_t *xprt = NULL; - uint64_t total_read = 0; + server_conf_t *conf = NULL; + rpc_transport_t *xprt = NULL; + uint64_t total_read = 0; uint64_t total_write = 0; conf = frame->this->private; @@ -1425,32 +859,403 @@ gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict) return 0; } + gf_boolean_t -server_cancel_conn_timer (xlator_t *this, server_connection_t *conn) +server_cancel_grace_timer (xlator_t *this, client_t *client) { - gf_timer_t *timer = NULL; - gf_boolean_t cancelled = _gf_false; + server_ctx_t *serv_ctx = NULL; + gf_timer_t *timer = NULL; + gf_boolean_t cancelled = _gf_false; - if (!this || !conn) { - gf_log (THIS->name, GF_LOG_ERROR, "Invalid arguments to " - "cancel connection timer"); + if (!this || !client) { + gf_log (THIS->name, GF_LOG_ERROR, + "Invalid arguments to cancel connection timer"); return cancelled; } - pthread_mutex_lock (&conn->lock); - { - if (!conn->timer) - goto unlock; + serv_ctx = server_ctx_get (client, client->this); - timer = conn->timer; - conn->timer = NULL; + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed"); + goto out; } -unlock: - pthread_mutex_unlock (&conn->lock); + + LOCK (&serv_ctx->fdtable_lock); + { + if (serv_ctx->grace_timer) { + timer = serv_ctx->grace_timer; + serv_ctx->grace_timer = NULL; + } + } + UNLOCK (&serv_ctx->fdtable_lock); if (timer) { gf_timer_call_cancel (this->ctx, timer); cancelled = _gf_true; } +out: return cancelled; } + +server_ctx_t* +server_ctx_get (client_t *client, xlator_t *xlator) +{ + void *tmp = NULL; + server_ctx_t *ctx = NULL; + + client_ctx_get (client, xlator, &tmp); + + ctx = tmp; + + if (ctx != NULL) + goto out; + + ctx = GF_CALLOC (1, sizeof (server_ctx_t), gf_server_mt_server_conf_t); + + if (ctx == NULL) + goto out; + + /* ctx->lk_version = 0; redundant */ + ctx->fdtable = gf_fd_fdtable_alloc (); + + if (ctx->fdtable == NULL) { + GF_FREE (ctx); + ctx = NULL; + goto out; + } + + LOCK_INIT (&ctx->fdtable_lock); + + if (client_ctx_set (client, xlator, ctx) != 0) { + LOCK_DESTROY (&ctx->fdtable_lock); + GF_FREE (ctx); + ctx = NULL; + } + +out: + return ctx; +} + +int32_t +gf_barrier_transmit (server_conf_t *conf, gf_barrier_payload_t *payload) +{ + gf_barrier_t *barrier = NULL; + int32_t ret = -1; + client_t *client = NULL; + gf_boolean_t lk_heal = _gf_false; + call_frame_t *frame = NULL; + server_state_t *state = NULL; + + GF_VALIDATE_OR_GOTO ("barrier", conf, out); + GF_VALIDATE_OR_GOTO ("barrier", conf->barrier, out); + GF_VALIDATE_OR_GOTO ("barrier", payload, out); + + barrier = conf->barrier; + + frame = payload->frame; + if (frame) { + state = CALL_STATE (frame); + frame->local = NULL; + client = frame->root->client; + } + /* currently lk fops are not barrier'ed. This is reflecting code in + * server_submit_reply */ + if (client) + lk_heal = ((server_conf_t *) client->this->private)->lk_heal; + + ret = rpcsvc_submit_generic (payload->req, &payload->rsp, 1, + payload->payload, payload->payload_count, + payload->iobref); + iobuf_unref (payload->iob); + if (ret == -1) { + gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed"); + if (frame && client && !lk_heal) { + server_connection_cleanup (frame->this, client, + INTERNAL_LOCKS | POSIX_LOCKS); + } else { + /* TODO: Failure of open(dir), create, inodelk, entrylk + or lk fops send failure must be handled specially. */ + } + goto ret; + } + + ret = 0; +ret: + if (state) { + free_state (state); + } + + if (frame) { + gf_client_unref (client); + STACK_DESTROY (frame->root); + } + + if (payload->free_iobref) { + iobref_unref (payload->iobref); + } +out: + return ret; +} + +gf_barrier_payload_t * +gf_barrier_dequeue (gf_barrier_t *barrier) +{ + gf_barrier_payload_t *payload = NULL; + + if (!barrier || list_empty (&barrier->queue)) + return NULL; + + payload = list_entry (barrier->queue.next, + gf_barrier_payload_t, list); + if (payload) { + list_del_init (&payload->list); + barrier->cur_size--; + } + + return payload; +} + + +void* +gf_barrier_dequeue_start (void *data) +{ + server_conf_t *conf = NULL; + gf_barrier_t *barrier = NULL; + gf_barrier_payload_t *payload = NULL; + + conf = (server_conf_t *)data; + if (!conf || !conf->barrier) + return NULL; + barrier = conf->barrier; + + LOCK (&barrier->lock); + { + while (barrier->cur_size) { + payload = gf_barrier_dequeue (barrier); + if (payload) { + if (gf_barrier_transmit (conf, payload)) { + gf_log ("server", GF_LOG_WARNING, + "Failed to transmit"); + } + GF_FREE (payload); + } + } + } + UNLOCK (&barrier->lock); + return NULL; +} + +void +gf_barrier_timeout (void *data) +{ + server_conf_t *conf = NULL; + gf_barrier_t *barrier = NULL; + gf_boolean_t need_dequeue = _gf_false; + + conf = (server_conf_t *)data; + if (!conf || !conf->barrier) + goto out; + barrier = conf->barrier; + + gf_log ("", GF_LOG_INFO, "barrier timed-out"); + LOCK (&barrier->lock); + { + need_dequeue = barrier->on; + barrier->on = _gf_false; + } + UNLOCK (&barrier->lock); + + if (need_dequeue == _gf_true) + gf_barrier_dequeue_start (data); +out: + return; +} + + +int32_t +gf_barrier_start (xlator_t *this) +{ + server_conf_t *conf = NULL; + gf_barrier_t *barrier = NULL; + int32_t ret = -1; + struct timespec time = {0,}; + + conf = this->private; + + GF_VALIDATE_OR_GOTO ("server", this, out); + GF_VALIDATE_OR_GOTO (this->name, conf, out); + GF_VALIDATE_OR_GOTO (this->name, conf->barrier, out); + + barrier = conf->barrier; + + gf_log (this->name, GF_LOG_INFO, "barrier start called"); + LOCK (&barrier->lock); + { + /* if barrier is on, reset timer */ + if (barrier->on == _gf_true) { + ret = gf_timer_call_cancel (this->ctx, barrier->timer); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to " + "unset timer, failing barrier start"); + goto unlock; + } + } + + barrier->on = _gf_true; + time.tv_sec = barrier->time_out; + time.tv_nsec = 0; + + barrier->timer = gf_timer_call_after (this->ctx, time, + gf_barrier_timeout, + (void *)conf); + if (!barrier->timer) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set " + "timer, failing barrier start"); + barrier->on = _gf_false; + } + } +unlock: + UNLOCK (&barrier->lock); + + ret = 0; +out: + return ret; +} + +int32_t +gf_barrier_stop (xlator_t *this) +{ + server_conf_t *conf = NULL; + gf_barrier_t *barrier = NULL; + int32_t ret = -1; + gf_boolean_t need_dequeue = _gf_false; + + conf = this->private; + + GF_VALIDATE_OR_GOTO ("server", this, out); + GF_VALIDATE_OR_GOTO (this->name, conf, out); + GF_VALIDATE_OR_GOTO (this->name, conf->barrier, out); + + barrier = conf->barrier; + + gf_log (this->name, GF_LOG_INFO, "barrier stop called"); + LOCK (&barrier->lock); + { + need_dequeue = barrier->on; + barrier->on = _gf_false; + } + UNLOCK (&barrier->lock); + + if (need_dequeue == _gf_true) { + gf_timer_call_cancel (this->ctx, barrier->timer); + ret = gf_thread_create (&conf->barrier_th, NULL, + gf_barrier_dequeue_start, + conf); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, + "Failed to start un-barriering"); + goto out; + } + } + ret = 0; +out: + return ret; +} + +int32_t +gf_barrier_fops_configure (xlator_t *this, gf_barrier_t *barrier, char *str) +{ + int32_t ret = -1; + char *dup_str = NULL; + char *str_tok = NULL; + char *save_ptr = NULL; + uint64_t fops = 0; + + /* by defaul fsync & flush needs to be barriered */ + + fops |= 1 << GFS3_OP_FSYNC; + fops |= 1 << GFS3_OP_FLUSH; + + if (!str) + goto done; + + dup_str = gf_strdup (str); + if (!dup_str) + goto done; + + str_tok = strtok_r (dup_str, ",", &save_ptr); + if (!str_tok) + goto done; + + fops = 0; + while (str_tok) { + if (!strcmp(str_tok, "writev")) { + fops |= ((uint64_t)1 << GFS3_OP_WRITE); + } else if (!strcmp(str_tok, "fsync")) { + fops |= ((uint64_t)1 << GFS3_OP_FSYNC); + } else if (!strcmp(str_tok, "read")) { + fops |= ((uint64_t)1 << GFS3_OP_READ); + } else if (!strcmp(str_tok, "rename")) { + fops |= ((uint64_t)1 << GFS3_OP_RENAME); + } else if (!strcmp(str_tok, "flush")) { + fops |= ((uint64_t)1 << GFS3_OP_FLUSH); + } else if (!strcmp(str_tok, "ftruncate")) { + fops |= ((uint64_t)1 << GFS3_OP_FTRUNCATE); + } else if (!strcmp(str_tok, "fallocate")) { + fops |= ((uint64_t)1 << GFS3_OP_FALLOCATE); + } else if (!strcmp(str_tok, "rmdir")) { + fops |= ((uint64_t)1 << GFS3_OP_RMDIR); + } else { + gf_log ("barrier", GF_LOG_ERROR, + "Invalid barrier fop %s", str_tok); + } + + str_tok = strtok_r (NULL, ",", &save_ptr); + } +done: + LOCK (&barrier->lock); + { + barrier->fops = fops; + } + UNLOCK (&barrier->lock); + ret = 0; + + GF_FREE (dup_str); + return ret; +} + +void +gf_barrier_enqueue (gf_barrier_t *barrier, gf_barrier_payload_t *payload) +{ + list_add_tail (&payload->list, &barrier->queue); + barrier->cur_size++; +} + +gf_barrier_payload_t * +gf_barrier_payload (rpcsvc_request_t *req, struct iovec *rsp, + call_frame_t *frame, struct iovec *payload_orig, + int payloadcount, struct iobref *iobref, + struct iobuf *iob, gf_boolean_t free_iobref) +{ + gf_barrier_payload_t *payload = NULL; + + if (!rsp) + return NULL; + + payload = GF_CALLOC (1, sizeof (*payload),1); + if (!payload) + return NULL; + + INIT_LIST_HEAD (&payload->list); + + payload->req = req; + memcpy (&payload->rsp, rsp, sizeof (struct iovec)); + payload->frame = frame; + payload->payload = payload_orig; + payload->payload_count = payloadcount; + payload->iobref = iobref; + payload->iob = iob; + payload->free_iobref = free_iobref; + + return payload; +} diff --git a/xlators/protocol/server/src/server-helpers.h b/xlators/protocol/server/src/server-helpers.h index 0b7424bab..b455aa6df 100644 --- a/xlators/protocol/server/src/server-helpers.h +++ b/xlators/protocol/server/src/server-helpers.h @@ -15,13 +15,8 @@ #define CALL_STATE(frame) ((server_state_t *)frame->root->state) -#define BOUND_XL(frame) ((xlator_t *) CALL_STATE(frame)->conn->bound_xl) - #define XPRT_FROM_FRAME(frame) ((rpc_transport_t *) CALL_STATE(frame)->xprt) -#define SERVER_CONNECTION(frame) \ - ((server_connection_t *) CALL_STATE(frame)->conn) - #define SERVER_CONF(frame) \ ((server_conf_t *)XPRT_FROM_FRAME(frame)->this->private) @@ -34,45 +29,26 @@ #define IS_NOT_ROOT(pathlen) ((pathlen > 2)? 1 : 0) +#define is_fop_barriered(fops, procnum) (fops & ((uint64_t)1 << procnum)) + +#define barrier_add_to_queue(barrier) (barrier->on || barrier->cur_size) + void free_state (server_state_t *state); void server_loc_wipe (loc_t *loc); -int32_t -gf_add_locker (server_connection_t *conn, const char *volume, - loc_t *loc, - fd_t *fd, - pid_t pid, - gf_lkowner_t *owner, - glusterfs_fop_t type); - -int32_t -gf_del_locker (server_connection_t *conn, const char *volume, - loc_t *loc, - fd_t *fd, - gf_lkowner_t *owner, - glusterfs_fop_t type); - void server_print_request (call_frame_t *frame); call_frame_t * get_frame_from_request (rpcsvc_request_t *req); -gf_boolean_t -server_cancel_conn_timer (xlator_t *this, server_connection_t *conn); - -void -put_server_conn_state (xlator_t *this, rpc_transport_t *xprt); - -server_connection_t * -get_server_conn_state (xlator_t *this, rpc_transport_t *xptr); - -server_connection_t * -create_server_conn_state (xlator_t *this, rpc_transport_t *xptr); +int +server_connection_cleanup (xlator_t *this, struct _client_t *client, + int32_t flags); -void -destroy_server_conn_state (server_connection_t *conn); +gf_boolean_t +server_cancel_grace_timer (xlator_t *this, struct _client_t *client); int server_build_config (xlator_t *this, server_conf_t *conf); @@ -82,4 +58,17 @@ int serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp); int readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp); int readdir_rsp_cleanup (gfs3_readdir_rsp *rsp); +server_ctx_t *server_ctx_get (client_t *client, xlator_t *xlator); + +int32_t gf_barrier_start (xlator_t *this); +int32_t gf_barrier_stop (xlator_t *this); +int32_t gf_barrier_fops_configure (xlator_t *this, gf_barrier_t *barrier, + char *str); +void gf_barrier_enqueue (gf_barrier_t *barrier, gf_barrier_payload_t *stub); +gf_barrier_payload_t * +gf_barrier_payload (rpcsvc_request_t *req, struct iovec *rsp, + call_frame_t *frame, struct iovec *payload, + int payloadcount, struct iobref *iobref, + struct iobuf *iob, gf_boolean_t free_iobref); + #endif /* !_SERVER_HELPERS_H */ diff --git a/xlators/protocol/server/src/server-resolve.c b/xlators/protocol/server/src/server-resolve.c index 4a91a4104..cc4686a03 100644 --- a/xlators/protocol/server/src/server-resolve.c +++ b/xlators/protocol/server/src/server-resolve.c @@ -147,7 +147,8 @@ resolve_gfid_cbk (call_frame_t *frame, void *cookie, xlator_t *this, (char **) &resolve_loc->path); STACK_WIND (frame, resolve_gfid_entry_cbk, - BOUND_XL (frame), BOUND_XL (frame)->fops->lookup, + frame->root->client->bound_xl, + frame->root->client->bound_xl->fops->lookup, &resolve->resolve_loc, NULL); return 0; out: @@ -179,7 +180,8 @@ resolve_gfid (call_frame_t *frame) ret = loc_path (resolve_loc, NULL); STACK_WIND (frame, resolve_gfid_cbk, - BOUND_XL (frame), BOUND_XL (frame)->fops->lookup, + frame->root->client->bound_xl, + frame->root->client->bound_xl->fops->lookup, &resolve->resolve_loc, NULL); return 0; } @@ -449,14 +451,14 @@ server_resolve_anonfd (call_frame_t *frame) int server_resolve_fd (call_frame_t *frame) { - server_state_t *state = NULL; - server_resolve_t *resolve = NULL; - server_connection_t *conn = NULL; - uint64_t fd_no = -1; + server_ctx_t *serv_ctx = NULL; + server_state_t *state = NULL; + client_t *client = NULL; + server_resolve_t *resolve = NULL; + uint64_t fd_no = -1; state = CALL_STATE (frame); resolve = state->resolve_now; - conn = SERVER_CONNECTION (frame); fd_no = resolve->fd_no; @@ -465,7 +467,18 @@ server_resolve_fd (call_frame_t *frame) return 0; } - state->fd = gf_fd_fdptr_get (conn->fdtable, fd_no); + client = frame->root->client; + + serv_ctx = server_ctx_get (client, client->this); + + if (serv_ctx == NULL) { + gf_log ("", GF_LOG_INFO, "server_ctx_get() failed"); + resolve->op_ret = -1; + resolve->op_errno = ENOMEM; + return 0; + } + + state->fd = gf_fd_fdptr_get (serv_ctx->fdtable, fd_no); if (!state->fd) { gf_log ("", GF_LOG_INFO, "fd not found in context"); @@ -520,14 +533,12 @@ int server_resolve_done (call_frame_t *frame) { server_state_t *state = NULL; - xlator_t *bound_xl = NULL; state = CALL_STATE (frame); - bound_xl = BOUND_XL (frame); server_print_request (frame); - state->resume_fn (frame, bound_xl); + state->resume_fn (frame, frame->root->client->bound_xl); return 0; } diff --git a/xlators/protocol/server/src/server-rpc-fops.c b/xlators/protocol/server/src/server-rpc-fops.c index a9d2ee5b6..138e601ce 100644 --- a/xlators/protocol/server/src/server-rpc-fops.c +++ b/xlators/protocol/server/src/server-rpc-fops.c @@ -24,6 +24,11 @@ #include "xdr-nfs3.h" +#define SERVER_REQ_SET_ERROR(req, ret) \ + do { \ + rpcsvc_request_seterr (req, GARBAGE_ARGS); \ + ret = RPCSVC_ACTOR_ERROR; \ + } while (0) /* Callback function section */ int @@ -31,17 +36,14 @@ server_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata) { - gfs3_statfs_rsp rsp = {0,}; - rpcsvc_request_t *req = NULL; - - req = frame->local; + gfs3_statfs_rsp rsp = {0,}; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%"PRId64": STATFS (%s)", + gf_log (this->name, GF_LOG_WARNING, "%"PRId64": STATFS (%s)", frame->root->unique, strerror (op_errno)); goto out; } @@ -52,7 +54,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); - + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_statfs_rsp); @@ -67,16 +69,15 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *stbuf, dict_t *xdata, struct iatt *postparent) { - rpcsvc_request_t *req = NULL; - server_state_t *state = NULL; - inode_t *root_inode = NULL; - inode_t *link_inode = NULL; - loc_t fresh_loc = {0,}; - gfs3_lookup_rsp rsp = {0,}; - uuid_t rootgfid = {0,}; + rpcsvc_request_t *req = NULL; + server_state_t *state = NULL; + inode_t *root_inode = NULL; + inode_t *link_inode = NULL; + loc_t fresh_loc = {0,}; + gfs3_lookup_rsp rsp = {0,}; + uuid_t rootgfid = {0,}; - req = frame->local; - state = CALL_STATE(frame); + state = CALL_STATE (frame); if (state->is_revalidate == 1 && op_ret == -1) { state->is_revalidate = 2; @@ -84,8 +85,9 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_unref (fresh_loc.inode); fresh_loc.inode = inode_new (state->itable); - STACK_WIND (frame, server_lookup_cbk, BOUND_XL (frame), - BOUND_XL (frame)->fops->lookup, + STACK_WIND (frame, server_lookup_cbk, + frame->root->client->bound_xl, + frame->root->client->bound_xl->fops->lookup, &fresh_loc, state->xdata); loc_wipe (&fresh_loc); @@ -94,7 +96,7 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_stat_from_iatt (&rsp.postparent, postparent); - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { @@ -108,7 +110,7 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - root_inode = BOUND_XL(frame)->itable->root; + root_inode = frame->root->client->bound_xl->itable->root; if (inode == root_inode) { /* we just looked up root ("/") */ stbuf->ia_ino = 1; @@ -137,8 +139,9 @@ out: if (state->resolve.bname) { gf_log (this->name, ((op_errno == ENOENT) ? GF_LOG_TRACE : GF_LOG_INFO), - "%"PRId64": LOOKUP %s (%s/%s) ==> (%s)", - frame->root->unique, state->loc.path, + "%"PRId64": LOOKUP %s (%s/%s) ==> " + "(%s)", frame->root->unique, + state->loc.path, uuid_utoa (state->resolve.pargfid), state->resolve.bname, strerror (op_errno)); @@ -152,6 +155,7 @@ out: } } + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_lookup_rsp); @@ -166,21 +170,20 @@ server_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - gfs3_lk_rsp rsp = {0,}; - rpcsvc_request_t *req = NULL; - server_state_t *state = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_lk_rsp rsp = {0,}; + rpcsvc_request_t *req = NULL; + server_state_t *state = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, - "%"PRId64": LK %"PRId64" (%s) ==> (%s)", - frame->root->unique, state->resolve.fd_no, + "%"PRId64": LK %"PRId64" (%s) ==> " + "(%s)", frame->root->unique, + state->resolve.fd_no, uuid_utoa (state->resolve.gfid), strerror (op_errno)); } @@ -209,6 +212,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_lk_rsp); @@ -222,21 +226,19 @@ int server_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_common_rsp rsp = {0,}; - server_connection_t *conn = NULL; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; + gf_common_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - req = frame->local; - conn = SERVER_CONNECTION(frame); - state = CALL_STATE(frame); - - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret < 0) { if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) { - gf_log (this->name, GF_LOG_INFO, + gf_log (this->name, (op_errno == ENOENT)? + GF_LOG_DEBUG:GF_LOG_ERROR, "%"PRId64": INODELK %s (%s) ==> (%s)", frame->root->unique, state->loc.path, uuid_utoa (state->resolve.gfid), @@ -245,20 +247,11 @@ server_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - if (state->flock.l_type == F_UNLCK) - gf_del_locker (conn, state->volume, - &state->loc, NULL, &frame->root->lk_owner, - GF_FOP_INODELK); - else - gf_add_locker (conn, state->volume, - &state->loc, NULL, frame->root->pid, - &frame->root->lk_owner, - GF_FOP_INODELK); - out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -272,43 +265,32 @@ int server_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_common_rsp rsp = {0,}; - server_state_t *state = NULL; - server_connection_t *conn = NULL; - rpcsvc_request_t *req = NULL; + gf_common_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - req = frame->local; - conn = SERVER_CONNECTION(frame); - state = CALL_STATE(frame); - - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret < 0) { if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) { gf_log (this->name, GF_LOG_INFO, - "%"PRId64": FINODELK %"PRId64" (%s) ==> (%s)", - frame->root->unique, state->resolve.fd_no, + "%"PRId64": FINODELK %"PRId64" (%s) " + "==> (%s)", frame->root->unique, + state->resolve.fd_no, uuid_utoa (state->resolve.gfid), strerror (op_errno)); } goto out; } - if (state->flock.l_type == F_UNLCK) - gf_del_locker (conn, state->volume, - NULL, state->fd, - &frame->root->lk_owner, GF_FOP_INODELK); - else - gf_add_locker (conn, state->volume, - NULL, state->fd, - frame->root->pid, - &frame->root->lk_owner, GF_FOP_INODELK); - out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -321,18 +303,15 @@ int server_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - server_connection_t *conn = NULL; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - gf_common_rsp rsp = {0,}; - - req = frame->local; - conn = SERVER_CONNECTION(frame); - state = CALL_STATE(frame); + gf_common_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret < 0) { if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) { gf_log (this->name, GF_LOG_INFO, @@ -344,20 +323,11 @@ server_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - if (state->cmd == ENTRYLK_UNLOCK) - gf_del_locker (conn, state->volume, - &state->loc, NULL, &frame->root->lk_owner, - GF_FOP_ENTRYLK); - else - gf_add_locker (conn, state->volume, - &state->loc, NULL, frame->root->pid, - &frame->root->lk_owner, - GF_FOP_ENTRYLK); - out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -371,18 +341,15 @@ int server_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_common_rsp rsp = {0,}; - server_connection_t *conn = NULL; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - conn = SERVER_CONNECTION(frame); - state = CALL_STATE(frame); + gf_common_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret < 0) { if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) { gf_log (this->name, GF_LOG_INFO, @@ -394,19 +361,11 @@ server_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - if (state->cmd == ENTRYLK_UNLOCK) - gf_del_locker (conn, state->volume, - NULL, state->fd, &frame->root->lk_owner, - GF_FOP_ENTRYLK); - else - gf_add_locker (conn, state->volume, - NULL, state->fd, frame->root->pid, - &frame->root->lk_owner, GF_FOP_ENTRYLK); - out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -420,17 +379,15 @@ int server_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_common_rsp rsp = {0,}; - rpcsvc_request_t *req = NULL; - server_state_t *state = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gf_common_rsp rsp = {0,}; + rpcsvc_request_t *req = NULL; + server_state_t *state = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": ACCESS %s (%s) ==> (%s)", frame->root->unique, state->loc.path, @@ -443,6 +400,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -456,17 +414,16 @@ server_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - gfs3_rmdir_rsp rsp = {0,}; - server_state_t *state = NULL; - inode_t *parent = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_rmdir_rsp rsp = {0,}; + server_state_t *state = NULL; + inode_t *parent = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret) { gf_log (this->name, GF_LOG_INFO, "%"PRId64": RMDIR %s (%s/%s) ==> (%s)", @@ -495,6 +452,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_rmdir_rsp); @@ -509,17 +467,16 @@ server_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - gfs3_mkdir_rsp rsp = {0,}; - server_state_t *state = NULL; - inode_t *link_inode = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_mkdir_rsp rsp = {0,}; + server_state_t *state = NULL; + inode_t *link_inode = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "%"PRId64": MKDIR %s (%s/%s) ==> (%s)", @@ -542,6 +499,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_mkdir_rsp); @@ -556,17 +514,16 @@ server_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - gfs3_mknod_rsp rsp = {0,}; - server_state_t *state = NULL; - inode_t *link_inode = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_mknod_rsp rsp = {0,}; + server_state_t *state = NULL; + inode_t *link_inode = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "%"PRId64": MKNOD %s (%s/%s) ==> (%s)", @@ -589,6 +546,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_mknod_rsp); @@ -601,17 +559,15 @@ int server_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_common_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gf_common_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": FSYNCDIR %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -624,6 +580,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -637,18 +594,16 @@ server_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) { - gfs3_readdir_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - int ret = 0; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_readdir_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; + int ret = 0; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": READDIR %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -671,6 +626,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_readdir_rsp); @@ -685,29 +641,33 @@ int server_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - server_connection_t *conn = NULL; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - gfs3_opendir_rsp rsp = {0,}; - uint64_t fd_no = 0; - - req = frame->local; - conn = SERVER_CONNECTION (frame); - state = CALL_STATE (frame); + server_state_t *state = NULL; + server_ctx_t *serv_ctx = NULL; + rpcsvc_request_t *req = NULL; + gfs3_opendir_rsp rsp = {0,}; + uint64_t fd_no = 0; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, + state = CALL_STATE (frame); + gf_log (this->name, (op_errno == ENOENT)? + GF_LOG_DEBUG:GF_LOG_ERROR, "%"PRId64": OPENDIR %s (%s) ==> (%s)", frame->root->unique, state->loc.path, uuid_utoa (state->resolve.gfid), strerror (op_errno)); goto out; } + serv_ctx = server_ctx_get (frame->root->client, this); + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed"); + goto out; + } + fd_bind (fd); - fd_no = gf_fd_unused_get (conn->fdtable, fd); + fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd); fd_ref (fd); // on behalf of the client out: @@ -715,6 +675,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_opendir_rsp); @@ -727,17 +688,15 @@ int server_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_common_rsp rsp = {0,}; - rpcsvc_request_t *req = NULL; - server_state_t *state = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gf_common_rsp rsp = {0,}; + rpcsvc_request_t *req = NULL; + server_state_t *state = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret == -1) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": REMOVEXATTR %s (%s) of key %s ==> (%s)", frame->root->unique, state->loc.path, @@ -750,6 +709,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -762,17 +722,15 @@ int server_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_common_rsp rsp = {0,}; - rpcsvc_request_t *req = NULL; + gf_common_rsp rsp = {0,}; + rpcsvc_request_t *req = NULL; server_state_t *state = NULL; - req = frame->local; - state = CALL_STATE(frame); - - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret == -1) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": FREMOVEXATTR %"PRId64" (%s) (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -785,6 +743,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -798,17 +757,15 @@ server_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - gfs3_getxattr_rsp rsp = {0,}; - rpcsvc_request_t *req = NULL; - server_state_t *state = NULL; - - req = frame->local; - state = CALL_STATE (frame); + gfs3_getxattr_rsp rsp = {0,}; + rpcsvc_request_t *req = NULL; + server_state_t *state = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret == -1) { + state = CALL_STATE (frame); gf_log (this->name, (((op_errno == ENOTSUP) || (op_errno == ENODATA) || (op_errno == ENOENT)) ? @@ -820,13 +777,14 @@ server_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val), + GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val, rsp.dict.dict_len, op_errno, out); out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_getxattr_rsp); @@ -843,17 +801,15 @@ server_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - gfs3_fgetxattr_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE (frame); + gfs3_fgetxattr_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret == -1) { + state = CALL_STATE (frame); gf_log (this->name, ((op_errno == ENOTSUP) ? GF_LOG_DEBUG : GF_LOG_INFO), "%"PRId64": FGETXATTR %"PRId64" (%s) (%s) ==> (%s)", @@ -863,7 +819,7 @@ server_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val), + GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val, rsp.dict.dict_len, op_errno, out); out: @@ -871,6 +827,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_fgetxattr_rsp); @@ -886,11 +843,11 @@ static int _gf_server_log_setxattr_failure (dict_t *d, char *k, data_t *v, void *tmp) { - server_state_t *state = NULL; - call_frame_t *frame = NULL; + server_state_t *state = NULL; + call_frame_t *frame = NULL; frame = tmp; - state = CALL_STATE(frame); + state = CALL_STATE (frame); gf_log (THIS->name, GF_LOG_INFO, "%"PRId64": SETXATTR %s (%s) ==> %s", @@ -907,13 +864,11 @@ server_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, rpcsvc_request_t *req = NULL; server_state_t *state = NULL; - req = frame->local; - state = CALL_STATE(frame); - - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret == -1) { + state = CALL_STATE (frame); if (op_errno != ENOTSUP) dict_foreach (state->dict, _gf_server_log_setxattr_failure, @@ -929,6 +884,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -942,11 +898,11 @@ static int _gf_server_log_fsetxattr_failure (dict_t *d, char *k, data_t *v, void *tmp) { - call_frame_t *frame = NULL; - server_state_t *state = NULL; + call_frame_t *frame = NULL; + server_state_t *state = NULL; frame = tmp; - state = CALL_STATE(frame); + state = CALL_STATE (frame); gf_log (THIS->name, GF_LOG_INFO, "%"PRId64": FSETXATTR %"PRId64" (%s) ==> %s", @@ -964,13 +920,11 @@ server_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, rpcsvc_request_t *req = NULL; server_state_t *state = NULL; - req = frame->local; - state = CALL_STATE(frame); - - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret == -1) { + state = CALL_STATE (frame); if (op_errno != ENOTSUP) { dict_foreach (state->dict, _gf_server_log_fsetxattr_failure, @@ -986,6 +940,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -1001,20 +956,19 @@ server_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata) { - gfs3_rename_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - inode_t *tmp_inode = NULL; - inode_t *tmp_parent = NULL; - char oldpar_str[50] = {0,}; - char newpar_str[50] = {0,}; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_rename_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; + inode_t *tmp_inode = NULL; + inode_t *tmp_parent = NULL; + char oldpar_str[50] = {0,}; + char newpar_str[50] = {0,}; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret == -1) { uuid_utoa_r (state->resolve.gfid, oldpar_str); uuid_utoa_r (state->resolve2.gfid, newpar_str); @@ -1029,7 +983,7 @@ server_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, stbuf->ia_type = state->loc.inode->ia_type; /* TODO: log gfid of the inodes */ - gf_log (state->conn->bound_xl->name, GF_LOG_TRACE, + gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE, "%"PRId64": RENAME_CBK %s ==> %s", frame->root->unique, state->loc.name, state->loc2.name); @@ -1071,6 +1025,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_rename_rsp); @@ -1084,17 +1039,16 @@ server_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - gfs3_unlink_rsp rsp = {0,}; - server_state_t *state = NULL; - inode_t *parent = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_unlink_rsp rsp = {0,}; + server_state_t *state = NULL; + inode_t *parent = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret) { gf_log (this->name, (op_errno == ENOENT)? GF_LOG_DEBUG:GF_LOG_ERROR, @@ -1106,7 +1060,7 @@ server_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } /* TODO: log gfid of the inodes */ - gf_log (state->conn->bound_xl->name, GF_LOG_TRACE, + gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE, "%"PRId64": UNLINK_CBK %s", frame->root->unique, state->loc.name); @@ -1126,6 +1080,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_unlink_rsp); @@ -1140,17 +1095,16 @@ server_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - gfs3_symlink_rsp rsp = {0,}; - server_state_t *state = NULL; - inode_t *link_inode = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_symlink_rsp rsp = {0,}; + server_state_t *state = NULL; + inode_t *link_inode = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "%"PRId64": SYMLINK %s (%s/%s) ==> (%s)", @@ -1173,6 +1127,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_symlink_rsp); @@ -1188,27 +1143,27 @@ server_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - gfs3_link_rsp rsp = {0,}; - server_state_t *state = NULL; - inode_t *link_inode = NULL; - rpcsvc_request_t *req = NULL; + gfs3_link_rsp rsp = {0,}; + server_state_t *state = NULL; + inode_t *link_inode = NULL; + rpcsvc_request_t *req = NULL; char gfid_str[50] = {0,}; char newpar_str[50] = {0,}; - req = frame->local; - state = CALL_STATE(frame); - - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret) { uuid_utoa_r (state->resolve.gfid, gfid_str); uuid_utoa_r (state->resolve2.pargfid, newpar_str); gf_log (this->name, GF_LOG_INFO, "%"PRId64": LINK %s (%s) -> %s/%s ==> (%s)", - frame->root->unique, state->loc.path, gfid_str, - newpar_str, state->resolve2.bname, strerror (op_errno)); + frame->root->unique, state->loc.path, + gfid_str, newpar_str, state->resolve2.bname, + strerror (op_errno)); goto out; } @@ -1224,6 +1179,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_link_rsp); @@ -1237,17 +1193,15 @@ server_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - gfs3_truncate_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE (frame); + gfs3_truncate_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": TRUNCATE %s (%s) ==> (%s)", frame->root->unique, state->loc.path, @@ -1262,6 +1216,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_truncate_rsp); @@ -1275,17 +1230,15 @@ server_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, dict_t *xdata) { - gfs3_fstat_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_fstat_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": FSTAT %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1299,6 +1252,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_fstat_rsp); @@ -1312,17 +1266,15 @@ server_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - gfs3_ftruncate_rsp rsp = {0}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE (frame); + gfs3_ftruncate_rsp rsp = {0}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": FTRUNCATE %"PRId64" (%s)==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1337,6 +1289,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_ftruncate_rsp); @@ -1349,21 +1302,20 @@ int server_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - gf_common_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gf_common_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "%"PRId64": FLUSH %"PRId64" (%s) ==> (%s)", + state = CALL_STATE (frame); + gf_log (this->name, (op_errno == ENOENT)? + GF_LOG_DEBUG:GF_LOG_ERROR, + "%"PRId64": FLUSH %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, - uuid_utoa (state->resolve.gfid), strerror (op_errno)); + uuid_utoa (state->resolve.gfid), strerror (op_errno)); goto out; } @@ -1371,6 +1323,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -1384,17 +1337,15 @@ server_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - gfs3_fsync_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_fsync_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": FSYNC %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1409,6 +1360,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_fsync_rsp); @@ -1422,17 +1374,15 @@ server_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - gfs3_write_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_write_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": WRITEV %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1447,6 +1397,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_write_rsp); @@ -1462,12 +1413,9 @@ server_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iovec *vector, int32_t count, struct iatt *stbuf, struct iobref *iobref, dict_t *xdata) { - gfs3_read_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_read_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; #ifdef GF_TESTING_IO_XDATA { @@ -1479,10 +1427,11 @@ server_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "testing-xdata-value"); } #endif - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": READV %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1497,6 +1446,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, vector, count, iobref, (xdrproc_t)xdr_gfs3_read_rsp); @@ -1511,17 +1461,15 @@ server_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, uint32_t weak_checksum, uint8_t *strong_checksum, dict_t *xdata) { - gfs3_rchecksum_rsp rsp = {0,}; - rpcsvc_request_t *req = NULL; + gfs3_rchecksum_rsp rsp = {0,}; + rpcsvc_request_t *req = NULL; server_state_t *state = NULL; - req = frame->local; - state = CALL_STATE(frame); - - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": RCHECKSUM %"PRId64" (%s)==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1538,6 +1486,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_rchecksum_rsp); @@ -1551,21 +1500,19 @@ int server_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - server_connection_t *conn = NULL; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - uint64_t fd_no = 0; - gfs3_open_rsp rsp = {0,}; - - req = frame->local; - conn = SERVER_CONNECTION (frame); - state = CALL_STATE (frame); + server_state_t *state = NULL; + server_ctx_t *serv_ctx = NULL; + rpcsvc_request_t *req = NULL; + uint64_t fd_no = 0; + gfs3_open_rsp rsp = {0,}; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, + state = CALL_STATE (frame); + gf_log (this->name, (op_errno == ENOENT)? + GF_LOG_DEBUG:GF_LOG_ERROR, "%"PRId64": OPEN %s (%s) ==> (%s)", frame->root->unique, state->loc.path, uuid_utoa (state->resolve.gfid), @@ -1573,8 +1520,14 @@ server_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } + serv_ctx = server_ctx_get (frame->root->client, this); + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed"); + goto out; + } + fd_bind (fd); - fd_no = gf_fd_unused_get (conn->fdtable, fd); + fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd); fd_ref (fd); rsp.fd = fd_no; @@ -1582,6 +1535,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_open_rsp); GF_FREE (rsp.xdata.xdata_val); @@ -1596,20 +1550,18 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - server_connection_t *conn = NULL; server_state_t *state = NULL; + server_ctx_t *serv_ctx = NULL; inode_t *link_inode = NULL; rpcsvc_request_t *req = NULL; uint64_t fd_no = 0; gfs3_create_rsp rsp = {0,}; - req = frame->local; - conn = SERVER_CONNECTION (frame); - state = CALL_STATE (frame); - - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); + state = CALL_STATE (frame); + if (op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "%"PRId64": CREATE %s (%s/%s) ==> (%s)", @@ -1620,7 +1572,7 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } /* TODO: log gfid too */ - gf_log (state->conn->bound_xl->name, GF_LOG_TRACE, + gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE, "%"PRId64": CREATE %s (%s)", frame->root->unique, state->loc.name, uuid_utoa (stbuf->ia_gfid)); @@ -1647,9 +1599,14 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_lookup (link_inode); inode_unref (link_inode); - fd_bind (fd); + serv_ctx = server_ctx_get (frame->root->client, this); + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed"); + goto out; + } - fd_no = gf_fd_unused_get (conn->fdtable, fd); + fd_bind (fd); + fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd); fd_ref (fd); if ((fd_no < 0) || (fd == 0)) { @@ -1666,6 +1623,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_create_rsp); @@ -1679,17 +1637,15 @@ server_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, const char *buf, struct iatt *stbuf, dict_t *xdata) { - gfs3_readlink_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_readlink_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": READLINK %s (%s) ==> (%s)", frame->root->unique, state->loc.path, @@ -1708,6 +1664,7 @@ out: if (!rsp.path) rsp.path = ""; + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_readlink_rsp); @@ -1721,18 +1678,17 @@ server_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *stbuf, dict_t *xdata) { - gfs3_stat_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE (frame); + gfs3_stat_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { - gf_log (this->name, GF_LOG_INFO, + state = CALL_STATE (frame); + gf_log (this->name, (op_errno == ENOENT)? + GF_LOG_DEBUG:GF_LOG_ERROR, "%"PRId64": STAT %s (%s) ==> (%s)", frame->root->unique, state->loc.path, uuid_utoa (state->resolve.gfid), @@ -1746,6 +1702,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_stat_rsp); @@ -1760,17 +1717,15 @@ server_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost, dict_t *xdata) { - gfs3_setattr_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE (frame); + gfs3_setattr_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": SETATTR %s (%s) ==> (%s)", frame->root->unique, state->loc.path, @@ -1786,6 +1741,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_setattr_rsp); @@ -1799,17 +1755,15 @@ server_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost, dict_t *xdata) { - gfs3_fsetattr_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE (frame); + gfs3_fsetattr_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": FSETATTR %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1825,6 +1779,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_fsetattr_rsp); @@ -1839,17 +1794,15 @@ server_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - gfs3_xattrop_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE (frame); + gfs3_xattrop_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": XATTROP %s (%s) ==> (%s)", frame->root->unique, state->loc.path, @@ -1858,13 +1811,14 @@ server_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val), + GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val, rsp.dict.dict_len, op_errno, out); out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_xattrop_rsp); @@ -1881,17 +1835,15 @@ server_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - gfs3_xattrop_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_xattrop_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": FXATTROP %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1900,13 +1852,14 @@ server_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val), + GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val, rsp.dict.dict_len, op_errno, out); out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_fxattrop_rsp); @@ -1920,20 +1873,19 @@ out: int server_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) + int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, + dict_t *xdata) { - gfs3_readdirp_rsp rsp = {0,}; - server_state_t *state = NULL; - rpcsvc_request_t *req = NULL; - int ret = 0; - - req = frame->local; - state = CALL_STATE(frame); + gfs3_readdirp_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; + int ret = 0; - GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, rsp.xdata.xdata_len, op_errno, out); if (op_ret < 0) { + state = CALL_STATE (frame); gf_log (this->name, GF_LOG_INFO, "%"PRId64": READDIRP %"PRId64" (%s) ==> (%s)", frame->root->unique, state->resolve.fd_no, @@ -1959,6 +1911,7 @@ out: rsp.op_ret = op_ret; rsp.op_errno = gf_errno_to_error (op_errno); + req = frame->local; server_submit_reply (frame, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gfs3_readdirp_rsp); @@ -1969,6 +1922,122 @@ out: return 0; } +int +server_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *statpre, struct iatt *statpost, dict_t *xdata) +{ + gfs3_fallocate_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; + + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, + rsp.xdata.xdata_len, op_errno, out); + + if (op_ret) { + state = CALL_STATE (frame); + gf_log (this->name, GF_LOG_INFO, + "%"PRId64": FALLOCATE %"PRId64" (%s) ==> (%s)", + frame->root->unique, state->resolve.fd_no, + uuid_utoa (state->resolve.gfid), + strerror (op_errno)); + goto out; + } + + gf_stat_from_iatt (&rsp.statpre, statpre); + gf_stat_from_iatt (&rsp.statpost, statpost); + +out: + rsp.op_ret = op_ret; + rsp.op_errno = gf_errno_to_error (op_errno); + + req = frame->local; + server_submit_reply(frame, req, &rsp, NULL, 0, NULL, + (xdrproc_t) xdr_gfs3_fallocate_rsp); + + GF_FREE (rsp.xdata.xdata_val); + + return 0; +} + +int +server_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *statpre, struct iatt *statpost, dict_t *xdata) +{ + gfs3_discard_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; + + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val, + rsp.xdata.xdata_len, op_errno, out); + + if (op_ret) { + state = CALL_STATE (frame); + gf_log (this->name, GF_LOG_INFO, + "%"PRId64": DISCARD %"PRId64" (%s) ==> (%s)", + frame->root->unique, state->resolve.fd_no, + uuid_utoa (state->resolve.gfid), + strerror (op_errno)); + goto out; + } + + gf_stat_from_iatt (&rsp.statpre, statpre); + gf_stat_from_iatt (&rsp.statpost, statpost); + +out: + rsp.op_ret = op_ret; + rsp.op_errno = gf_errno_to_error (op_errno); + + req = frame->local; + server_submit_reply(frame, req, &rsp, NULL, 0, NULL, + (xdrproc_t) xdr_gfs3_discard_rsp); + + GF_FREE (rsp.xdata.xdata_val); + + return 0; +} + +int +server_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + struct iatt *statpre, struct iatt *statpost, dict_t *xdata) +{ + gfs3_zerofill_rsp rsp = {0,}; + server_state_t *state = NULL; + rpcsvc_request_t *req = NULL; + + req = frame->local; + state = CALL_STATE (frame); + + GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val), + rsp.xdata.xdata_len, op_errno, out); + + if (op_ret) { + gf_log (this->name, GF_LOG_INFO, + "%"PRId64": ZEROFILL%"PRId64" (%s) ==> (%s)", + frame->root->unique, state->resolve.fd_no, + uuid_utoa (state->resolve.gfid), + strerror (op_errno)); + goto out; + } + + gf_stat_from_iatt (&rsp.statpre, statpre); + gf_stat_from_iatt (&rsp.statpost, statpost); + +out: + rsp.op_ret = op_ret; + rsp.op_errno = gf_errno_to_error (op_errno); + + server_submit_reply(frame, req, &rsp, NULL, 0, NULL, + (xdrproc_t) xdr_gfs3_zerofill_rsp); + + GF_FREE (rsp.xdata.xdata_val); + + return 0; +} + + /* Resume function section */ int @@ -2132,6 +2201,7 @@ err: int server_fentrylk_resume (call_frame_t *frame, xlator_t *bound_xl) { + GF_UNUSED int ret = -1; server_state_t *state = NULL; state = CALL_STATE (frame); @@ -2139,6 +2209,13 @@ server_fentrylk_resume (call_frame_t *frame, xlator_t *bound_xl) if (state->resolve.op_ret != 0) goto err; + if (!state->xdata) + state->xdata = dict_new (); + + if (state->xdata) + ret = dict_set_str (state->xdata, "connection-id", + frame->root->client->client_uid); + STACK_WIND (frame, server_fentrylk_cbk, bound_xl, bound_xl->fops->fentrylk, state->volume, state->fd, state->name, @@ -2155,6 +2232,7 @@ err: int server_entrylk_resume (call_frame_t *frame, xlator_t *bound_xl) { + GF_UNUSED int ret = -1; server_state_t *state = NULL; state = CALL_STATE (frame); @@ -2162,6 +2240,13 @@ server_entrylk_resume (call_frame_t *frame, xlator_t *bound_xl) if (state->resolve.op_ret != 0) goto err; + if (!state->xdata) + state->xdata = dict_new (); + + if (state->xdata) + ret = dict_set_str (state->xdata, "connection-id", + frame->root->client->client_uid); + STACK_WIND (frame, server_entrylk_cbk, bound_xl, bound_xl->fops->entrylk, state->volume, &state->loc, state->name, @@ -2177,13 +2262,24 @@ err: int server_finodelk_resume (call_frame_t *frame, xlator_t *bound_xl) { + GF_UNUSED int ret = -1; server_state_t *state = NULL; + gf_log (bound_xl->name, GF_LOG_WARNING, "frame %p, xlator %p", + frame, bound_xl); + state = CALL_STATE (frame); if (state->resolve.op_ret != 0) goto err; + if (!state->xdata) + state->xdata = dict_new (); + + if (state->xdata) + ret = dict_set_str (state->xdata, "connection-id", + frame->root->client->client_uid); + STACK_WIND (frame, server_finodelk_cbk, bound_xl, bound_xl->fops->finodelk, state->volume, state->fd, state->cmd, &state->flock, state->xdata); @@ -2199,13 +2295,24 @@ err: int server_inodelk_resume (call_frame_t *frame, xlator_t *bound_xl) { + GF_UNUSED int ret = -1; server_state_t *state = NULL; + gf_log (bound_xl->name, GF_LOG_WARNING, "frame %p, xlator %p", + frame, bound_xl); + state = CALL_STATE (frame); if (state->resolve.op_ret != 0) goto err; + if (!state->xdata) + state->xdata = dict_new (); + + if (state->xdata) + ret = dict_set_str (state->xdata, "connection-id", + frame->root->client->client_uid); + STACK_WIND (frame, server_inodelk_cbk, bound_xl, bound_xl->fops->inodelk, state->volume, &state->loc, state->cmd, &state->flock, state->xdata); @@ -2914,6 +3021,69 @@ err: return 0; } +int +server_fallocate_resume (call_frame_t *frame, xlator_t *bound_xl) +{ + server_state_t *state = NULL; + + state = CALL_STATE (frame); + + if (state->resolve.op_ret != 0) + goto err; + + STACK_WIND (frame, server_fallocate_cbk, + bound_xl, bound_xl->fops->fallocate, + state->fd, state->flags, state->offset, state->size, + state->xdata); + return 0; +err: + server_fallocate_cbk(frame, NULL, frame->this, state->resolve.op_ret, + state->resolve.op_errno, NULL, NULL, NULL); + + return 0; +} + +int +server_discard_resume (call_frame_t *frame, xlator_t *bound_xl) +{ + server_state_t *state = NULL; + + state = CALL_STATE (frame); + + if (state->resolve.op_ret != 0) + goto err; + + STACK_WIND (frame, server_discard_cbk, + bound_xl, bound_xl->fops->discard, + state->fd, state->offset, state->size, state->xdata); + return 0; +err: + server_discard_cbk(frame, NULL, frame->this, state->resolve.op_ret, + state->resolve.op_errno, NULL, NULL, NULL); + + return 0; +} + +int +server_zerofill_resume (call_frame_t *frame, xlator_t *bound_xl) +{ + server_state_t *state = NULL; + + state = CALL_STATE (frame); + + if (state->resolve.op_ret != 0) + goto err; + + STACK_WIND (frame, server_zerofill_cbk, + bound_xl, bound_xl->fops->zerofill, + state->fd, state->offset, state->size, state->xdata); + return 0; +err: + server_zerofill_cbk(frame, NULL, frame->this, state->resolve.op_ret, + state->resolve.op_errno, NULL, NULL, NULL); + + return 0; +} @@ -2936,31 +3106,31 @@ server3_3_stat (rpcsvc_request_t *req) ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_stat_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { - // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_STAT; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } state->resolve.type = RESOLVE_MUST; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); @@ -2971,7 +3141,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -2993,22 +3163,22 @@ server3_3_setattr (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_setattr_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_SETATTR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3018,9 +3188,10 @@ server3_3_setattr (rpcsvc_request_t *req) gf_stat_to_iatt (&args.stbuf, &state->stbuf); state->valid = args.valid; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3028,7 +3199,7 @@ server3_3_setattr (rpcsvc_request_t *req) out: if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); free (args.xdata.xdata_val); @@ -3052,22 +3223,21 @@ server3_3_fsetattr (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fsetattr_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { - // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FSETATTR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3077,13 +3247,196 @@ server3_3_fsetattr (rpcsvc_request_t *req) gf_stat_to_iatt (&args.stbuf, &state->stbuf); state->valid = args.valid; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, + op_errno, out); + + ret = 0; + resolve_and_resume (frame, server_fsetattr_resume); + +out: + free (args.xdata.xdata_val); + + if (op_errno) + SERVER_REQ_SET_ERROR (req, ret); + + return ret; +} + +int +server3_3_fallocate(rpcsvc_request_t *req) +{ + server_state_t *state = NULL; + call_frame_t *frame = NULL; + gfs3_fallocate_req args = {{0},}; + int ret = -1; + int op_errno = 0; + + if (!req) + return ret; + + ret = xdr_to_generic (req->msg[0], &args, + (xdrproc_t)xdr_gfs3_fallocate_req); + if (ret < 0) { + //failed to decode msg; + SERVER_REQ_SET_ERROR (req, ret); + goto out; + } + + frame = get_frame_from_request (req); + if (!frame) { + // something wrong, mostly insufficient memory + SERVER_REQ_SET_ERROR (req, ret); + goto out; + } + frame->root->op = GF_FOP_FALLOCATE; + + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { + /* auth failure, request on subvolume without setvolume */ + SERVER_REQ_SET_ERROR (req, ret); + goto out; + } + + state->resolve.type = RESOLVE_MUST; + state->resolve.fd_no = args.fd; + + state->flags = args.flags; + state->offset = args.offset; + state->size = args.size; + memcpy(state->resolve.gfid, args.gfid, 16); + + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, + op_errno, out); + + ret = 0; + resolve_and_resume (frame, server_fallocate_resume); + +out: + free (args.xdata.xdata_val); + + if (op_errno) + SERVER_REQ_SET_ERROR (req, ret); + + return ret; +} + + +int +server3_3_discard(rpcsvc_request_t *req) +{ + server_state_t *state = NULL; + call_frame_t *frame = NULL; + gfs3_discard_req args = {{0},}; + int ret = -1; + int op_errno = 0; + + if (!req) + return ret; + + ret = xdr_to_generic (req->msg[0], &args, + (xdrproc_t)xdr_gfs3_discard_req); + if (ret < 0) { + //failed to decode msg; + SERVER_REQ_SET_ERROR (req, ret); + goto out; + } + + frame = get_frame_from_request (req); + if (!frame) { + // something wrong, mostly insufficient memory + SERVER_REQ_SET_ERROR (req, ret); + goto out; + } + frame->root->op = GF_FOP_DISCARD; + + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { + /* auth failure, request on subvolume without setvolume */ + SERVER_REQ_SET_ERROR (req, ret); + goto out; + } + + state->resolve.type = RESOLVE_MUST; + state->resolve.fd_no = args.fd; + + state->offset = args.offset; + state->size = args.size; + memcpy(state->resolve.gfid, args.gfid, 16); + + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, + op_errno, out); + + ret = 0; + resolve_and_resume (frame, server_discard_resume); + +out: + free (args.xdata.xdata_val); + + if (op_errno) + SERVER_REQ_SET_ERROR (req, ret); + + return ret; +} + + +int +server3_3_zerofill(rpcsvc_request_t *req) +{ + server_state_t *state = NULL; + call_frame_t *frame = NULL; + gfs3_zerofill_req args = {{0},}; + int ret = -1; + int op_errno = 0; + + if (!req) + return ret; + + ret = xdr_to_generic (req->msg[0], &args, + (xdrproc_t)xdr_gfs3_zerofill_req); + if (ret < 0) { + /*failed to decode msg*/; + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + frame = get_frame_from_request (req); + if (!frame) { + /* something wrong, mostly insufficient memory*/ + req->rpc_err = GARBAGE_ARGS; /* TODO */ + goto out; + } + frame->root->op = GF_FOP_ZEROFILL; + + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { + /* auth failure, request on subvolume without setvolume */ + req->rpc_err = GARBAGE_ARGS; + goto out; + } + + state->resolve.type = RESOLVE_MUST; + state->resolve.fd_no = args.fd; + + state->offset = args.offset; + state->size = args.size; + memcpy(state->resolve.gfid, args.gfid, 16); + + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, state->xdata, (args.xdata.xdata_val), (args.xdata.xdata_len), ret, op_errno, out); ret = 0; - resolve_and_resume (frame, server_fsetattr_resume); + resolve_and_resume (frame, server_zerofill_resume); out: free (args.xdata.xdata_val); @@ -3094,7 +3447,6 @@ out: return ret; } - int server3_3_readlink (rpcsvc_request_t *req) { @@ -3111,22 +3463,22 @@ server3_3_readlink (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_readlink_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_READLINK; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3135,9 +3487,10 @@ server3_3_readlink (rpcsvc_request_t *req) state->size = args.size; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3147,7 +3500,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3171,22 +3524,22 @@ server3_3_create (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_create_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_CREATE; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3204,9 +3557,10 @@ server3_3_create (rpcsvc_request_t *req) } /* TODO: can do alloca for xdata field instead of stdalloc */ - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3217,7 +3571,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3238,22 +3592,22 @@ server3_3_open (rpcsvc_request_t *req) ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_open_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_OPEN; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3262,16 +3616,17 @@ server3_3_open (rpcsvc_request_t *req) state->flags = gf_flags_to_flags (args.flags); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; resolve_and_resume (frame, server_open_resume); out: if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); free (args.xdata.xdata_val); @@ -3294,22 +3649,22 @@ server3_3_readv (rpcsvc_request_t *req) ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_read_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_READ; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3321,9 +3676,10 @@ server3_3_readv (rpcsvc_request_t *req) memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3333,7 +3689,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3357,22 +3713,22 @@ server3_3_writev (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_write_req); if (len < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_WRITE; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3400,9 +3756,10 @@ server3_3_writev (rpcsvc_request_t *req) state->size += state->payload_vector[i].iov_len; } - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); #ifdef GF_TESTING_IO_XDATA @@ -3415,7 +3772,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3481,26 +3838,36 @@ server3_3_writev_vecsizer (int state, ssize_t *readsize, char *base_addr, int server3_3_release (rpcsvc_request_t *req) { - server_connection_t *conn = NULL; - gfs3_release_req args = {{0,},}; - gf_common_rsp rsp = {0,}; - int ret = -1; + client_t *client = NULL; + server_ctx_t *serv_ctx = NULL; + gfs3_release_req args = {{0,},}; + gf_common_rsp rsp = {0,}; + int ret = -1; ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_release_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } - conn = req->trans->xl_private; - if (!conn) { + client = req->trans->xl_private; + if (!client) { /* Handshake is not complete yet. */ req->rpc_err = SYSTEM_ERR; goto out; } - gf_fd_put (conn->fdtable, args.fd); + + serv_ctx = server_ctx_get (client, client->this); + if (serv_ctx == NULL) { + gf_log (req->trans->name, GF_LOG_INFO, + "server_ctx_get() failed"); + req->rpc_err = SYSTEM_ERR; + goto out; + } + + gf_fd_put (serv_ctx->fdtable, args.fd); server_submit_reply (NULL, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -3513,26 +3880,35 @@ out: int server3_3_releasedir (rpcsvc_request_t *req) { - server_connection_t *conn = NULL; - gfs3_releasedir_req args = {{0,},}; - gf_common_rsp rsp = {0,}; - int ret = -1; + client_t *client = NULL; + server_ctx_t *serv_ctx = NULL; + gfs3_releasedir_req args = {{0,},}; + gf_common_rsp rsp = {0,}; + int ret = -1; ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_release_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } - conn = req->trans->xl_private; - if (!conn) { - req->rpc_err = GARBAGE_ARGS; + client = req->trans->xl_private; + if (!client) { + SERVER_REQ_SET_ERROR (req, ret); + goto out; + } + + serv_ctx = server_ctx_get (client, client->this); + if (serv_ctx == NULL) { + gf_log (req->trans->name, GF_LOG_INFO, + "server_ctx_get() failed"); + req->rpc_err = SYSTEM_ERR; goto out; } - gf_fd_put (conn->fdtable, args.fd); + gf_fd_put (serv_ctx->fdtable, args.fd); server_submit_reply (NULL, req, &rsp, NULL, 0, NULL, (xdrproc_t)xdr_gf_common_rsp); @@ -3559,22 +3935,22 @@ server3_3_fsync (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fsync_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FSYNC; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3583,9 +3959,10 @@ server3_3_fsync (rpcsvc_request_t *req) state->flags = args.data; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3594,7 +3971,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3617,22 +3994,22 @@ server3_3_flush (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_flush_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FLUSH; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3640,9 +4017,10 @@ server3_3_flush (rpcsvc_request_t *req) state->resolve.fd_no = args.fd; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3651,7 +4029,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3674,22 +4052,22 @@ server3_3_ftruncate (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_ftruncate_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FTRUNCATE; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3698,9 +4076,10 @@ server3_3_ftruncate (rpcsvc_request_t *req) state->offset = args.offset; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3709,7 +4088,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3731,22 +4110,22 @@ server3_3_fstat (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fstat_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FSTAT; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3754,9 +4133,10 @@ server3_3_fstat (rpcsvc_request_t *req) state->resolve.fd_no = args.fd; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3765,7 +4145,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3787,22 +4167,22 @@ server3_3_truncate (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_truncate_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_TRUNCATE; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3810,9 +4190,10 @@ server3_3_truncate (rpcsvc_request_t *req) memcpy (state->resolve.gfid, args.gfid, 16); state->offset = args.offset; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3821,7 +4202,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3846,22 +4227,22 @@ server3_3_unlink (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_unlink_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_UNLINK; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3871,9 +4252,10 @@ server3_3_unlink (rpcsvc_request_t *req) state->flags = args.xflags; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3882,7 +4264,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -3907,22 +4289,22 @@ server3_3_setxattr (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_setxattr_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_SETXATTR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -3930,7 +4312,8 @@ server3_3_setxattr (rpcsvc_request_t *req) state->flags = args.flags; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + dict, (args.dict.dict_val), (args.dict.dict_len), ret, op_errno, out); @@ -3940,9 +4323,10 @@ server3_3_setxattr (rpcsvc_request_t *req) /* There can be some commands hidden in key, check and proceed */ gf_server_check_setxattr_cmd (frame, dict); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -3953,7 +4337,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); if (dict) dict_unref (dict); @@ -3981,22 +4365,22 @@ server3_3_fsetxattr (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fsetxattr_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FSETXATTR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4005,16 +4389,18 @@ server3_3_fsetxattr (rpcsvc_request_t *req) state->flags = args.flags; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + dict, (args.dict.dict_val), (args.dict.dict_len), ret, op_errno, out); state->dict = dict; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4025,7 +4411,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); if (dict) dict_unref (dict); @@ -4053,22 +4439,22 @@ server3_3_fxattrop (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fxattrop_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FXATTROP; - state = CALL_STATE(frame); - if (!state->conn->bound_xl) { + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4077,16 +4463,18 @@ server3_3_fxattrop (rpcsvc_request_t *req) state->flags = args.flags; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + dict, (args.dict.dict_val), (args.dict.dict_len), ret, op_errno, out); state->dict = dict; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4098,7 +4486,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); if (dict) dict_unref (dict); @@ -4127,22 +4515,22 @@ server3_3_xattrop (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_xattrop_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_XATTROP; - state = CALL_STATE(frame); - if (!state->conn->bound_xl) { + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4150,16 +4538,18 @@ server3_3_xattrop (rpcsvc_request_t *req) state->flags = args.flags; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + dict, (args.dict.dict_val), (args.dict.dict_len), ret, op_errno, out); state->dict = dict; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4170,7 +4560,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); if (dict) dict_unref (dict); @@ -4197,22 +4587,22 @@ server3_3_getxattr (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_getxattr_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_GETXATTR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4225,9 +4615,10 @@ server3_3_getxattr (rpcsvc_request_t *req) gf_server_check_getxattr_cmd (frame, state->name); } - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4236,7 +4627,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4259,22 +4650,22 @@ server3_3_fgetxattr (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fgetxattr_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FGETXATTR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4285,9 +4676,10 @@ server3_3_fgetxattr (rpcsvc_request_t *req) if (args.namelen) state->name = gf_strdup (args.name); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4296,7 +4688,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4321,22 +4713,22 @@ server3_3_removexattr (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_removexattr_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_REMOVEXATTR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4344,9 +4736,10 @@ server3_3_removexattr (rpcsvc_request_t *req) memcpy (state->resolve.gfid, args.gfid, 16); state->name = gf_strdup (args.name); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4355,7 +4748,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4378,22 +4771,22 @@ server3_3_fremovexattr (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fremovexattr_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FREMOVEXATTR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4402,9 +4795,10 @@ server3_3_fremovexattr (rpcsvc_request_t *req) memcpy (state->resolve.gfid, args.gfid, 16); state->name = gf_strdup (args.name); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4413,7 +4807,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4437,31 +4831,32 @@ server3_3_opendir (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_opendir_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_OPENDIR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } state->resolve.type = RESOLVE_MUST; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4470,7 +4865,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4493,22 +4888,22 @@ server3_3_readdirp (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_readdirp_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_READDIRP; - state = CALL_STATE(frame); - if (!state->conn->bound_xl) { + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4529,7 +4924,8 @@ server3_3_readdirp (rpcsvc_request_t *req) memcpy (state->resolve.gfid, args.gfid, 16); /* here, dict itself works as xdata */ - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->dict, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->dict, (args.dict.dict_val), (args.dict.dict_len), ret, op_errno, out); @@ -4539,7 +4935,7 @@ server3_3_readdirp (rpcsvc_request_t *req) resolve_and_resume (frame, server_readdirp_resume); out: if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); free (args.dict.dict_val); @@ -4563,22 +4959,22 @@ server3_3_readdir (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_readdir_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_READDIR; - state = CALL_STATE(frame); - if (!state->conn->bound_xl) { + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4598,9 +4994,10 @@ server3_3_readdir (rpcsvc_request_t *req) state->offset = args.offset; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4609,7 +5006,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4630,22 +5027,22 @@ server3_3_fsyncdir (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fsyncdir_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FSYNCDIR; - state = CALL_STATE(frame); - if (!state->conn->bound_xl) { + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4654,9 +5051,10 @@ server3_3_fsyncdir (rpcsvc_request_t *req) state->flags = args.data; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4665,7 +5063,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4690,22 +5088,22 @@ server3_3_mknod (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_mknod_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_MKNOD; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4717,9 +5115,10 @@ server3_3_mknod (rpcsvc_request_t *req) state->dev = args.dev; state->umask = args.umask; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4727,7 +5126,7 @@ server3_3_mknod (rpcsvc_request_t *req) out: if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); /* memory allocated by libc, don't use GF_FREE */ free (args.xdata.xdata_val); @@ -4755,22 +5154,22 @@ server3_3_mkdir (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_mkdir_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_MKDIR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4782,9 +5181,10 @@ server3_3_mkdir (rpcsvc_request_t *req) state->umask = args.umask; /* TODO: can do alloca for xdata field instead of stdalloc */ - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4792,7 +5192,7 @@ server3_3_mkdir (rpcsvc_request_t *req) out: if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); free (args.xdata.xdata_val); @@ -4818,22 +5218,22 @@ server3_3_rmdir (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_rmdir_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_RMDIR; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4843,9 +5243,10 @@ server3_3_rmdir (rpcsvc_request_t *req) state->flags = args.xflags; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4854,7 +5255,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4880,22 +5281,22 @@ server3_3_inodelk (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_inodelk_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_INODELK; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -4932,9 +5333,10 @@ server3_3_inodelk (rpcsvc_request_t *req) break; } - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -4945,7 +5347,7 @@ out: free (args.flock.lk_owner.lk_owner_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -4967,22 +5369,22 @@ server3_3_finodelk (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_finodelk_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FINODELK; - state = CALL_STATE(frame); - if (!state->conn->bound_xl) { + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5020,9 +5422,10 @@ server3_3_finodelk (rpcsvc_request_t *req) break; } - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5033,7 +5436,7 @@ out: free (args.flock.lk_owner.lk_owner_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -5058,22 +5461,22 @@ server3_3_entrylk (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_entrylk_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_ENTRYLK; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5087,9 +5490,10 @@ server3_3_entrylk (rpcsvc_request_t *req) state->cmd = args.cmd; state->type = args.type; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5098,7 +5502,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -5122,22 +5526,22 @@ server3_3_fentrylk (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_fentrylk_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_FENTRYLK; - state = CALL_STATE(frame); - if (!state->conn->bound_xl) { + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5151,9 +5555,10 @@ server3_3_fentrylk (rpcsvc_request_t *req) state->name = gf_strdup (args.name); state->volume = gf_strdup (args.volume); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5162,7 +5567,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -5183,22 +5588,22 @@ server3_3_access (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_access_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_ACCESS; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5206,9 +5611,10 @@ server3_3_access (rpcsvc_request_t *req) memcpy (state->resolve.gfid, args.gfid, 16); state->mask = args.mask; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5217,7 +5623,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -5243,22 +5649,22 @@ server3_3_symlink (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_symlink_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_SYMLINK; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5268,9 +5674,10 @@ server3_3_symlink (rpcsvc_request_t *req) state->name = gf_strdup (args.linkname); state->umask = args.umask; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5278,7 +5685,7 @@ server3_3_symlink (rpcsvc_request_t *req) out: if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); /* memory allocated by libc, don't use GF_FREE */ free (args.xdata.xdata_val); @@ -5305,22 +5712,22 @@ server3_3_link (rpcsvc_request_t *req) ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_link_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_LINK; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5331,9 +5738,10 @@ server3_3_link (rpcsvc_request_t *req) state->resolve2.bname = gf_strdup (args.newbname); memcpy (state->resolve2.pargfid, args.newgfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5342,7 +5750,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -5367,22 +5775,22 @@ server3_3_rename (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_rename_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_RENAME; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5394,9 +5802,10 @@ server3_3_rename (rpcsvc_request_t *req) state->resolve2.bname = gf_strdup (args.newbname); memcpy (state->resolve2.pargfid, args.newgfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5405,7 +5814,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -5425,22 +5834,22 @@ server3_3_lk (rpcsvc_request_t *req) ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_lk_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_LK; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5488,7 +5897,7 @@ server3_3_lk (rpcsvc_request_t *req) state->flock.l_type = F_UNLCK; break; default: - gf_log (state->conn->bound_xl->name, GF_LOG_ERROR, + gf_log (frame->root->client->bound_xl->name, GF_LOG_ERROR, "fd - %"PRId64" (%s): Unknown lock type: %"PRId32"!", state->resolve.fd_no, uuid_utoa (state->fd->inode->gfid), state->type); @@ -5496,9 +5905,10 @@ server3_3_lk (rpcsvc_request_t *req) } - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5509,7 +5919,7 @@ out: free (args.flock.lk_owner.lk_owner_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -5531,22 +5941,22 @@ server3_3_rchecksum (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_rchecksum_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_RCHECKSUM; - state = CALL_STATE(frame); - if (!state->conn->bound_xl) { + state = CALL_STATE (frame); + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5555,9 +5965,10 @@ server3_3_rchecksum (rpcsvc_request_t *req) state->offset = args.offset; state->size = args.len; - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5566,7 +5977,7 @@ out: free (args.xdata.xdata_val); if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } @@ -5603,14 +6014,14 @@ server3_3_lookup (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_lookup_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto err; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto err; } frame->root->op = GF_FOP_LOOKUP; @@ -5620,9 +6031,9 @@ server3_3_lookup (rpcsvc_request_t *req) */ state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } @@ -5635,9 +6046,10 @@ server3_3_lookup (rpcsvc_request_t *req) memcpy (state->resolve.gfid, args.gfid, 16); } - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; @@ -5669,88 +6081,92 @@ server3_3_statfs (rpcsvc_request_t *req) (xdrproc_t)xdr_gfs3_statfs_req); if (ret < 0) { //failed to decode msg; - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame = get_frame_from_request (req); if (!frame) { // something wrong, mostly insufficient memory - req->rpc_err = GARBAGE_ARGS; /* TODO */ + SERVER_REQ_SET_ERROR (req, ret); goto out; } frame->root->op = GF_FOP_STATFS; state = CALL_STATE (frame); - if (!state->conn->bound_xl) { + if (!frame->root->client->bound_xl) { /* auth failure, request on subvolume without setvolume */ - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); goto out; } state->resolve.type = RESOLVE_MUST; memcpy (state->resolve.gfid, args.gfid, 16); - GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata, - (args.xdata.xdata_val), - (args.xdata.xdata_len), ret, + GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, + state->xdata, + args.xdata.xdata_val, + args.xdata.xdata_len, ret, op_errno, out); ret = 0; resolve_and_resume (frame, server_statfs_resume); out: if (op_errno) - req->rpc_err = GARBAGE_ARGS; + SERVER_REQ_SET_ERROR (req, ret); return ret; } rpcsvc_actor_t glusterfs3_3_fop_actors[] = { - [GFS3_OP_NULL] = { "NULL", GFS3_OP_NULL, server_null, NULL, 0}, - [GFS3_OP_STAT] = { "STAT", GFS3_OP_STAT, server3_3_stat, NULL, 0}, - [GFS3_OP_READLINK] = { "READLINK", GFS3_OP_READLINK, server3_3_readlink, NULL, 0}, - [GFS3_OP_MKNOD] = { "MKNOD", GFS3_OP_MKNOD, server3_3_mknod, NULL, 0}, - [GFS3_OP_MKDIR] = { "MKDIR", GFS3_OP_MKDIR, server3_3_mkdir, NULL, 0}, - [GFS3_OP_UNLINK] = { "UNLINK", GFS3_OP_UNLINK, server3_3_unlink, NULL, 0}, - [GFS3_OP_RMDIR] = { "RMDIR", GFS3_OP_RMDIR, server3_3_rmdir, NULL, 0}, - [GFS3_OP_SYMLINK] = { "SYMLINK", GFS3_OP_SYMLINK, server3_3_symlink, NULL, 0}, - [GFS3_OP_RENAME] = { "RENAME", GFS3_OP_RENAME, server3_3_rename, NULL, 0}, - [GFS3_OP_LINK] = { "LINK", GFS3_OP_LINK, server3_3_link, NULL, 0}, - [GFS3_OP_TRUNCATE] = { "TRUNCATE", GFS3_OP_TRUNCATE, server3_3_truncate, NULL, 0}, - [GFS3_OP_OPEN] = { "OPEN", GFS3_OP_OPEN, server3_3_open, NULL, 0}, - [GFS3_OP_READ] = { "READ", GFS3_OP_READ, server3_3_readv, NULL, 0}, - [GFS3_OP_WRITE] = { "WRITE", GFS3_OP_WRITE, server3_3_writev, server3_3_writev_vecsizer, 0}, - [GFS3_OP_STATFS] = { "STATFS", GFS3_OP_STATFS, server3_3_statfs, NULL, 0}, - [GFS3_OP_FLUSH] = { "FLUSH", GFS3_OP_FLUSH, server3_3_flush, NULL, 0}, - [GFS3_OP_FSYNC] = { "FSYNC", GFS3_OP_FSYNC, server3_3_fsync, NULL, 0}, - [GFS3_OP_SETXATTR] = { "SETXATTR", GFS3_OP_SETXATTR, server3_3_setxattr, NULL, 0}, - [GFS3_OP_GETXATTR] = { "GETXATTR", GFS3_OP_GETXATTR, server3_3_getxattr, NULL, 0}, - [GFS3_OP_REMOVEXATTR] = { "REMOVEXATTR", GFS3_OP_REMOVEXATTR, server3_3_removexattr, NULL, 0}, - [GFS3_OP_OPENDIR] = { "OPENDIR", GFS3_OP_OPENDIR, server3_3_opendir, NULL, 0}, - [GFS3_OP_FSYNCDIR] = { "FSYNCDIR", GFS3_OP_FSYNCDIR, server3_3_fsyncdir, NULL, 0}, - [GFS3_OP_ACCESS] = { "ACCESS", GFS3_OP_ACCESS, server3_3_access, NULL, 0}, - [GFS3_OP_CREATE] = { "CREATE", GFS3_OP_CREATE, server3_3_create, NULL, 0}, - [GFS3_OP_FTRUNCATE] = { "FTRUNCATE", GFS3_OP_FTRUNCATE, server3_3_ftruncate, NULL, 0}, - [GFS3_OP_FSTAT] = { "FSTAT", GFS3_OP_FSTAT, server3_3_fstat, NULL, 0}, - [GFS3_OP_LK] = { "LK", GFS3_OP_LK, server3_3_lk, NULL, 0}, - [GFS3_OP_LOOKUP] = { "LOOKUP", GFS3_OP_LOOKUP, server3_3_lookup, NULL, 0}, - [GFS3_OP_READDIR] = { "READDIR", GFS3_OP_READDIR, server3_3_readdir, NULL, 0}, - [GFS3_OP_INODELK] = { "INODELK", GFS3_OP_INODELK, server3_3_inodelk, NULL, 0}, - [GFS3_OP_FINODELK] = { "FINODELK", GFS3_OP_FINODELK, server3_3_finodelk, NULL, 0}, - [GFS3_OP_ENTRYLK] = { "ENTRYLK", GFS3_OP_ENTRYLK, server3_3_entrylk, NULL, 0}, - [GFS3_OP_FENTRYLK] = { "FENTRYLK", GFS3_OP_FENTRYLK, server3_3_fentrylk, NULL, 0}, - [GFS3_OP_XATTROP] = { "XATTROP", GFS3_OP_XATTROP, server3_3_xattrop, NULL, 0}, - [GFS3_OP_FXATTROP] = { "FXATTROP", GFS3_OP_FXATTROP, server3_3_fxattrop, NULL, 0}, - [GFS3_OP_FGETXATTR] = { "FGETXATTR", GFS3_OP_FGETXATTR, server3_3_fgetxattr, NULL, 0}, - [GFS3_OP_FSETXATTR] = { "FSETXATTR", GFS3_OP_FSETXATTR, server3_3_fsetxattr, NULL, 0}, - [GFS3_OP_RCHECKSUM] = { "RCHECKSUM", GFS3_OP_RCHECKSUM, server3_3_rchecksum, NULL, 0}, - [GFS3_OP_SETATTR] = { "SETATTR", GFS3_OP_SETATTR, server3_3_setattr, NULL, 0}, - [GFS3_OP_FSETATTR] = { "FSETATTR", GFS3_OP_FSETATTR, server3_3_fsetattr, NULL, 0}, - [GFS3_OP_READDIRP] = { "READDIRP", GFS3_OP_READDIRP, server3_3_readdirp, NULL, 0}, - [GFS3_OP_RELEASE] = { "RELEASE", GFS3_OP_RELEASE, server3_3_release, NULL, 0}, - [GFS3_OP_RELEASEDIR] = { "RELEASEDIR", GFS3_OP_RELEASEDIR, server3_3_releasedir, NULL, 0}, - [GFS3_OP_FREMOVEXATTR] = { "FREMOVEXATTR", GFS3_OP_FREMOVEXATTR, server3_3_fremovexattr, NULL, 0}, + [GFS3_OP_NULL] = {"NULL", GFS3_OP_NULL, server_null, NULL, 0, DRC_NA}, + [GFS3_OP_STAT] = {"STAT", GFS3_OP_STAT, server3_3_stat, NULL, 0, DRC_NA}, + [GFS3_OP_READLINK] = {"READLINK", GFS3_OP_READLINK, server3_3_readlink, NULL, 0, DRC_NA}, + [GFS3_OP_MKNOD] = {"MKNOD", GFS3_OP_MKNOD, server3_3_mknod, NULL, 0, DRC_NA}, + [GFS3_OP_MKDIR] = {"MKDIR", GFS3_OP_MKDIR, server3_3_mkdir, NULL, 0, DRC_NA}, + [GFS3_OP_UNLINK] = {"UNLINK", GFS3_OP_UNLINK, server3_3_unlink, NULL, 0, DRC_NA}, + [GFS3_OP_RMDIR] = {"RMDIR", GFS3_OP_RMDIR, server3_3_rmdir, NULL, 0, DRC_NA}, + [GFS3_OP_SYMLINK] = {"SYMLINK", GFS3_OP_SYMLINK, server3_3_symlink, NULL, 0, DRC_NA}, + [GFS3_OP_RENAME] = {"RENAME", GFS3_OP_RENAME, server3_3_rename, NULL, 0, DRC_NA}, + [GFS3_OP_LINK] = {"LINK", GFS3_OP_LINK, server3_3_link, NULL, 0, DRC_NA}, + [GFS3_OP_TRUNCATE] = {"TRUNCATE", GFS3_OP_TRUNCATE, server3_3_truncate, NULL, 0, DRC_NA}, + [GFS3_OP_OPEN] = {"OPEN", GFS3_OP_OPEN, server3_3_open, NULL, 0, DRC_NA}, + [GFS3_OP_READ] = {"READ", GFS3_OP_READ, server3_3_readv, NULL, 0, DRC_NA}, + [GFS3_OP_WRITE] = {"WRITE", GFS3_OP_WRITE, server3_3_writev, server3_3_writev_vecsizer, 0, DRC_NA}, + [GFS3_OP_STATFS] = {"STATFS", GFS3_OP_STATFS, server3_3_statfs, NULL, 0, DRC_NA}, + [GFS3_OP_FLUSH] = {"FLUSH", GFS3_OP_FLUSH, server3_3_flush, NULL, 0, DRC_NA}, + [GFS3_OP_FSYNC] = {"FSYNC", GFS3_OP_FSYNC, server3_3_fsync, NULL, 0, DRC_NA}, + [GFS3_OP_SETXATTR] = {"SETXATTR", GFS3_OP_SETXATTR, server3_3_setxattr, NULL, 0, DRC_NA}, + [GFS3_OP_GETXATTR] = {"GETXATTR", GFS3_OP_GETXATTR, server3_3_getxattr, NULL, 0, DRC_NA}, + [GFS3_OP_REMOVEXATTR] = {"REMOVEXATTR", GFS3_OP_REMOVEXATTR, server3_3_removexattr, NULL, 0, DRC_NA}, + [GFS3_OP_OPENDIR] = {"OPENDIR", GFS3_OP_OPENDIR, server3_3_opendir, NULL, 0, DRC_NA}, + [GFS3_OP_FSYNCDIR] = {"FSYNCDIR", GFS3_OP_FSYNCDIR, server3_3_fsyncdir, NULL, 0, DRC_NA}, + [GFS3_OP_ACCESS] = {"ACCESS", GFS3_OP_ACCESS, server3_3_access, NULL, 0, DRC_NA}, + [GFS3_OP_CREATE] = {"CREATE", GFS3_OP_CREATE, server3_3_create, NULL, 0, DRC_NA}, + [GFS3_OP_FTRUNCATE] = {"FTRUNCATE", GFS3_OP_FTRUNCATE, server3_3_ftruncate, NULL, 0, DRC_NA}, + [GFS3_OP_FSTAT] = {"FSTAT", GFS3_OP_FSTAT, server3_3_fstat, NULL, 0, DRC_NA}, + [GFS3_OP_LK] = {"LK", GFS3_OP_LK, server3_3_lk, NULL, 0, DRC_NA}, + [GFS3_OP_LOOKUP] = {"LOOKUP", GFS3_OP_LOOKUP, server3_3_lookup, NULL, 0, DRC_NA}, + [GFS3_OP_READDIR] = {"READDIR", GFS3_OP_READDIR, server3_3_readdir, NULL, 0, DRC_NA}, + [GFS3_OP_INODELK] = {"INODELK", GFS3_OP_INODELK, server3_3_inodelk, NULL, 0, DRC_NA}, + [GFS3_OP_FINODELK] = {"FINODELK", GFS3_OP_FINODELK, server3_3_finodelk, NULL, 0, DRC_NA}, + [GFS3_OP_ENTRYLK] = {"ENTRYLK", GFS3_OP_ENTRYLK, server3_3_entrylk, NULL, 0, DRC_NA}, + [GFS3_OP_FENTRYLK] = {"FENTRYLK", GFS3_OP_FENTRYLK, server3_3_fentrylk, NULL, 0, DRC_NA}, + [GFS3_OP_XATTROP] = {"XATTROP", GFS3_OP_XATTROP, server3_3_xattrop, NULL, 0, DRC_NA}, + [GFS3_OP_FXATTROP] = {"FXATTROP", GFS3_OP_FXATTROP, server3_3_fxattrop, NULL, 0, DRC_NA}, + [GFS3_OP_FGETXATTR] = {"FGETXATTR", GFS3_OP_FGETXATTR, server3_3_fgetxattr, NULL, 0, DRC_NA}, + [GFS3_OP_FSETXATTR] = {"FSETXATTR", GFS3_OP_FSETXATTR, server3_3_fsetxattr, NULL, 0, DRC_NA}, + [GFS3_OP_RCHECKSUM] = {"RCHECKSUM", GFS3_OP_RCHECKSUM, server3_3_rchecksum, NULL, 0, DRC_NA}, + [GFS3_OP_SETATTR] = {"SETATTR", GFS3_OP_SETATTR, server3_3_setattr, NULL, 0, DRC_NA}, + [GFS3_OP_FSETATTR] = {"FSETATTR", GFS3_OP_FSETATTR, server3_3_fsetattr, NULL, 0, DRC_NA}, + [GFS3_OP_READDIRP] = {"READDIRP", GFS3_OP_READDIRP, server3_3_readdirp, NULL, 0, DRC_NA}, + [GFS3_OP_RELEASE] = {"RELEASE", GFS3_OP_RELEASE, server3_3_release, NULL, 0, DRC_NA}, + [GFS3_OP_RELEASEDIR] = {"RELEASEDIR", GFS3_OP_RELEASEDIR, server3_3_releasedir, NULL, 0, DRC_NA}, + [GFS3_OP_FREMOVEXATTR] = {"FREMOVEXATTR", GFS3_OP_FREMOVEXATTR, server3_3_fremovexattr, NULL, 0, DRC_NA}, + [GFS3_OP_FALLOCATE] = {"FALLOCATE", GFS3_OP_FALLOCATE, server3_3_fallocate, NULL, 0, DRC_NA}, + [GFS3_OP_DISCARD] = {"DISCARD", GFS3_OP_DISCARD, server3_3_discard, NULL, 0, DRC_NA}, + [GFS3_OP_ZEROFILL] = {"ZEROFILL", GFS3_OP_ZEROFILL, server3_3_zerofill, NULL, 0, DRC_NA}, }; diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c index 63e3ce918..589bd7b36 100644 --- a/xlators/protocol/server/src/server.c +++ b/xlators/protocol/server/src/server.c @@ -25,33 +25,56 @@ #include "statedump.h" #include "defaults.h" #include "authenticate.h" -#include "rpcsvc.h" void grace_time_handler (void *data) { - server_connection_t *conn = NULL; - xlator_t *this = NULL; - gf_boolean_t cancelled = _gf_false; - gf_boolean_t detached = _gf_false; + client_t *client = NULL; + xlator_t *this = NULL; + gf_timer_t *timer = NULL; + server_ctx_t *serv_ctx = NULL; + gf_boolean_t cancelled = _gf_false; + gf_boolean_t detached = _gf_false; - conn = data; - this = conn->this; + client = data; + this = client->this; - GF_VALIDATE_OR_GOTO (THIS->name, conn, out); GF_VALIDATE_OR_GOTO (THIS->name, this, out); - gf_log (this->name, GF_LOG_INFO, "grace timer expired for %s", conn->id); + gf_log (this->name, GF_LOG_INFO, "grace timer expired for %s", + client->client_uid); - cancelled = server_cancel_conn_timer (this, conn); + serv_ctx = server_ctx_get (client, this); + + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed"); + goto out; + } + + LOCK (&serv_ctx->fdtable_lock); + { + if (serv_ctx->grace_timer) { + timer = serv_ctx->grace_timer; + serv_ctx->grace_timer = NULL; + } + } + UNLOCK (&serv_ctx->fdtable_lock); + if (timer) { + gf_timer_call_cancel (this->ctx, timer); + cancelled = _gf_true; + } if (cancelled) { - //conn should not be destroyed in conn_put, so take a ref. - server_conn_ref (conn); - server_connection_put (this, conn, &detached); + + /* + * client must not be destroyed in gf_client_put(), + * so take a ref. + */ + gf_client_ref (client); + gf_client_put (client, &detached); if (detached)//reconnection did not happen :-( - server_connection_cleanup (this, conn, - INTERNAL_LOCKS | POSIX_LOCKS); - server_conn_unref (conn); + server_connection_cleanup (this, client, + INTERNAL_LOCKS | POSIX_LOCKS); + gf_client_unref (client); } out: return; @@ -107,8 +130,6 @@ ret: return iob; } - - int server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg, struct iovec *payload, int payloadcount, @@ -119,20 +140,24 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg, struct iovec rsp = {0,}; server_state_t *state = NULL; char new_iobref = 0; - server_connection_t *conn = NULL; + client_t *client = NULL; gf_boolean_t lk_heal = _gf_false; - glusterfs_fop_t fop = GF_FOP_NULL; + server_conf_t *conf = NULL; + gf_barrier_t *barrier = NULL; + gf_barrier_payload_t *stub = NULL; + gf_boolean_t barriered = _gf_false; GF_VALIDATE_OR_GOTO ("server", req, ret); if (frame) { state = CALL_STATE (frame); frame->local = NULL; - conn = SERVER_CONNECTION(frame); + client = frame->root->client; + conf = (server_conf_t *) client->this->private; } - if (conn) - lk_heal = ((server_conf_t *) conn->this->private)->lk_heal; + if (client) + lk_heal = ((server_conf_t *) client->this->private)->lk_heal; if (!iobref) { iobref = iobref_new (); @@ -151,6 +176,32 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg, iobref_add (iobref, iob); + if (conf) + barrier = conf->barrier; + if (barrier) { + /* todo: write's with fd flags set to O_SYNC and O_DIRECT */ + LOCK (&barrier->lock); + { + if (is_fop_barriered (barrier->fops, req->procnum) && + (barrier_add_to_queue (barrier))) { + stub = gf_barrier_payload (req, &rsp, frame, + payload, + payloadcount, iobref, + iob, new_iobref); + if (stub) { + gf_barrier_enqueue (barrier, stub); + barriered = _gf_true; + } else { + gf_log ("", GF_LOG_ERROR, "Failed to " + " barrier fop %"PRIu64, + ((uint64_t)1 << req->procnum)); + } + } + } + UNLOCK (&barrier->lock); + if (barriered == _gf_true) + goto out; + } /* Then, submit the message for transmission. */ ret = rpcsvc_submit_generic (req, &rsp, 1, payload, payloadcount, iobref); @@ -165,17 +216,9 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg, */ iobuf_unref (iob); if (ret == -1) { - if (frame && conn && !lk_heal) { - fop = frame->root->op; - if ((GF_FOP_NULL < fop) && - (fop < GF_FOP_MAXVALUE)) { - pthread_mutex_lock (&conn->lock); - { - conn->rsp_failure_fops[fop]++; - } - pthread_mutex_unlock (&conn->lock); - } - server_connection_cleanup (frame->this, conn, + gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed"); + if (frame && client && !lk_heal) { + server_connection_cleanup (frame->this, client, INTERNAL_LOCKS | POSIX_LOCKS); } else { gf_log_callingfn ("", GF_LOG_ERROR, @@ -193,185 +236,17 @@ ret: } if (frame) { - if (frame->root->trans) - server_conn_unref (frame->root->trans); + gf_client_unref (client); STACK_DESTROY (frame->root); } if (new_iobref) { iobref_unref (iobref); } - - return ret; -} - -/* */ -int -server_fd_to_dict (xlator_t *this, dict_t *dict) -{ - server_conf_t *conf = NULL; - server_connection_t *trav = NULL; - char key[GF_DUMP_MAX_BUF_LEN] = {0,}; - int count = 0; - int ret = -1; - - GF_VALIDATE_OR_GOTO (THIS->name, this, out); - GF_VALIDATE_OR_GOTO (this->name, dict, out); - - conf = this->private; - if (!conf) - return -1; - - ret = pthread_mutex_trylock (&conf->mutex); - if (ret) - return -1; - - list_for_each_entry (trav, &conf->conns, list) { - memset (key, 0, sizeof (key)); - snprintf (key, sizeof (key), "conn%d", count++); - fdtable_dump_to_dict (trav->fdtable, key, dict); - } - pthread_mutex_unlock (&conf->mutex); - - ret = dict_set_int32 (dict, "conncount", count); -out: - return ret; -} - -int -server_fd (xlator_t *this) -{ - server_conf_t *conf = NULL; - server_connection_t *trav = NULL; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 1; - int ret = -1; - gf_boolean_t section_added = _gf_false; - - GF_VALIDATE_OR_GOTO ("server", this, out); - - conf = this->private; - if (!conf) { - gf_log (this->name, GF_LOG_WARNING, - "conf null in xlator"); - return -1; - } - - gf_proc_dump_add_section("xlator.protocol.server.conn"); - section_added = _gf_true; - - ret = pthread_mutex_trylock (&conf->mutex); - if (ret) - goto out; - - list_for_each_entry (trav, &conf->conns, list) { - if (trav->id) { - gf_proc_dump_build_key(key, - "conn","%d.id", i); - gf_proc_dump_write(key, "%s", trav->id); - } - - gf_proc_dump_build_key(key,"conn","%d.ref",i) - gf_proc_dump_write(key, "%d", trav->ref); - if (trav->bound_xl) { - gf_proc_dump_build_key(key, - "conn","%d.bound_xl", i); - gf_proc_dump_write(key, "%s", trav->bound_xl->name); - } - - gf_proc_dump_build_key(key, - "conn","%d.id", i); - fdtable_dump(trav->fdtable,key); - i++; - } - pthread_mutex_unlock (&conf->mutex); - - ret = 0; out: - if (ret) { - if (section_added == _gf_false) - gf_proc_dump_add_section("xlator.protocol.server.conn"); - gf_proc_dump_write ("Unable to dump the list of connections", - "(Lock acquisition failed) %s", - this?this->name:"server"); - } return ret; } -void -ltable_dump (server_connection_t *trav) -{ - char key[GF_DUMP_MAX_BUF_LEN] = {0,}; - struct _locker *locker = NULL; - char locker_data[GF_MAX_LOCK_OWNER_LEN] = {0,}; - int count = 0; - - gf_proc_dump_build_key(key, - "conn","bound_xl.ltable.inodelk.%s", - trav->bound_xl?trav->bound_xl->name:""); - gf_proc_dump_add_section(key); - - list_for_each_entry (locker, &trav->ltable->inodelk_lockers, lockers) { - count++; - gf_proc_dump_write("volume", "%s", locker->volume); - if (locker->fd) { - gf_proc_dump_write("fd", "%p", locker->fd); - gf_proc_dump_write("gfid", "%s", - uuid_utoa (locker->fd->inode->gfid)); - } else { - gf_proc_dump_write("fd", "%s", locker->loc.path); - gf_proc_dump_write("gfid", "%s", - uuid_utoa (locker->loc.inode->gfid)); - } - gf_proc_dump_write("pid", "%d", locker->pid); - gf_proc_dump_write("lock length", "%d", locker->owner.len); - lkowner_unparse (&locker->owner, locker_data, - locker->owner.len); - gf_proc_dump_write("lock owner", "%s", locker_data); - memset (locker_data, 0, sizeof (locker_data)); - - gf_proc_dump_build_key (key, "inode", "%d", count); - gf_proc_dump_add_section (key); - if (locker->fd) - inode_dump (locker->fd->inode, key); - else - inode_dump (locker->loc.inode, key); - } - - count = 0; - locker = NULL; - gf_proc_dump_build_key(key, - "conn","bound_xl.ltable.entrylk.%s", - trav->bound_xl?trav->bound_xl->name:""); - gf_proc_dump_add_section(key); - - list_for_each_entry (locker, &trav->ltable->entrylk_lockers, - lockers) { - count++; - gf_proc_dump_write("volume", "%s", locker->volume); - if (locker->fd) { - gf_proc_dump_write("fd", "%p", locker->fd); - gf_proc_dump_write("gfid", "%s", - uuid_utoa (locker->fd->inode->gfid)); - } else { - gf_proc_dump_write("fd", "%s", locker->loc.path); - gf_proc_dump_write("gfid", "%s", - uuid_utoa (locker->loc.inode->gfid)); - } - gf_proc_dump_write("pid", "%d", locker->pid); - gf_proc_dump_write("lock length", "%d", locker->owner.len); - lkowner_unparse (&locker->owner, locker_data, locker->owner.len); - gf_proc_dump_write("lock data", "%s", locker_data); - memset (locker_data, 0, sizeof (locker_data)); - - gf_proc_dump_build_key (key, "inode", "%d", count); - gf_proc_dump_add_section (key); - if (locker->fd) - inode_dump (locker->fd->inode, key); - else - inode_dump (locker->loc.inode, key); - } -} int server_priv_to_dict (xlator_t *this, dict_t *dict) @@ -478,104 +353,6 @@ out: return ret; } -int -server_inode_to_dict (xlator_t *this, dict_t *dict) -{ - server_conf_t *conf = NULL; - server_connection_t *trav = NULL; - char key[32] = {0,}; - int count = 0; - int ret = -1; - xlator_t *prev_bound_xl = NULL; - - GF_VALIDATE_OR_GOTO (THIS->name, this, out); - GF_VALIDATE_OR_GOTO (this->name, dict, out); - - conf = this->private; - if (!conf) - return -1; - - ret = pthread_mutex_trylock (&conf->mutex); - if (ret) - return -1; - - list_for_each_entry (trav, &conf->conns, list) { - if (trav->bound_xl && trav->bound_xl->itable) { - /* Presently every brick contains only one - * bound_xl for all connections. This will lead - * to duplicating of the inode lists, if listing - * is done for every connection. This simple check - * prevents duplication in the present case. If - * need arises the check can be improved. - */ - if (trav->bound_xl == prev_bound_xl) - continue; - prev_bound_xl = trav->bound_xl; - - memset (key, 0, sizeof (key)); - snprintf (key, sizeof (key), "conn%d", count); - inode_table_dump_to_dict (trav->bound_xl->itable, - key, dict); - count++; - } - } - pthread_mutex_unlock (&conf->mutex); - - ret = dict_set_int32 (dict, "conncount", count); - -out: - if (prev_bound_xl) - prev_bound_xl = NULL; - return ret; -} - -int -server_inode (xlator_t *this) -{ - server_conf_t *conf = NULL; - server_connection_t *trav = NULL; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 1; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("server", this, out); - - conf = this->private; - if (!conf) { - gf_log (this->name, GF_LOG_WARNING, - "conf null in xlator"); - return -1; - } - - ret = pthread_mutex_trylock (&conf->mutex); - if (ret) - goto out; - - list_for_each_entry (trav, &conf->conns, list) { - ret = pthread_mutex_trylock (&trav->lock); - if (!ret) - { - gf_proc_dump_build_key(key, - "conn","%d.ltable", i); - gf_proc_dump_add_section(key); - ltable_dump (trav); - i++; - pthread_mutex_unlock (&trav->lock); - }else - continue; - } - pthread_mutex_unlock (&conf->mutex); - - ret = 0; -out: - if (ret) - gf_proc_dump_write ("Unable to dump the lock table", - "(Lock acquisition failed) %s", - this?this->name:"server"); - - return ret; -} - static int get_auth_types (dict_t *this, char *key, data_t *value, void *data) @@ -718,9 +495,10 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, { gf_boolean_t detached = _gf_false; xlator_t *this = NULL; - rpc_transport_t *xprt = NULL; - server_connection_t *conn = NULL; + rpc_transport_t *trans = NULL; server_conf_t *conf = NULL; + client_t *client = NULL; + server_ctx_t *serv_ctx = NULL; if (!xl || !data) { gf_log_callingfn ("server", GF_LOG_WARNING, @@ -729,7 +507,7 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, } this = xl; - xprt = data; + trans = data; conf = this->private; switch (event) { @@ -737,17 +515,17 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, { /* Have a structure per new connection */ /* TODO: Should we create anything here at all ? * / - conn = create_server_conn_state (this, xprt); - if (!conn) + client->conn = create_server_conn_state (this, trans); + if (!client->conn) goto out; - xprt->protocol_private = conn; + trans->protocol_private = client->conn; */ - INIT_LIST_HEAD (&xprt->list); + INIT_LIST_HEAD (&trans->list); pthread_mutex_lock (&conf->mutex); { - list_add_tail (&xprt->list, &conf->xprt_list); + list_add_tail (&trans->list, &conf->xprt_list); } pthread_mutex_unlock (&conf->mutex); @@ -760,51 +538,58 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, */ pthread_mutex_lock (&conf->mutex); { - list_del_init (&xprt->list); + list_del_init (&trans->list); } pthread_mutex_unlock (&conf->mutex); - conn = get_server_conn_state (this, xprt); - if (!conn) + client = trans->xl_private; + if (!client) break; gf_log (this->name, GF_LOG_INFO, "disconnecting connection" - " from %s, Number of pending operations: %"PRIu64, - conn->id, conn->ref); + "from %s", client->client_uid); /* If lock self heal is off, then destroy the conn object, else register a grace timer event */ if (!conf->lk_heal) { - server_conn_ref (conn); - server_connection_put (this, conn, &detached); + gf_client_ref (client); + gf_client_put (client, &detached); if (detached) - server_connection_cleanup (this, conn, - INTERNAL_LOCKS | - POSIX_LOCKS); - server_conn_unref (conn); - } else { - put_server_conn_state (this, xprt); - server_connection_cleanup (this, conn, INTERNAL_LOCKS); + server_connection_cleanup (this, client, + INTERNAL_LOCKS | POSIX_LOCKS); + gf_client_unref (client); + break; + } + trans->xl_private = NULL; + server_connection_cleanup (this, client, INTERNAL_LOCKS); - pthread_mutex_lock (&conn->lock); - { - if (conn->timer) - goto unlock; + serv_ctx = server_ctx_get (client, this); - gf_log (this->name, GF_LOG_INFO, "starting a grace " - "timer for %s", conn->id); + if (serv_ctx == NULL) { + gf_log (this->name, GF_LOG_INFO, + "server_ctx_get() failed"); + goto out; + } + + LOCK (&serv_ctx->fdtable_lock); + { + if (!serv_ctx->grace_timer) { + + gf_log (this->name, GF_LOG_INFO, + "starting a grace timer for %s", + client->client_uid); - conn->timer = gf_timer_call_after (this->ctx, - conf->grace_tv, - grace_time_handler, - conn); + serv_ctx->grace_timer = + gf_timer_call_after (this->ctx, + conf->grace_ts, + grace_time_handler, + client); } - unlock: - pthread_mutex_unlock (&conn->lock); } + UNLOCK (&serv_ctx->fdtable_lock); break; case RPCSVC_EVENT_TRANSPORT_DESTROY: - /*- conn obj has been disassociated from xprt on first + /*- conn obj has been disassociated from trans on first * disconnect. * conn cleanup and destruction is handed over to * grace_time_handler or the subsequent handler that 'owns' @@ -898,14 +683,14 @@ server_init_grace_timer (xlator_t *this, dict_t *options, ret = dict_get_int32 (options, "grace-timeout", &grace_timeout); if (!ret) - conf->grace_tv.tv_sec = grace_timeout; + conf->grace_ts.tv_sec = grace_timeout; else - conf->grace_tv.tv_sec = 10; + conf->grace_ts.tv_sec = 10; gf_log (this->name, GF_LOG_DEBUG, "Server grace timeout " - "value = %"PRIu64, conf->grace_tv.tv_sec); + "value = %"PRIu64, conf->grace_ts.tv_sec); - conf->grace_tv.tv_usec = 0; + conf->grace_ts.tv_nsec = 0; ret = 0; out: @@ -952,12 +737,6 @@ reconfigure (xlator_t *this, dict_t *options) } - /*ret = dict_get_str (options, "statedump-path", &statedump_path); - if (!ret) { - gf_path_strip_trailing_slashes (statedump_path); - GF_FREE (this->ctx->statedump_path); - this->ctx->statedump_path = gf_strdup (statedump_path); - }*/ GF_OPTION_RECONF ("statedump-path", statedump_path, options, path, out); if (!statedump_path) { @@ -996,6 +775,7 @@ reconfigure (xlator_t *this, dict_t *options) (void) rpcsvc_set_allow_insecure (rpc_conf, options); (void) rpcsvc_set_root_squash (rpc_conf, options); + (void) rpcsvc_set_outstanding_rpc_limit (rpc_conf, options); list_for_each_entry (listeners, &(rpc_conf->listeners), list) { if (listeners->trans != NULL) { if (listeners->trans->reconfigure ) @@ -1012,6 +792,26 @@ out: return ret; } +static int32_t +client_destroy_cbk (xlator_t *this, client_t *client) +{ + void *tmp = NULL; + server_ctx_t *ctx = NULL; + + client_ctx_del (client, this, &tmp); + + ctx = tmp; + + if (ctx == NULL) + return 0; + + gf_fd_fdtable_destroy (ctx->fdtable); + LOCK_DESTROY (&ctx->fdtable_lock); + GF_FREE (ctx); + + return 0; +} + int init (xlator_t *this) { @@ -1019,6 +819,8 @@ init (xlator_t *this) server_conf_t *conf = NULL; rpcsvc_listener_t *listener = NULL; char *statedump_path = NULL; + gf_barrier_t *barrier = NULL; + char *str = NULL; GF_VALIDATE_OR_GOTO ("init", this, out); if (this->children == NULL) { @@ -1038,7 +840,6 @@ init (xlator_t *this) GF_VALIDATE_OR_GOTO(this->name, conf, out); - INIT_LIST_HEAD (&conf->conns); INIT_LIST_HEAD (&conf->xprt_list); pthread_mutex_init (&conf->mutex, NULL); @@ -1160,6 +961,37 @@ init (xlator_t *this) } } #endif + /* barrier related */ + barrier = GF_CALLOC (1, sizeof (*barrier),1); + if (!barrier) { + gf_log (this->name, GF_LOG_WARNING, + "WARNING: Failed to allocate barrier"); + ret = -1; + goto out; + } + + LOCK_INIT (&barrier->lock); + INIT_LIST_HEAD (&barrier->queue); + barrier->on = _gf_false; + + GF_OPTION_INIT ("barrier-queue-length", barrier->max_size, + int64, out); + GF_OPTION_INIT ("barrier-timeout", barrier->time_out, + uint64, out); + + ret = dict_get_str (this->options, "barrier-fops", &str); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, + "setting barrier fops to default value"); + } + ret = gf_barrier_fops_configure (this, barrier, str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "invalid barrier fops specified"); + goto out; + } + + conf->barrier = barrier; this->private = conf; ret = 0; @@ -1213,27 +1045,65 @@ int notify (xlator_t *this, int32_t event, void *data, ...) { int ret = 0; + int32_t val = 0; + dict_t *dict = NULL; + dict_t *output = NULL; + va_list ap; + + dict = data; + va_start (ap, data); + output = va_arg (ap, dict_t*); + va_end (ap); + switch (event) { + case GF_EVENT_VOLUME_BARRIER_OP: + ret = dict_get_int32 (dict, "barrier", &val); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Wrong BARRIER event"); + goto out; + } + /* !val un-barrier, if val, barrier */ + if (val) { + ret = gf_barrier_start (this); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Barrier start failed"); + } else { + ret = gf_barrier_stop (this); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Barrier stop failed"); + } + ret = dict_set_int32 (output, "barrier-status", ret); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "Failed to set barrier-status in dict"); + break; + + /* todo: call default_notify to make other xlators handle it.*/ default: default_notify (this, event, data); break; } - +out: return ret; } struct xlator_fops fops; -struct xlator_cbks cbks; +struct xlator_cbks cbks = { + .client_destroy = client_destroy_cbk, +}; struct xlator_dumpops dumpops = { .priv = server_priv, - .fd = server_fd, - .inode = server_inode, + .fd = gf_client_dump_fdtables, + .inode = gf_client_dump_inodes, .priv_to_dict = server_priv_to_dict, - .fd_to_dict = server_fd_to_dict, - .inode_to_dict = server_inode_to_dict, + .fd_to_dict = gf_client_dump_fdtables_to_dict, + .inode_to_dict = gf_client_dump_inodes_to_dict, }; @@ -1322,6 +1192,23 @@ struct volume_options options[] = { "hostnames to connect to the server. By default, all" " connections are allowed." }, - + {.key = {"barrier-timeout"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "60", + .min = 0, + .max = 360, + .description = "Barrier timeout in seconds", + }, + {.key = {"barrier-queue-length"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "4096", + .min = 0, + .max = 16384, + .description = "Barrier queue length", + }, + {.key = {"barrier-fops"}, + .type = GF_OPTION_TYPE_STR, + .description = "Allow a comma seperated fop lists", + }, { .key = {NULL} }, }; diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h index 1da574934..782327d77 100644 --- a/xlators/protocol/server/src/server.h +++ b/xlators/protocol/server/src/server.h @@ -13,6 +13,7 @@ #include <pthread.h> +#include "fd.h" #include "rpcsvc.h" #include "fd.h" @@ -20,70 +21,47 @@ #include "server-mem-types.h" #include "glusterfs3.h" #include "timer.h" +#include "client_t.h" #define DEFAULT_BLOCK_SIZE 4194304 /* 4MB */ #define DEFAULT_VOLUME_FILE_PATH CONFDIR "/glusterfs.vol" #define GF_MAX_SOCKET_WINDOW_SIZE (1 * GF_UNIT_MB) #define GF_MIN_SOCKET_WINDOW_SIZE (0) -typedef enum { - INTERNAL_LOCKS = 1, - POSIX_LOCKS = 2, -} server_lock_flags_t; - -typedef struct _server_state server_state_t; - -struct _locker { - struct list_head lockers; - char *volume; - loc_t loc; - fd_t *fd; - gf_lkowner_t owner; - pid_t pid; +struct _gf_barrier_payload { + rpcsvc_request_t *req; + struct iovec rsp; + call_frame_t *frame; + struct iovec *payload; + struct iobref *iobref; + struct iobuf *iob; + int payload_count; + gf_boolean_t free_iobref; + struct list_head list; }; -struct _lock_table { - struct list_head inodelk_lockers; - struct list_head entrylk_lockers; +typedef struct _gf_barrier_payload gf_barrier_payload_t; + +struct _gf_barrier { + gf_lock_t lock; + gf_boolean_t on; + gf_boolean_t force; + size_t cur_size; + int64_t max_size; + uint64_t fops; + gf_timer_t *timer; + uint64_t time_out; + struct list_head queue; }; -/* private structure per connection (transport object) - * used as transport_t->xl_private - */ -struct _server_connection { - struct list_head list; - char *id; - uint64_t ref; - int bind_ref; - pthread_mutex_t lock; - fdtable_t *fdtable; - struct _lock_table *ltable; - gf_timer_t *timer; - xlator_t *bound_xl; - xlator_t *this; - uint32_t lk_version; - uint64_t rsp_failure_fops[GF_FOP_MAXVALUE]; -}; - -typedef struct _server_connection server_connection_t; - - -server_connection_t * -server_connection_get (xlator_t *this, const char *id); +typedef struct _gf_barrier gf_barrier_t; -server_connection_t * -server_connection_put (xlator_t *this, server_connection_t *conn, - gf_boolean_t *detached); - -server_connection_t* -server_conn_unref (server_connection_t *conn); - -server_connection_t* -server_conn_ref (server_connection_t *conn); +typedef enum { + INTERNAL_LOCKS = 1, + POSIX_LOCKS = 2, +} server_lock_flags_t; -int -server_connection_cleanup (xlator_t *this, server_connection_t *conn, - int32_t flags); +typedef struct _server_state server_state_t; int server_null (rpcsvc_request_t *req); @@ -103,11 +81,12 @@ struct server_conf { heal is on else off. */ char *conf_dir; struct _volfile_ctx *volfile; - struct timeval grace_tv; + struct timespec grace_ts; dict_t *auth_modules; pthread_mutex_t mutex; - struct list_head conns; + gf_barrier_t *barrier; struct list_head xprt_list; + pthread_t barrier_th; }; typedef struct server_conf server_conf_t; @@ -145,11 +124,10 @@ int resolve_and_resume (call_frame_t *frame, server_resume_fn_t fn); struct _server_state { - server_connection_t *conn; - rpc_transport_t *xprt; - inode_table_t *itable; + rpc_transport_t *xprt; + inode_table_t *itable; - server_resume_fn_t resume_fn; + server_resume_fn_t resume_fn; loc_t loc; loc_t loc2; @@ -185,7 +163,7 @@ struct _server_state { int mask; char is_revalidate; dict_t *dict; - struct gf_flock flock; + struct gf_flock flock; const char *volume; dir_entry_t *entry; @@ -193,10 +171,20 @@ struct _server_state { mode_t umask; }; + extern struct rpcsvc_program gluster_handshake_prog; extern struct rpcsvc_program glusterfs3_3_fop_prog; extern struct rpcsvc_program gluster_ping_prog; + +typedef struct _server_ctx { + gf_lock_t fdtable_lock; + fdtable_t *fdtable; + struct _gf_timer *grace_timer; + uint32_t lk_version; +} server_ctx_t; + + int server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg, struct iovec *payload, int payloadcount, @@ -205,6 +193,4 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg, int gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict); int gf_server_check_getxattr_cmd (call_frame_t *frame, const char *name); -void ltable_dump (server_connection_t *conn); - #endif /* !_SERVER_H */ diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am index e1316a127..c08e8e41b 100644 --- a/xlators/storage/Makefile.am +++ b/xlators/storage/Makefile.am @@ -1,6 +1,7 @@ SUBDIRS = posix if ENABLE_BD_XLATOR -SUBDIRS += bd_map +SUBDIRS += bd endif + CLEANFILES = diff --git a/xlators/storage/bd/Makefile.am b/xlators/storage/bd/Makefile.am new file mode 100644 index 000000000..a985f42a8 --- /dev/null +++ b/xlators/storage/bd/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/storage/bd_map/src/Makefile.am b/xlators/storage/bd/src/Makefile.am index 91412e91d..3d93f7442 100644 --- a/xlators/storage/bd_map/src/Makefile.am +++ b/xlators/storage/bd/src/Makefile.am @@ -1,14 +1,13 @@ - if ENABLE_BD_XLATOR -xlator_LTLIBRARIES = bd_map.la +xlator_LTLIBRARIES = bd.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage -bd_map_la_LDFLAGS = -module -avoid-version +bd_la_LDFLAGS = -module -avoid-version LIBBD = -llvm2app -lrt -bd_map_la_SOURCES = bd_map.c bd_map_help.c -bd_map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) +bd_la_SOURCES = bd.c bd-helper.c bd-aio.c +bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) $(LIBAIO) -noinst_HEADERS = bd_map.h bd_map_help.h +noinst_HEADERS = bd.h bd-aio.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/rpc/xdr/src \ diff --git a/xlators/storage/bd/src/bd-aio.c b/xlators/storage/bd/src/bd-aio.c new file mode 100644 index 000000000..62d4590f7 --- /dev/null +++ b/xlators/storage/bd/src/bd-aio.c @@ -0,0 +1,527 @@ +/* + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + Author: M. Mohan Kumar <mohan@in.ibm.com> + + Based on posix-aio.c + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <lvm2app.h> +#include <sys/uio.h> + +#include "xlator.h" +#include "glusterfs.h" +#include "defaults.h" +#include "bd.h" +#include "bd-aio.h" + +#ifdef HAVE_LIBAIO +#include <libaio.h> + +struct bd_aio_cb { + struct iocb iocb; + call_frame_t *frame; + struct iobuf *iobuf; + struct iobref *iobref; + struct iatt prebuf; + int op; + off_t offset; + fd_t *fd; +}; + +void +__bd_fd_set_odirect (fd_t *fd, bd_fd_t *bd_fd, int opflags, + off_t offset, size_t size) +{ + int odirect = 0; + int flags = 0; + int ret = 0; + + odirect = bd_fd->odirect; + + if ((fd->flags|opflags) & O_DIRECT) { + /* if instructed, use O_DIRECT always */ + odirect = 1; + } else { + /* else use O_DIRECT when feasible */ + if ((offset|size) & 0xfff) + odirect = 0; + else + odirect = 1; + } + + if (!odirect && bd_fd->odirect) { + flags = fcntl (bd_fd->fd, F_GETFL); + ret = fcntl (bd_fd->fd, F_SETFL, (flags & (~O_DIRECT))); + bd_fd->odirect = 0; + } + + if (odirect && !bd_fd->odirect) { + flags = fcntl (bd_fd->fd, F_GETFL); + ret = fcntl (bd_fd->fd, F_SETFL, (flags | O_DIRECT)); + bd_fd->odirect = 1; + } + + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, + "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d", + strerror (errno), bd_fd->fd, flags, bd_fd->odirect); + } +} + +int +bd_aio_readv_complete (struct bd_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iobuf *iobuf = NULL; + struct iatt postbuf = {0,}; + int op_ret = -1; + int op_errno = 0; + struct iovec iov; + struct iobref *iobref = NULL; + off_t offset = 0; + bd_attr_t *bdatt = NULL; + + frame = paiocb->frame; + this = frame->this; + iobuf = paiocb->iobuf; + offset = paiocb->offset; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "readv(async) failed fd=%p,size=%lu,offset=%llu (%d/%s)", + paiocb->fd, paiocb->iocb.u.c.nbytes, + (unsigned long long) paiocb->offset, + res, strerror (op_errno)); + goto out; + } + + bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt); + memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt)); + + op_ret = res; + op_errno = 0; + + iobref = iobref_new (); + if (!iobref) { + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + iobref_add (iobref, iobuf); + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = op_ret; + + /* Hack to notify higher layers of EOF. */ + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) + op_errno = ENOENT; + +out: + STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1, + &postbuf, iobref, NULL); + if (iobuf) + iobuf_unref (iobuf); + if (iobref) + iobref_unref (iobref); + + GF_FREE (paiocb); + + return 0; +} + +int +bd_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, + size_t size, off_t offset, uint32_t flags, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + struct iobuf *iobuf = NULL; + bd_fd_t *bd_fd = NULL; + int ret = -1; + struct bd_aio_cb *paiocb = NULL; + bd_priv_t *priv = NULL; + struct iocb *iocb = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readv, fd, size, offset, + flags, xdata); + return 0; + } + _fd = bd_fd->fd; + bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto err; + } + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto err; + } + + paiocb = CALLOC (1, sizeof (*paiocb)); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + paiocb->frame = frame; + paiocb->iobuf = iobuf; + paiocb->offset = offset; + paiocb->op = GF_FOP_READ; + paiocb->fd = fd; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.c.buf = iobuf_ptr (iobuf); + paiocb->iocb.u.c.nbytes = size; + paiocb->iocb.u.c.offset = offset; + + iocb = &paiocb->iocb; + + LOCK (&fd->lock); + { + __bd_fd_set_odirect (fd, bd_fd, flags, offset, size); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + if (iobuf) + iobuf_unref (iobuf); + + if (paiocb) + GF_FREE (paiocb); + + return 0; +} + +int +bd_aio_writev_complete (struct bd_aio_cb *paiocb, int res, int res2) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + struct iatt prebuf = {0,}; + struct iatt postbuf = {0,}; + int op_ret = -1; + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + frame = paiocb->frame; + prebuf = paiocb->prebuf; + this = frame->this; + + if (res < 0) { + op_ret = -1; + op_errno = -res; + gf_log (this->name, GF_LOG_ERROR, + "writev(async) failed fd=%p,offset=%llu (%d/%s)", + paiocb->fd, (unsigned long long) paiocb->offset, res, + strerror (op_errno)); + + goto out; + } + + bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt)); + + op_ret = res; + op_errno = 0; + +out: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf, + NULL); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + +int +bd_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *iov, int count, off_t offset, uint32_t flags, + struct iobref *iobref, dict_t *xdata) +{ + int32_t op_errno = EINVAL; + int _fd = -1; + bd_fd_t *bd_fd = NULL; + int ret = -1; + struct bd_aio_cb *paiocb = NULL; + bd_priv_t *priv = NULL; + struct iocb *iocb = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + priv = this->private; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_writev_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, iov, count, offset, flags, iobref, xdata); + return 0; + } + + bd_inode_ctx_get (fd->inode, this, &bdatt); + + _fd = bd_fd->fd; + + paiocb = CALLOC (1, sizeof (*paiocb)); + if (!paiocb) { + op_errno = ENOMEM; + goto err; + } + + + paiocb->frame = frame; + paiocb->offset = offset; + paiocb->op = GF_FOP_WRITE; + paiocb->fd = fd; + + paiocb->iocb.data = paiocb; + paiocb->iocb.aio_fildes = _fd; + paiocb->iobref = iobref_ref (iobref); + paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV; + paiocb->iocb.aio_reqprio = 0; + paiocb->iocb.u.v.vec = iov; + paiocb->iocb.u.v.nr = count; + paiocb->iocb.u.v.offset = offset; + + iocb = &paiocb->iocb; + + memcpy (&paiocb->prebuf, &bdatt->iatt, sizeof (struct iatt)); + LOCK (&fd->lock); + { + __bd_fd_set_odirect (fd, bd_fd, flags, offset, + iov_length (iov, count)); + + ret = io_submit (priv->ctxp, 1, &iocb); + } + UNLOCK (&fd->lock); + + if (ret != 1) { + gf_log (this->name, GF_LOG_ERROR, + "io_submit() returned %d", ret); + op_errno = -ret; + goto err; + } + + return 0; +err: + STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0); + + if (paiocb) { + if (paiocb->iobref) + iobref_unref (paiocb->iobref); + GF_FREE (paiocb); + } + + return 0; +} + +void * +bd_aio_thread (void *data) +{ + xlator_t *this = NULL; + bd_priv_t *priv = NULL; + int ret = 0; + int i = 0; + struct io_event *event = NULL; + struct bd_aio_cb *paiocb = NULL; + struct io_event events[BD_AIO_MAX_NR_GETEVENTS]; + struct timespec ts = {0, }; + + this = data; + THIS = this; + priv = this->private; + + ts.tv_sec = 5; + for (;;) { + memset (&events[0], 0, sizeof (events)); + ret = io_getevents (priv->ctxp, 1, BD_AIO_MAX_NR_GETEVENTS, + &events[0], &ts); + if (ret < 0) { + if (ret == -EINTR) + continue; + gf_log (this->name, GF_LOG_ERROR, + "io_getevents() returned %d, exiting", ret); + break; + } + + for (i = 0; i < ret; i++) { + event = &events[i]; + + paiocb = event->data; + + switch (paiocb->op) { + case GF_FOP_READ: + bd_aio_readv_complete (paiocb, event->res, + event->res2); + break; + case GF_FOP_WRITE: + bd_aio_writev_complete (paiocb, event->res, + event->res2); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown op %d found in piocb", + paiocb->op); + break; + } + } + } + + return NULL; +} + +int +bd_aio_init (xlator_t *this) +{ + bd_priv_t *priv = NULL; + int ret = 0; + + priv = this->private; + + ret = io_setup (BD_AIO_MAX_NR_EVENTS, &priv->ctxp); + if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) { + gf_log (this->name, GF_LOG_WARNING, + "Linux AIO not available at run-time." + " Continuing with synchronous IO"); + ret = 0; + goto out; + } + + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "io_setup() failed. ret=%d, errno=%d", + ret, errno); + goto out; + } + + ret = pthread_create (&priv->aiothread, NULL, + bd_aio_thread, this); + if (ret != 0) { + io_destroy (priv->ctxp); + goto out; + } + + this->fops->readv = bd_aio_readv; + this->fops->writev = bd_aio_writev; +out: + return ret; +} + + +int +bd_aio_on (xlator_t *this) +{ + bd_priv_t *priv = NULL; + int ret = 0; + + priv = this->private; + + if (!priv->aio_init_done) { + ret = bd_aio_init (this); + if (ret == 0) + priv->aio_capable = _gf_true; + else + priv->aio_capable = _gf_false; + priv->aio_init_done = _gf_true; + } + + if (priv->aio_capable) { + this->fops->readv = bd_aio_readv; + this->fops->writev = bd_aio_writev; + } + + return ret; +} + +int +bd_aio_off (xlator_t *this) +{ + this->fops->readv = bd_readv; + this->fops->writev = bd_writev; + + return 0; +} + +#else + +int +bd_aio_on (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +int +bd_aio_off (xlator_t *this) +{ + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return 0; +} + +void +__bd_fd_set_odirect (fd_t *fd, struct bd_fd *pfd, int opflags, + off_t offset, size_t size) +{ + xlator_t *this = THIS; + gf_log (this->name, GF_LOG_INFO, + "Linux AIO not available at build-time." + " Continuing with synchronous IO"); + return; +} +#endif diff --git a/xlators/storage/bd/src/bd-aio.h b/xlators/storage/bd/src/bd-aio.h new file mode 100644 index 000000000..16f686a4c --- /dev/null +++ b/xlators/storage/bd/src/bd-aio.h @@ -0,0 +1,41 @@ +/* + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _BD_AIO_H +#define _BD_AIO_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "glusterfs.h" + +/* + * Maximum number of concurrently submitted IO events. The heaviest load + * GlusterFS has been able to handle had 60-80 concurrent calls + */ +#define BD_AIO_MAX_NR_EVENTS 256 + +/* Maximum number of completed IO operations to reap per getevents syscall */ +#define BD_AIO_MAX_NR_GETEVENTS 16 + +int bd_aio_on (xlator_t *this); +int bd_aio_off (xlator_t *this); + +int bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); + +int bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata); + +#endif /* !_BD_AIO_H */ diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c new file mode 100644 index 000000000..5525e346b --- /dev/null +++ b/xlators/storage/bd/src/bd-helper.c @@ -0,0 +1,783 @@ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> +#ifdef HAVE_LIBAIO +#include <libaio.h> +#endif + +#include "bd.h" +#include "run.h" + +int +bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; +} + +int +bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + ret = inode_ctx_get (inode, this, &ctx_int); + if (ret) + return ret; + if (ctx) + *ctx = (bd_attr_t *) ctx_int; +out: + return ret; +} + +void +bd_local_free (xlator_t *this, bd_local_t *local) +{ + if (!local) + return; + if (local->fd) + fd_unref (local->fd); + else if (local->loc.path) + loc_wipe (&local->loc); + if (local->dict) + dict_unref (local->dict); + if (local->inode) + inode_unref (local->inode); + if (local->bdatt) { + GF_FREE (local->bdatt->type); + GF_FREE (local->bdatt); + } + mem_put (local); + local = NULL; +} + +bd_local_t * +bd_local_init (call_frame_t *frame, xlator_t *this) +{ + frame->local = mem_get0 (this->local_pool); + if (!frame->local) + return NULL; + + return frame->local; +} + +/* + * VG are set with the tag in GF_XATTR_VOL_ID_KEY:<uuid> format. + * This function validates this tag agains volume-uuid. Also goes + * through LV list to find out if a thin-pool is configured or not. + */ +int bd_scan_vg (xlator_t *this, bd_priv_t *priv) +{ + vg_t brick = NULL; + data_t *tmp_data = NULL; + struct dm_list *tags = NULL; + int op_ret = -1; + uuid_t dict_uuid = {0, }; + uuid_t vg_uuid = {0, }; + gf_boolean_t uuid = _gf_false; + lvm_str_list_t *strl = NULL; + struct dm_list *lv_dm_list = NULL; + lv_list_t *lv_list = NULL; + struct dm_list *dm_seglist = NULL; + lvseg_list_t *seglist = NULL; + lvm_property_value_t prop = {0, }; + gf_boolean_t thin = _gf_false; + const char *lv_name = NULL; + + brick = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!brick) { + gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found", + priv->vg); + return ENOENT; + } + + lv_dm_list = lvm_vg_list_lvs (brick); + if (!lv_dm_list) + goto check; + + dm_list_iterate_items (lv_list, lv_dm_list) { + dm_seglist = lvm_lv_list_lvsegs (lv_list->lv); + if (!dm_seglist) + continue; + dm_list_iterate_items (seglist, dm_seglist) { + prop = lvm_lvseg_get_property (seglist->lvseg, + "segtype"); + if (!prop.is_valid || !prop.value.string) + continue; + if (!strcmp (prop.value.string, "thin-pool")) { + thin = _gf_true; + lv_name = lvm_lv_get_name (lv_list->lv); + priv->pool = gf_strdup (lv_name); + gf_log (THIS->name, GF_LOG_INFO, "Thin Pool " + "\"%s\" will be used for thin LVs", + lv_name); + break; + } + } + } + +check: + /* If there is no volume-id set in dict, we cant validate */ + tmp_data = dict_get (this->options, "volume-id"); + if (!tmp_data) { + op_ret = 0; + goto out; + } + + op_ret = uuid_parse (tmp_data->data, dict_uuid); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in volume file", + tmp_data->data); + op_ret = -1; + goto out; + } + + tags = lvm_vg_get_tags (brick); + if (!tags) { /* no tags in the VG */ + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + op_ret = -1; + goto out; + } + dm_list_iterate_items (strl, tags) { + if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY, + strlen (GF_XATTR_VOL_ID_KEY))) { + uuid = _gf_true; + break; + } + } + /* UUID tag is not set in VG */ + if (!uuid) { + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + op_ret = -1; + goto out; + } + + op_ret = uuid_parse (strl->str + strlen (GF_XATTR_VOL_ID_KEY) + 1, + vg_uuid); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in VG", strl->str); + op_ret = -1; + goto out; + } + if (uuid_compare (dict_uuid, vg_uuid)) { + gf_log (this->name, GF_LOG_ERROR, + "mismatching volume-id (%s) received. " + "already is a part of volume %s ", + tmp_data->data, vg_uuid); + op_ret = -1; + goto out; + } + + op_ret = 0; + +out: + lvm_vg_close (brick); + + if (!thin) + gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in " + "VG %s\n", priv->vg); + else + priv->caps |= BD_CAPS_THIN; + + return op_ret; +} + +/* FIXME: Move this code to common place, so posix and bd xlator can use */ +char * +page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char); + if (!alloc_buf) + return NULL; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; + + return alloc_buf; +} + +static int +__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p) +{ + int ret = -1; + int _fd = -1; + char *devpath = NULL; + bd_fd_t *bdfd = NULL; + uint64_t tmp_bdfd = 0; + bd_priv_t *priv = this->private; + bd_gfid_t gfid = {0, }; + bd_attr_t *bdatt = NULL; + + /* not bd file */ + if (fd->inode->ia_type != IA_IFREG || + bd_inode_ctx_get (fd->inode, this, &bdatt)) + return 0; + + ret = __fd_ctx_get (fd, this, &tmp_bdfd); + if (ret == 0) { + bdfd = (void *)(long) tmp_bdfd; + *bdfd_p = bdfd; + return 0; + } + + uuid_utoa_r (fd->inode->gfid, gfid); + asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + if (!devpath) + goto out; + + _fd = open (devpath, O_RDWR | O_LARGEFILE, 0); + if (_fd < 0) { + ret = errno; + gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, + strerror (ret)); + goto out; + } + bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); + BD_VALIDATE_MEM_ALLOC (bdfd, ret, out); + + bdfd->fd = _fd; + bdfd->flag = O_RDWR | O_LARGEFILE; + if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context fd=%p", fd); + goto out; + } + + *bdfd_p = bdfd; + + ret = 0; +out: + FREE (devpath); + if (ret) { + close (_fd); + GF_FREE (bdfd); + } + return ret; +} + +int +bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd) +{ + int ret; + + /* FIXME: Is it ok to fd->lock here ? */ + LOCK (&fd->lock); + { + ret = __bd_fd_ctx_get (this, fd, bdfd); + } + UNLOCK (&fd->lock); + + return ret; +} + +/* + * Validates if LV exists for given inode or not. + * Returns 0 if LV exists and size also matches. + * If LV does not exist -1 returned + * If LV size mismatches, returnes 1 also lv_size is updated with actual + * size + */ +int +bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, + uint64_t *lv_size, uuid_t uuid) +{ + char *path = NULL; + int ret = -1; + bd_gfid_t gfid = {0, }; + bd_priv_t *priv = this->private; + struct stat stbuf = {0, }; + uint64_t size = 0; + vg_t vg = NULL; + lv_t lv = NULL; + char *bytes = NULL; + + bytes = strrchr (bd, ':'); + if (bytes) { + *bytes = '\0'; + bytes++; + gf_string2bytesize (bytes, &size); + } + + if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) { + gf_log (this->name, GF_LOG_WARNING, + "invalid xattr %s", bd); + return -1; + } + *type = gf_strdup (bd); + + /* + * Check if LV really exist, there could be a failure + * after setxattr and successful LV creation + */ + uuid_utoa_r (uuid, gfid); + gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid); + if (!path) { + gf_log (this->name, GF_LOG_WARNING, + "insufficient memory"); + return 0; + } + + /* Destination file does not exist */ + if (stat (path, &stbuf)) { + gf_log (this->name, GF_LOG_WARNING, + "lstat failed for path %s", path); + return -1; + } + + vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); + if (!vg) { + gf_log (this->name, GF_LOG_WARNING, + "VG %s does not exist?", priv->vg); + ret = -1; + goto out; + } + + lv = lvm_lv_from_name (vg, gfid); + if (!lv) { + gf_log (this->name, GF_LOG_WARNING, + "LV %s does not exist", gfid); + ret = -1; + goto out; + } + + *lv_size = lvm_lv_get_size (lv); + if (size == *lv_size) { + ret = 0; + goto out; + } + + ret = 1; + +out: + if (vg) + lvm_vg_close (vg); + + GF_FREE (path); + return ret; +} + +static int +create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent) +{ + int ret = -1; + runner_t runner = {0, }; + char *path = NULL; + struct stat stat = {0, }; + + runinit (&runner); + runner_add_args (&runner, LVM_CREATE, NULL); + runner_add_args (&runner, "--thin", NULL); + runner_argprintf (&runner, "%s/%s", vg, pool); + runner_add_args (&runner, "--name", NULL); + runner_argprintf (&runner, "%s", lv); + runner_add_args (&runner, "--virtualsize", NULL); + runner_argprintf (&runner, "%ldB", extent); + runner_start (&runner); + runner_end (&runner); + + gf_asprintf (&path, "/dev/%s/%s", vg, lv); + if (!path) { + ret = ENOMEM; + goto out; + } + if (lstat (path, &stat) < 0) + ret = EAGAIN; + else + ret = 0; +out: + GF_FREE (path); + return ret; +} + +int +bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv) +{ + int ret = 0; + vg_t vg = NULL; + bd_gfid_t gfid = {0, }; + + uuid_utoa_r (uuid, gfid); + + if (!strcmp (type, BD_THIN)) + return create_thin_lv (priv->vg, priv->pool, gfid, + size); + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return ENOENT; + } + + if (!lvm_vg_create_lv_linear (vg, gfid, size)) { + gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear " + "failed"); + ret = errno; + } + + lvm_vg_close (vg); + + return ret; +} + +int32_t +bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size) +{ + uint64_t new_size = 0; + runner_t runner = {0, }; + bd_gfid_t gfid = {0, }; + int ret = 0; + vg_t vg = NULL; + lv_t lv = NULL; + + uuid_utoa_r (uuid, gfid); + + runinit (&runner); + + runner_add_args (&runner, LVM_RESIZE, NULL); + runner_argprintf (&runner, "%s/%s", priv->vg, gfid); + runner_argprintf (&runner, "-L%ldb", size); + runner_add_args (&runner, "-f", NULL); + + runner_start (&runner); + runner_end (&runner); + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return EAGAIN; + } + + lv = lvm_lv_from_name (vg, gfid); + if (!lv) { + gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found", gfid); + ret = EIO; + goto out; + } + new_size = lvm_lv_get_size (lv); + + if (new_size != size) { + gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %ld does " + "not match requested size %ld", new_size, size); + ret = EIO; + } + +out: + lvm_vg_close (vg); + return ret; +} + +uint64_t +bd_get_default_extent (bd_priv_t *priv) +{ + vg_t vg = NULL; + uint64_t size = 0; + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return 0; + } + + size = lvm_vg_get_extent_size (vg); + + lvm_vg_close (vg); + + return size; +} + +/* + * Adjusts the user specified size to VG specific extent size + */ +uint64_t +bd_adjust_size (bd_priv_t *priv, uint64_t size) +{ + uint64_t extent = 0; + uint64_t nr_ex = 0; + + extent = bd_get_default_extent (priv); + if (!extent) + return 0; + + nr_ex = size / extent; + if (size % extent) + nr_ex++; + + size = extent * nr_ex; + + return size; +} + +int +bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno) +{ + vg_t vg = NULL; + lv_t lv = NULL; + int ret = -1; + + *op_errno = 0; + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + *op_errno = ENOENT; + return -1; + } + lv = lvm_lv_from_name (vg, lv_name); + if (!lv) { + gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name); + *op_errno = ENOENT; + goto out; + } + ret = lvm_vg_remove_lv (lv); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed", + lv_name); + *op_errno = errno; + goto out; + } +out: + lvm_vg_close (vg); + + return ret; +} + +inline void +bd_update_amtime(struct iatt *iatt, int flag) +{ + struct timespec ts = {0, }; + + clock_gettime (CLOCK_REALTIME, &ts); + if (flag & GF_SET_ATTR_ATIME) { + iatt->ia_atime = ts.tv_sec; + iatt->ia_atime_nsec = ts.tv_nsec; + } + if (flag & GF_SET_ATTR_MTIME) { + iatt->ia_mtime = ts.tv_sec; + iatt->ia_mtime_nsec = ts.tv_nsec; + } +} + +int +bd_snapshot_create (bd_local_t *local, bd_priv_t *priv) +{ + char *path = NULL; + bd_gfid_t dest = {0, }; + bd_gfid_t origin = {0, }; + int ret = 0; + runner_t runner = {0, }; + struct stat stat = {0, }; + + uuid_utoa_r (local->dloc->gfid, dest); + uuid_utoa_r (local->loc.gfid, origin); + + gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest); + if (!path) { + gf_log (THIS->name, GF_LOG_WARNING, + "Insufficient memory"); + return ENOMEM; + } + + runinit (&runner); + runner_add_args (&runner, LVM_CREATE, NULL); + runner_add_args (&runner, "--snapshot", NULL); + runner_argprintf (&runner, "/dev/%s/%s", priv->vg, origin); + runner_add_args (&runner, "--name", NULL); + runner_argprintf (&runner, "%s", dest); + if (strcmp (local->bdatt->type, BD_THIN)) + runner_argprintf (&runner, "-L%ldB", local->size); + runner_start (&runner); + runner_end (&runner); + + if (lstat (path, &stat) < 0) + ret = EIO; + + GF_FREE (path); + return ret; +} + +int +bd_clone (bd_local_t *local, bd_priv_t *priv) +{ + int ret = ENOMEM; + int fd1 = -1; + int fd2 = -1; + int i = 0; + char *buff = NULL; + ssize_t bytes = 0; + char *spath = NULL; + char *dpath = NULL; + struct iovec *vec = NULL; + bd_gfid_t source = {0, }; + bd_gfid_t dest = {0, }; + void *bufp[IOV_NR] = {0, }; + + vec = GF_CALLOC (IOV_NR, sizeof (struct iovec), gf_common_mt_iovec); + if (!vec) + return ENOMEM; + + for (i = 0; i < IOV_NR; i++) { + bufp[i] = page_aligned_alloc (IOV_SIZE, &buff); + if (!buff) + goto out; + vec[i].iov_base = buff; + vec[i].iov_len = IOV_SIZE; + } + + uuid_utoa_r (local->loc.gfid, source); + uuid_utoa_r (local->dloc->gfid, dest); + + gf_asprintf (&spath, "/dev/%s/%s", priv->vg, source); + gf_asprintf (&dpath, "/dev/%s/%s", priv->vg, dest); + if (!spath || !dpath) + goto out; + + ret = bd_create (local->dloc->gfid, local->size, + local->bdatt->type, priv); + if (ret) + goto out; + + fd1 = open (spath, O_RDONLY | O_DIRECT); + if (fd1 < 0) { + ret = errno; + goto out; + } + fd2 = open (dpath, O_WRONLY | O_DIRECT); + if (fd2 < 0) { + ret = errno; + goto out; + } + + while (1) { + bytes = readv (fd1, vec, IOV_NR); + if (bytes < 0) { + ret = errno; + gf_log (THIS->name, GF_LOG_WARNING, "read failed: %s", + strerror (ret)); + goto out; + } + if (!bytes) + break; + bytes = writev (fd2, vec, IOV_NR); + if (bytes < 0) { + ret = errno; + gf_log (THIS->name, GF_LOG_WARNING, + "write failed: %s", strerror (ret)); + goto out; + } + } + ret = 0; + +out: + for (i = 0; i < IOV_NR; i++) + GF_FREE (bufp[i]); + GF_FREE (vec); + + if (fd1 != -1) + close (fd1); + if (fd2 != -1) + close (fd2); + + FREE (spath); + FREE (dpath); + + return ret; +} + +/* + * Merges snapshot LV to origin LV and returns status + */ +int +bd_merge (bd_priv_t *priv, uuid_t gfid) +{ + bd_gfid_t dest = {0, }; + char *path = NULL; + struct stat stat = {0, }; + runner_t runner = {0, }; + int ret = 0; + + uuid_utoa_r (gfid, dest); + gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest); + + runinit (&runner); + runner_add_args (&runner, LVM_CONVERT, NULL); + runner_add_args (&runner, "--merge", NULL); + runner_argprintf (&runner, "%s", path); + runner_start (&runner); + runner_end (&runner); + + if (!lstat (path, &stat)) + ret = EIO; + + GF_FREE (path); + + return ret; +} + +int +bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict) +{ + vg_t brick = NULL; + lvm_property_value_t prop = {0, }; + lv_t lv = NULL; + int ret = -1; + bd_gfid_t gfid = {0, }; + inode_t *inode = NULL; + char *origin = NULL; + + brick = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!brick) { + gf_log (THIS->name, GF_LOG_CRITICAL, "VG %s is not found", + priv->vg); + return ENOENT; + } + + if (fd) + inode = fd->inode; + else + inode = loc->inode; + + uuid_utoa_r (inode->gfid, gfid); + lv = lvm_lv_from_name (brick, gfid); + if (!lv) { + gf_log (THIS->name, GF_LOG_CRITICAL, "LV %s not found", gfid); + ret = ENOENT; + goto out; + } + + prop = lvm_lv_get_property (lv, "origin"); + if (!prop.is_valid || !prop.value.string) { + ret = ENODATA; + goto out; + } + + origin = gf_strdup (prop.value.string); + ret = dict_set_dynstr (dict, BD_ORIGIN, origin); + +out: + lvm_vg_close (brick); + return ret; +} + diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c new file mode 100644 index 000000000..405474c58 --- /dev/null +++ b/xlators/storage/bd/src/bd.c @@ -0,0 +1,2404 @@ +/* + BD translator V2 - Exports Block devices on server side as regular + files to client + + Now only exporting Logical volumes supported. + + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + Author: + M. Mohan Kumar <mohan@in.ibm.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> +#include <openssl/md5.h> +#include <time.h> +#include <linux/fs.h> +#include <sys/ioctl.h> +#ifdef HAVE_LIBAIO +#include <libaio.h> +#endif + +#include "bd.h" +#include "bd-aio.h" +#include "defaults.h" +#include "glusterfs3-xdr.h" +#include "run.h" +#include "protocol-common.h" +#include "checksum.h" + +/* + * Call back function for setxattr and removexattr. + * does not do anything. FIXME: How to handle remove/setxattr failure + */ +int +bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + STACK_DESTROY (frame->root); + return 0; +} + +/* + * returns 0 if a file is mapped to BD or not. + */ +int +bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid, + char **type, uint64_t *size) +{ + char *bd_xattr = NULL; + char *bd = NULL; + int ret = -1; + loc_t loc = {0, }; + dict_t *dict = NULL; + char *p = NULL; + call_frame_t *bd_frame = NULL; + + if (!xattr) + return 1; + + if (dict_get_str (xattr, BD_XATTR, &p)) + return 1; + + bd_xattr = gf_strdup (p); + + memcpy (loc.gfid, gfid, sizeof (uuid_t)); + + bd_frame = copy_frame (frame); + BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out); + + ret = bd_validate_bd_xattr (this, bd_xattr, type, size, gfid); + if (ret < 0) {/* LV does not exist */ + STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->removexattr, &loc, + BD_XATTR, NULL); + + gf_log (this->name, GF_LOG_WARNING, + "Mapped LV not available for posix file <gfid:%s>, " + "deleting mapping", uuid_utoa (gfid)); + } else if (ret == 1) { + /* BD_XATTR size and LV size mismatch. Update BD_XATTR */ + gf_asprintf (&bd, "%s:%ld", *type, *size); + + dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (dict, ret, out); + + ret = dict_set_dynstr (dict, BD_XATTR, bd); + if (ret) + goto out; + + STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0, + NULL); + } + +out: + dict_del (xattr, BD_XATTR); + GF_FREE (bd_xattr); + GF_FREE (bd); + return ret; +} + +/* + * bd_lookup_cbk: Call back from posix_lookup. + */ +int32_t +bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + int ret = -1; + bd_attr_t *bdatt = NULL; + uint64_t size = 0; + char *type = BD_TYPE_NONE; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + /* iatt already cached */ + if (!bd_inode_ctx_get (inode, this, &bdatt)) + goto next; + + if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid, &type, &size)) + goto out; + + /* BD file, update buf */ + bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + if (!bdatt) { + op_errno = ENOMEM; + goto out; + } + memcpy (&bdatt->iatt, buf, sizeof (struct iatt)); + bdatt->type = type; + + /* Cache LV size in inode_ctx */ + ret = bd_inode_ctx_set (inode, this, bdatt); + if (ret < 0) { + GF_FREE (bdatt); + op_errno = EINVAL; + goto out; + } + + bdatt->iatt.ia_size = size; + bdatt->iatt.ia_blocks = size / 512; + +next: + dict_del (xattr, GF_CONTENT_KEY); + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, + xattr, postparent); + return 0; +} + +/* + * bd_lookup: Issues posix_lookup to find out if file is mapped to BD + * bd_lookup -> posix_lookup -> bd_lookup_cbk +*/ +int32_t +bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + dict_t *bd_xattr = NULL; + bd_attr_t *bdatt = NULL; + int op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (this->private, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) { + if (!xattr_req) { + bd_xattr = dict_new (); + BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out); + xattr_req = bd_xattr; + } + if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0) + goto out; + } + + STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, loc, xattr_req); + + if (bd_xattr) + dict_unref (bd_xattr); + return 0; +out: + BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; +} + +int +bd_forget (xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t ctx = 0; + bd_attr_t *bdatt = NULL; + + ret = bd_inode_ctx_get (inode, this, &bdatt); + if (!ret) { + inode_ctx_del (inode, this, &ctx); + FREE (bdatt); + } + return 0; +} + +int +bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + uint64_t size = 0; + char *type = NULL; + + if (op_ret < 0) + goto out; + + list_for_each_entry (entry, &entries->list, list) { + if (entry->d_type != DT_REG) + continue; + if (!bd_get_bd_info (frame, this, entry->dict, + entry->d_stat.ia_gfid, &type, &size)) { + entry->d_stat.ia_size = size; + entry->d_stat.ia_blocks = size / 512; + FREE (type); + } + } + +out: + BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +/* + * bd_readdirp: In bd_readdirp_cbk if the file and BD_XATTR_SIZE is set + * ia_size is updated with the LV(BD_XATTR_SIZE) size + */ +int32_t +bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + if (!dict) { + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + dict = local->dict; + } + + if (dict_set_int8 (dict, BD_XATTR, 0)) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set key %s", BD_XATTR); + goto out; + } + + STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict); + + return 0; +out: + BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict); + return 0; +} + +int +bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *buf, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + /* update buf with LV size */ + if (!bd_inode_ctx_get (local->inode, this, &bdatt)) + memcpy (buf, bdatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (this->private, out); + + if (!bd_inode_ctx_get (loc->inode, this, &bdatt)) { + BD_STACK_UNWIND (stat, frame, 0, 0, &bdatt->iatt, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + local->inode = inode_ref (loc->inode); + + STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +out: + BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata); + return 0; +} + +int +bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *buff, dict_t *xdata) +{ + uint64_t size = 0; + uint64_t fr_size = 0; + bd_priv_t *priv = NULL; + vg_t vg = NULL; + + if (op_ret < 0) + goto out; + + priv = this->private; + + vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); + if (!vg) { + gf_log (this->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + op_ret = -1; + op_errno = EAGAIN; + goto out; + } + size = lvm_vg_get_size (vg); + fr_size = lvm_vg_get_free_size (vg); + lvm_vg_close (vg); + + buff->f_blocks += size / buff->f_frsize; + buff->f_bfree += fr_size / buff->f_frsize; + buff->f_bavail += fr_size / buff->f_frsize; + +out: + BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata); + return 0; +} + +/* + * bd_statfs: Mimics statfs by returning used/free extents in the VG + */ +int +bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +out: + BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL); + return 0; +} + +int +bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *buf, dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + bd_local_t *local = frame->local; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + /* update buf with LV size */ + if (!bd_inode_ctx_get (local->inode, this, &bdatt)) + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + /* if its already cached return it */ + if (!bd_inode_ctx_get (fd->inode, this, &bdatt)) { + BD_STACK_UNWIND (fstat, frame, 0, 0, &bdatt->iatt, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (fd->inode); + + STACK_WIND (frame, bd_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + + return 0; +out: + BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata); + return 0; +} + +/* + * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD + * file + */ +int +bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int ret = -1; + int _fd = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + bd_fd_t *bd_fd = NULL; + struct iovec vec = {0, }; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + uint64_t bd_size = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; + } + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto out; + } + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto out; + } + _fd = bd_fd->fd; + op_ret = pread (_fd, iobuf->ptr, size, offset); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "read failed on fd=%p: %s", fd, + strerror (op_errno)); + goto out; + } + + vec.iov_base = iobuf->ptr; + vec.iov_len = op_ret; + + iobref = iobref_new (); + iobref_add (iobref, iobuf); + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + op_errno = EINVAL; + op_ret = -1; + goto out; + } + bd_size = bdatt->iatt.ia_size; + if (!bd_size || (offset + vec.iov_len) >= bd_size) + op_errno = ENOENT; + + op_ret = vec.iov_len; + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME); + +out: + BD_STACK_UNWIND (readv, frame, op_ret, op_errno, + &vec, 1, &bdatt->iatt, iobref, NULL); + + if (iobref) + iobref_unref (iobref); + if (iobuf) + iobuf_unref (iobuf); + + return 0; +} + +#ifdef BLKDISCARD +/* + * bd_discard: Sends BLKDISCARD ioctl to the block device + */ +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int ret = -1; + int op_errno = EINVAL; + bd_fd_t *bd_fd = NULL; + uint64_t param[2] = {0, }; + bd_attr_t *bdatt = NULL; + struct iatt prebuf = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + /* posix */ + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + STACK_WIND (frame, default_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, + fd, offset, len, xdata); + return 0; + } + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + op_errno = EINVAL; + goto out; + } + + param[0] = offset; + param[1] = len; + ret = ioctl (bd_fd->fd, BLKDISCARD, param); + if (ret < 0) { + if (errno == ENOTTY) + op_errno = ENOSYS; + else + op_errno = errno; + goto out; + } + memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + + BD_STACK_UNWIND (discard, frame, ret, op_errno, &prebuf, + &bdatt->iatt, xdata); + return 0; + +out: + BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} +#else + +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL); + return 0; +} +#endif + +/* + * Call back from posix_open for opening the backing posix file + * If it failed, close BD fd + */ +int +bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + bd_fd_t *bd_fd = NULL; + bd_attr_t *bdatt = NULL; + + if (!op_ret) + goto out; + + bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!bdatt) /* posix file */ + goto out; + + /* posix open failed */ + if (bd_fd_ctx_get (this, fd, &bd_fd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "bd_fd is NULL from fd=%p", fd); + goto out; + } + close (bd_fd->fd); + GF_FREE (bd_fd); + +out: + BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL); + + return 0; +} + +/* + * bd_open: Opens BD file if given posix file is mapped to BD. Also opens + * posix file. + * fd contains both posix and BD fd + */ +int32_t +bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + int32_t ret = EINVAL; + bd_fd_t *bd_fd = NULL; + bd_attr_t *bdatt = NULL; + bd_gfid_t gfid = {0, }; + char *devpath = NULL; + bd_priv_t *priv = this->private; + int _fd = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (fd, out); + + /* not bd file */ + if (fd->inode->ia_type != IA_IFREG || + bd_inode_ctx_get (fd->inode, this, &bdatt)) + goto posix; + + uuid_utoa_r (fd->inode->gfid, gfid); + asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + BD_VALIDATE_MEM_ALLOC (devpath, ret, out); + + _fd = open (devpath, flags | O_LARGEFILE, 0); + if (_fd < 0) { + ret = errno; + gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, + strerror (ret)); + goto out; + } + bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); + BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out); + + bd_fd->fd = _fd; + bd_fd->flag = flags | O_LARGEFILE; + + if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context fd=%p", fd); + goto out; + } + + ret = 0; + +posix: + + /* open posix equivalant of this file, fd needed for fd related + operations like fsetxattr, ftruncate etc */ + STACK_WIND (frame, bd_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + + return 0; +out: + BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL); + + FREE (devpath); + if (ret) { + close (_fd); + GF_FREE (bd_fd); + } + + return 0; +} + +/* + * call back from posix_setattr after updating iatt to posix file. + */ +int +bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = local->bdatt; + + BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &bdatt->iatt, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_do_fsync (int fd, int datasync) +{ + int op_errno = 0; + +#ifdef HAVE_FDATASYNC + if (datasync) { + if (fdatasync (fd)) { + op_errno = errno; + gf_log (THIS->name, GF_LOG_ERROR, + "fdatasync on fd=%d failed: %s", + fd, strerror (errno)); + } + + } else +#endif + { + if (fsync (fd)) { + op_errno = errno; + gf_log (THIS->name, GF_LOG_ERROR, + "fsync on fd=%d failed: %s", + fd, strerror (op_errno)); + } + } + + return op_errno; +} + +/* + * bd_fsync: Syncs if BD fd, forwards the request to posix + * fsync -> posix_setattr -> posix_fsync +*/ +int32_t +bd_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t datasync, dict_t *xdata) +{ + int ret = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + struct iatt prebuf = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd || !bdatt) { + STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, fd, datasync, + xdata); + return 0; + } + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + + op_errno = bd_do_fsync (bd_fd->fd, datasync); + if (op_errno) + goto out; + + /* For BD, Update the a|mtime during full fsync only */ + if (!datasync) { + local = bd_local_init (frame, this); + /* In case of mem failure, should posix flush called ? */ + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + local->bdatt->type = gf_strdup (bdatt->type); + memcpy (&local->bdatt->iatt, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&local->bdatt->iatt, valid); + uuid_copy (local->loc.gfid, fd->inode->gfid); + STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, &local->loc, + &local->bdatt->iatt, + valid, NULL); + return 0; + } + +out: + BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + return 0; +} + +int +bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = NULL; + bd_attr_t *bdatt = NULL; + int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + bd_local_t *local = NULL; + int op_errno = EINVAL; + loc_t loc = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!bdatt) + goto out; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd || !bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "bdfd/bdatt is NULL from fd=%p", fd); + goto out; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->fd = fd_ref (fd); + uuid_copy (loc.gfid, bdatt->iatt.ia_gfid); + + /* Update the a|mtime during flush */ + STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, &loc, &bdatt->iatt, + valid, NULL); + + return 0; + +out: + STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, fd, xdata); + + return 0; +} + +int32_t +bd_release (xlator_t *this, fd_t *fd) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + uint64_t tmp_bfd = 0; + bd_attr_t *bdatt = NULL; + bd_priv_t *priv = this->private; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (priv, out); + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + if (ret || !bdatt) /* posix file */ + goto out; + + /* FIXME: Update amtime during release */ + + ret = fd_ctx_del (fd, this, &tmp_bfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "bfd is NULL from fd=%p", fd); + goto out; + } + bd_fd = (bd_fd_t *)(long)tmp_bfd; + + close (bd_fd->fd); + GF_FREE (bd_fd); +out: + return 0; +} + +/* + * Call back for removexattr after removing BD_XATTR incase of + * bd create failure + */ +int +bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); + else + BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); + return 0; + +} + +/* + * Call back after setting BD_XATTR. Creates BD. If BD creation is a failure + * invokes posix_removexattr to remove created BD_XATTR + */ +int +bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto next; + + /* Create LV */ + op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size, + local->bdatt->type, this->private); + if (!op_errno) + goto out; + + /* LV creation failed, remove BD_XATTR */ + if (local->fd) + STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, + local->fd, BD_XATTR, NULL); + else + STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + &local->loc, BD_XATTR, NULL); + + return 0; +out: + + bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + if (!bdatt) { + op_ret = -1; + op_errno = ENOMEM; + goto next; + } + + memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt)); + bdatt->type = gf_strdup (local->bdatt->type); + + bd_inode_ctx_set (local->inode, THIS, bdatt); + +next: + if (local->fd) + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + else + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + return 0; + +} + +/* + * Call back from posix_stat + */ +int +bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *iatt, + dict_t *xdata) +{ + char *param = NULL; + char *type = NULL; + char *s_size = NULL; + char *p = NULL; + char *copy = NULL; + bd_local_t *local = frame->local; + bd_priv_t *priv = this->private; + char *bd = NULL; + uint64_t size = 0; + + if (op_ret < 0) + goto out; + + if (!IA_ISREG (iatt->ia_type)) { + op_errno = EOPNOTSUPP; + goto out; + } + + param = copy = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char); + BD_VALIDATE_MEM_ALLOC (param, op_errno, out); + + strncpy (param, local->data->data, local->data->len); + + type = strtok_r (param, ":", &p); + if (!type) { + op_errno = EINVAL; + goto out; + } + + if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) { + gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given", + type); + op_errno = EINVAL; + goto out; + } + + s_size = strtok_r (NULL, ":", &p); + + /* If size not specified get default size */ + if (!s_size) + size = bd_get_default_extent (priv); + else + gf_string2bytesize (s_size, &size); + + gf_asprintf (&bd, "%s:%ld", type, size); + BD_VALIDATE_MEM_ALLOC (bd, op_errno, out); + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) { + op_errno = EINVAL; + goto out; + } + + local->bdatt->type = gf_strdup (type); + memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt)); + local->bdatt->iatt.ia_size = size; + + if (local->fd) + STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata); + + GF_FREE (bd); + GF_FREE (copy); + return 0; +} + +int +bd_offload_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, EIO, NULL); + + return 0; +} + +int +bd_offload_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (op_ret < 0) + goto out; + + if (local->offload == BD_OF_SNAPSHOT) + op_ret = bd_snapshot_create (frame->local, this->private); + else + op_ret = bd_clone (frame->local, this->private); + + if (op_ret) { + STACK_WIND (frame, bd_offload_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + local->dloc, BD_XATTR, NULL); + return 0; + } + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, op_errno, op_errno, NULL); + + return 0; +} + +int +bd_offload_getx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + char *bd = NULL; + bd_local_t *local = frame->local; + char *type = NULL; + char *p = NULL; + + if (op_ret < 0) + goto out; + + if (dict_get_str (xattr, BD_XATTR, &p)) { + op_errno = EINVAL; + goto out; + } + + type = gf_strdup (p); + BD_VALIDATE_MEM_ALLOC (type, op_errno, out); + + p = strrchr (type, ':'); + if (!p) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, + "source file xattr %s corrupted?", type); + goto out; + } + + *p='\0'; + + /* For clone size is taken from source LV */ + if (!local->size) { + p++; + gf_string2bytesize (p, &local->size); + } + gf_asprintf (&bd, "%s:%ld", type, local->size); + local->bdatt->type = gf_strdup (type); + dict_del (local->dict, BD_XATTR); + dict_del (local->dict, LINKTO); + if (dict_set_dynstr (local->dict, BD_XATTR, bd)) { + op_errno = EINVAL; + goto out; + } + + STACK_WIND (frame, bd_offload_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + local->dloc, local->dict, 0, NULL); + + return 0; + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + GF_FREE (type); + GF_FREE (bd); + + return 0; +} + +int +bd_offload_dest_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *iatt, + dict_t *xattr, struct iatt *postparent) +{ + bd_local_t *local = frame->local; + char *bd = NULL; + int ret = -1; + char *linkto = NULL; + + if (op_ret < 0 && op_errno != ENODATA) { + op_errno = EINVAL; + goto out; + } + + if (!IA_ISREG (iatt->ia_type)) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "destination gfid is not a " + "regular file"); + goto out; + } + + ret = dict_get_str (xattr, LINKTO, &linkto); + if (linkto) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "destination file not " + "present in same brick"); + goto out; + } + + ret = dict_get_str (xattr, BD_XATTR, &bd); + if (bd) { + op_errno = EEXIST; + goto out; + } + + local->bdatt = CALLOC (1, sizeof (bd_attr_t)); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + STACK_WIND (frame, bd_offload_getx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + &local->loc, BD_XATTR, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + return 0; +} + +int +bd_merge_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + /* FIXME: if delete failed, remove xattr */ + + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL); + return 0; +} + +int +bd_do_merge(call_frame_t *frame, xlator_t *this) +{ + bd_local_t *local = frame->local; + inode_t *parent = NULL; + char *p = NULL; + int op_errno = 0; + + op_errno = bd_merge (this->private, local->inode->gfid); + if (op_errno) + goto out; + + /* + * posix_unlink needs loc->pargfid to be valid, but setxattr FOP does + * not have loc->pargfid set. Get parent's gfid by getting parents inode + */ + parent = inode_parent (local->inode, NULL, NULL); + if (!parent) { + /* + * FIXME: Snapshot LV already deleted. + * remove xattr, instead of returning failure + */ + op_errno = EINVAL; + goto out; + } + uuid_copy (local->loc.pargfid, parent->gfid); + + p = strrchr (local->loc.path, '/'); + if (p) + p++; + local->loc.name = p; + + STACK_WIND (frame, bd_merge_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + &local->loc, 0, NULL); + + return 0; +out: + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + + return op_errno; +} + +int +bd_offload (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, bd_offload_t offload) +{ + char *param = NULL; + char *param_copy = NULL; + char *p = NULL; + char *size = NULL; + char *gfid = NULL; + int op_errno = 0; + bd_local_t *local = frame->local; + + param = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char); + BD_VALIDATE_MEM_ALLOC (param, op_errno, out); + param_copy = param; + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + local->dloc = CALLOC (1, sizeof (loc_t)); + BD_VALIDATE_MEM_ALLOC (local->dloc, op_errno, out); + + strncpy (param, local->data->data, local->data->len); + + gfid = strtok_r (param, ":", &p); + size = strtok_r (NULL, ":", &p); + if (size) + gf_string2bytesize (size, &local->size); + else if (offload != BD_OF_CLONE) + local->size = bd_get_default_extent (this->private); + + if (dict_set_int8 (local->dict, BD_XATTR, 1) < 0) { + op_errno = EINVAL; + goto out; + } + if (dict_set_int8 (local->dict, LINKTO, 1) < 0) { + op_errno = EINVAL; + goto out; + } + + uuid_parse (gfid, local->dloc->gfid); + local->offload = offload; + + STACK_WIND (frame, bd_offload_dest_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, local->dloc, + local->dict); + + return 0; + +out: + if (fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + GF_FREE (param_copy); + return 0; +} + +/* + * bd_setxattr: Used to create & map an LV to a posix file using + * BD_XATTR xattr + * bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + */ +int +bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_errno = 0; + data_t *data = NULL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + bd_offload_t cl_type = BD_OF_NONE; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + + if ((data = dict_get (dict, BD_XATTR))) + cl_type = BD_OF_NONE; + else if ((data = dict_get (dict, BD_CLONE))) + cl_type = BD_OF_CLONE; + else if ((data = dict_get (dict, BD_SNAPSHOT))) + cl_type = BD_OF_SNAPSHOT; + else if ((data = dict_get (dict, BD_MERGE))) + cl_type = BD_OF_MERGE; + + bd_inode_ctx_get (loc->inode, this, &bdatt); + if (!cl_type && !data) { + STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, loc, dict, + flags, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->data = data; + loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + if (cl_type) { + /* For cloning/snapshot, source file must be mapped to LV */ + if (!bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "%s not mapped to BD", loc->path); + op_errno = EINVAL; + goto out; + } + if (cl_type == BD_OF_MERGE) + bd_do_merge (frame, this); + else + bd_offload (frame, this, loc, NULL, cl_type); + } else if (data) { + if (bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "%s already mapped to BD", loc->path); + op_errno = EEXIST; + goto out; + } + STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->stat, loc, xdata); + } + + return 0; +out: + if (op_errno) + STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata); + + return 0; +} + +/* + * bd_fsetxattr: Used to create/map an LV to a posix file using + * BD_XATTR xattr + * bd_fsetxattr -> posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + * -> bd_fsetxattr_cbk + */ +int32_t +bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_errno = 0; + data_t *data = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + bd_offload_t cl_type = BD_OF_NONE; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + bd_inode_ctx_get (fd->inode, this, &bdatt); + + data = dict_get (dict, BD_XATTR); + if ((data = dict_get (dict, BD_XATTR))) + cl_type = BD_OF_NONE; + else if ((data = dict_get (dict, BD_CLONE))) + cl_type = BD_OF_CLONE; + else if ((data = dict_get (dict, BD_SNAPSHOT))) + cl_type = BD_OF_SNAPSHOT; + else if ((data = dict_get (dict, BD_MERGE))) { + /* + * bd_merge is not supported for fsetxattr, because snapshot LV + * is opened and it causes problem in snapshot merge + */ + op_errno = EOPNOTSUPP; + goto out; + } + + bd_inode_ctx_get (fd->inode, this, &bdatt); + + if (!cl_type && !data) { + /* non bd file object */ + STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (fd->inode); + local->fd = fd_ref (fd); + local->data = data; + + if (cl_type) { + /* For cloning/snapshot, source file must be mapped to LV */ + if (!bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "fd %p not mapped to BD", fd); + op_errno = EINVAL; + goto out; + + } + bd_offload (frame, this, NULL, fd, cl_type); + } else if (data) { + if (bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "fd %p already mapped to BD", fd); + op_errno = EEXIST; + goto out; + } + STACK_WIND(frame, bd_setx_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + } + + return 0; +out: + + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + return 0; +} + +int32_t +bd_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + if (!strcmp (name, BD_XATTR)) + goto out; + + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +out: + BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL); + return 0; +} + +int32_t +bd_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + if (!strcmp (name, BD_XATTR)) + goto out; + + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + + return 0; +out: + BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL); + return 0; +} + +int +bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + + return 0; +} + +/* + * Call back for setxattr after setting BD_XATTR_SIZE. + */ +int +bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + struct iatt prebuf = {0, }; + char *bd = NULL; + + if (op_ret < 0) + goto out; + + bd_inode_ctx_get (local->inode, this, &bdatt); + if (!bdatt) + goto revert_xattr; + + op_errno = bd_resize (this->private, local->inode->gfid, + local->bdatt->iatt.ia_size); + if (op_errno) + goto revert_xattr; + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + /* LV resized, update new size in the cache */ + bdatt->iatt.ia_size = local->bdatt->iatt.ia_size; + + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt, + NULL); + else + BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt, + NULL); + + return 0; + +revert_xattr: + /* revert setxattr */ + op_ret = dict_get_str (local->dict, BD_XATTR, &bd); + GF_FREE (bd); + gf_asprintf (&bd, "%s:%ld", bdatt->type, bdatt->iatt.ia_size); + + if (local->fd) + STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + + return 0; +} + +/* + * call back from posix_[f]truncate_stat + * If offset > LV size, it resizes the LV and calls posix_setxattr + * to update new LV size in xattr else calls posix_setattr for updating + * the posix file so that truncate fop behaves properly + */ +int +bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, dict_t *xdata) +{ + char *bd = NULL; + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto out; + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + bd_inode_ctx_get (local->inode, this, &bdatt); + if (!bdatt) { + op_errno = EINVAL; + goto out; + } + + gf_asprintf (&bd, "%s:%ld", bdatt->type, local->bdatt->iatt.ia_size); + if (dict_set_dynstr (local->dict, BD_XATTR, bd)) { + op_errno = EINVAL; + goto out; + } + + if (local->fd) + STACK_WIND (frame, bd_trunc_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_trunc_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, + NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, + NULL); + GF_FREE (bd); + return 0; +} + +void +bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc, + off_t offset, bd_attr_t *bdatt) +{ + bd_local_t *local = NULL; + struct iatt prebuf = {0, }; + int op_errno = 0; + int op_ret = -1; + + /* If requested size is less than LV size, return success */ + if (offset <= bdatt->iatt.ia_size) { + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + op_ret = 0; + goto out; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + if (fd) { + local->inode = inode_ref (fd->inode); + local->fd = fd_ref (fd); + } else { + local->inode = inode_ref (loc->inode); + loc_copy (&local->loc, loc); + } + + local->bdatt->iatt.ia_size = + bd_adjust_size (this->private, offset); + + STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, NULL); + + return; + +out: + if (fd) + BD_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, + &prebuf, &bdatt->iatt, NULL); + else + BD_STACK_UNWIND (truncate, frame, op_ret, op_errno, + &prebuf, &bdatt->iatt, NULL); + return; +} + +/* + * bd_ftruncate: Resizes a LV if fd belongs to BD. + */ +int32_t +bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + STACK_WIND (frame, default_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, + offset, xdata); + return 0; + } + + bd_do_trunc (frame, this, fd, NULL, offset, bdatt); + return 0; +out: + BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +/* + * bd_truncate: Resizes a LV if file maps to LV. + */ +int32_t +bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND (frame, default_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, + offset, xdata); + return 0; + } + + bd_do_trunc (frame, this, NULL, loc, offset, bdatt); + return 0; + +out: + BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset, + uint64_t bd_size) +{ + int index = 0; + int retval = 0; + off_t internal_offset = 0; + + if (!vector) + return -EFAULT; + + retval = pwritev (fd, vector, count, offset); + if (retval == -1) { + gf_log (THIS->name, GF_LOG_WARNING, + "base %p, length %ld, offset %ld, message %s", + vector[index].iov_base, vector[index].iov_len, + internal_offset, strerror (errno)); + retval = -errno; + goto err; + } +/* + + + internal_offset = offset; + for (index = 0; index < count; index++) { + if (internal_offset > bd_size) { + op_ret = -ENOSPC; + goto err; + } + if (internal_offset + vector[index].iov_len > bd_size) { + vector[index].iov_len = bd_size - internal_offset; + no_space = 1; + } + retval = pwritev (fd, vector[index].iov_base, + vector[index].iov_len, internal_offset); + if (retval == -1) { + gf_log (THIS->name, GF_LOG_WARNING, + "base %p, length %ld, offset %ld, message %s", + vector[index].iov_base, vector[index].iov_len, + internal_offset, strerror (errno)); + op_ret = -errno; + goto err; + } + op_ret += retval; + internal_offset += retval; + if (no_space) + break; + } +*/ +err: + return retval; +} + +/* + * bd_writev: Writes to LV if its BD file or forwards the request to posix_write + * bd_writev -> posix_writev -> bd_writev_cbk + */ +int +bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdict) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + bd_fd_t *bd_fd = NULL; + int ret = -1; + uint64_t size = 0; + struct iatt prebuf = {0, }; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (vector, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { /* posix fd */ + STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdict); + return 0; + } + + _fd = bd_fd->fd; + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + size = bdatt->iatt.ia_size; + + op_ret = __bd_pwritev (_fd, vector, count, offset, size); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64 + ", %s", offset, strerror (op_errno)); + goto out; + } + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); +out: + + BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + int *valid = cookie; + bd_local_t *local = frame->local; + + if (op_ret < 0 || !valid || !local) + goto out; + + if (bd_inode_ctx_get (local->inode, this, &bdatt)) + goto out; + + if (*valid & GF_SET_ATTR_UID) + bdatt->iatt.ia_uid = postbuf->ia_uid; + else if (*valid & GF_SET_ATTR_GID) + bdatt->iatt.ia_gid = postbuf->ia_gid; + else if (*valid & GF_SET_ATTR_MODE) { + bdatt->iatt.ia_type = postbuf->ia_type; + bdatt->iatt.ia_prot = postbuf->ia_prot; + } else if (*valid & GF_SET_ATTR_ATIME) { + bdatt->iatt.ia_atime = postbuf->ia_atime; + bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec; + } else if (*valid & GF_SET_ATTR_MTIME) { + bdatt->iatt.ia_mtime = postbuf->ia_mtime; + bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec; + } + + bdatt->iatt.ia_ctime = postbuf->ia_ctime; + bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec; + + memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt)); +out: + FREE (valid); + BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + +int +bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + int *ck_valid = NULL; + int op_errno = 0; + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + ck_valid = CALLOC (1, sizeof (valid)); + BD_VALIDATE_MEM_ALLOC (ck_valid, op_errno, out); + + local->inode = inode_ref (loc->inode); + *ck_valid = valid; + + STACK_WIND_COOKIE (frame, bd_setattr_cbk, ck_valid, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + + return 0; +out: + BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata); + return 0; +} + +int +bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto out; + + if (bd_inode_ctx_get (inode, this, &bdatt)) + goto out; + + bdatt->iatt.ia_ctime = buf->ia_ctime; + bdatt->iatt.ia_ctime_nsec = buf->ia_ctime_nsec; + bdatt->iatt.ia_nlink = buf->ia_nlink; + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, NULL); + return 0; +} + +int +bd_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +} + +int +bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, const char *name, dict_t *xdata) +{ + dict_t *xattr = NULL; + int op_ret = -1; + int op_errno = ENOMEM;; + bd_priv_t *priv = this->private; + + xattr = dict_new (); + if (!xattr) + goto out; + + if (!strcmp (name, VOL_TYPE)) + op_ret = dict_set_int64 (xattr, (char *)name, 1); + else if (!strcmp (name, VOL_CAPS)) + op_ret = dict_set_int64 (xattr, (char *)name, priv->caps); + else + op_ret = bd_get_origin (this->private, loc, fd, xattr); + +out: + if (loc) + BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, + xdata); + else + BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + + op_ret = dict_reset (xattr); + dict_unref (xattr); + + return 0; +} + +int +bd_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS) + || !strcmp (name, BD_ORIGIN))) + bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata); + else + STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + fd, name, xdata); + return 0; +} + +int +bd_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS) + || !strcmp (name, BD_ORIGIN))) + bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata); + else + STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + + return 0; +} + +int +bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + bd_gfid_t gfid = {0, }; + bd_local_t *local = frame->local; + + if (buf->ia_nlink > 1) + goto posix; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + uuid_utoa_r (inode->gfid, gfid); + if (bd_delete_lv (this->private, gfid, &op_errno) < 0) { + if (op_errno != ENOENT) + goto out; + } + +posix: + /* remove posix */ + STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + &local->loc, 0, NULL); + + return 0; +out: + BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +bd_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflag, dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, NULL); + return 0; +out: + BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +bd_priv (xlator_t *this) +{ + return 0; +} + +int32_t +bd_inode (xlator_t *this) +{ + return 0; +} + +int32_t +bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) +{ + int op_ret = -1; + int op_errno = 0; + int ret = 0; + int _fd = -1; + char *alloc_buf = NULL; + char *buf = NULL; + int32_t weak_checksum = 0; + bd_fd_t *bd_fd = NULL; + unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rchecksum, fd, offset, + len, xdata); + return 0; + } + + memset (strong_checksum, 0, MD5_DIGEST_LENGTH); + + alloc_buf = page_aligned_alloc (len, &buf); + if (!alloc_buf) { + op_errno = ENOMEM; + goto out; + } + + _fd = bd_fd->fd; + + LOCK (&fd->lock); + { + ret = pread (_fd, buf, len, offset); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pread of %d bytes returned %d (%s)", + len, ret, strerror (errno)); + op_errno = errno; + } + } + UNLOCK (&fd->lock); + + if (ret < 0) + goto out; + + weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, + (size_t) len); + gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, + (unsigned char *) strong_checksum); + + op_ret = 0; +out: + BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno, + weak_checksum, strong_checksum, NULL); + + GF_FREE (alloc_buf); + + return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) +{ + switch (event) + { + case GF_EVENT_PARENT_UP: + { + /* Tell the parent that bd xlator is up */ + default_notify (this, GF_EVENT_CHILD_UP, data); + } + break; + default: + break; + } + return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1); + + if (ret != 0) + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; +} + +int +reconfigure (xlator_t *this, dict_t *options) +{ + int ret = -1; + bd_priv_t *priv = this->private; + + GF_OPTION_RECONF ("bd-aio", priv->aio_configured, options, + bool, out); + + if (priv->aio_configured) + bd_aio_on (this); + else + bd_aio_off (this); + + ret = 0; +out: + return ret; +} + +/** + * bd xlator init - Validate configured VG + */ +int +init (xlator_t *this) +{ + int ret = 0; + char *vg_data = NULL; + char *device = NULL; + bd_priv_t *_private = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: storage/bd needs posix as subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Volume is dangling. Please check the volume file."); + } + + GF_OPTION_INIT ("export", vg_data, str, error); + GF_OPTION_INIT ("device", device, str, error); + + /* Now we support only LV device */ + if (strcasecmp (device, BACKEND_VG)) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: unknown %s backend %s", BD_XLATOR, device); + return -1; + } + + this->local_pool = mem_pool_new (bd_local_t, 64); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: Failed to create bd memory pool"); + return -1; + } + + ret = 0; + _private = GF_CALLOC (1, sizeof (*_private), gf_bd_private); + if (!_private) + goto error; + + this->private = _private; + _private->vg = gf_strdup (vg_data); + if (!_private->vg) + goto error; + + _private->handle = lvm_init (NULL); + if (!_private->handle) { + gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed"); + goto error; + } + _private->caps = BD_CAPS_BD; + if (bd_scan_vg (this, _private)) + goto error; + + _private->aio_init_done = _gf_false; + _private->aio_capable = _gf_false; + + GF_OPTION_INIT ("bd-aio", _private->aio_configured, bool, error); + if (_private->aio_configured) { + if (bd_aio_on (this)) { + gf_log (this->name, GF_LOG_ERROR, + "BD AIO init failed"); + ret = -1; + goto error; + } + } + + _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT; + + return 0; +error: + GF_FREE (_private->vg); + if (_private->handle) + lvm_quit (_private->handle); + mem_pool_destroy (this->local_pool); + GF_FREE (_private); + return -1; +} + +void +fini (xlator_t *this) +{ + bd_priv_t *priv = this->private; + mem_pool_destroy (this->local_pool); + this->local_pool = NULL; + if (!priv) + return; + lvm_quit (priv->handle); + GF_FREE (priv->vg); + this->private = NULL; + GF_FREE (priv); + return; +} + +struct xlator_dumpops dumpops = { + .priv = bd_priv, + .inode = bd_inode, +}; + +struct xlator_fops fops = { + .readdirp = bd_readdirp, + .lookup = bd_lookup, + .stat = bd_stat, + .statfs = bd_statfs, + .open = bd_open, + .fstat = bd_fstat, + .rchecksum = bd_rchecksum, + .readv = bd_readv, + .fsync = bd_fsync, + .setxattr = bd_setxattr, + .fsetxattr = bd_fsetxattr, + .removexattr = bd_removexattr, + .fremovexattr=bd_fremovexattr, + .truncate = bd_truncate, + .ftruncate = bd_ftruncate, + .writev = bd_writev, + .getxattr = bd_getxattr, + .fgetxattr = bd_fgetxattr, + .unlink = bd_unlink, + .link = bd_link, + .flush = bd_flush, + .setattr = bd_setattr, + .discard = bd_discard, +}; + +struct xlator_cbks cbks = { + .release = bd_release, + .forget = bd_forget, +}; + +struct volume_options options[] = { + { .key = {"export"}, + .type = GF_OPTION_TYPE_STR}, + { .key = {"device"}, + .type = GF_OPTION_TYPE_STR, + .default_value = BACKEND_VG}, + { + .key = {"bd-aio"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "Support for native Linux AIO" + }, + + { .key = {NULL} } +}; diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h new file mode 100644 index 000000000..34b4c9e22 --- /dev/null +++ b/xlators/storage/bd/src/bd.h @@ -0,0 +1,178 @@ +/* + BD translator - Exports Block devices on server side as regular + files to client + + Copyright IBM, Corp. 2012 + + This file is part of GlusterFS. + + Author: + M. Mohan Kumar <mohan@in.ibm.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _BD_H +#define _BD_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_LIBAIO +#include <libaio.h> +#endif + +#include "xlator.h" +#include "mem-types.h" + +#define BD_XLATOR "block device mapper xlator" +#define BACKEND_VG "vg" +#define GF_XATTR "user.glusterfs" +#define BD_XATTR GF_XATTR ".bd" + +#define BD_LV "lv" +#define BD_THIN "thin" + +#define LVM_RESIZE "/sbin/lvresize" +#define LVM_CREATE "/sbin/lvcreate" +#define LVM_CONVERT "/sbin/lvconvert" + +#define VOL_TYPE "volume.type" +#define VOL_CAPS "volume.caps" + +#define ALIGN_SIZE 4096 + +#define BD_CAPS_BD 0x01 +#define BD_CAPS_THIN 0x02 +#define BD_CAPS_OFFLOAD_COPY 0x04 +#define BD_CAPS_OFFLOAD_SNAPSHOT 0x08 + +#define BD_CLONE "clone" +#define BD_SNAPSHOT "snapshot" +#define BD_MERGE "merge" +#define BD_ORIGIN "list-origin" + +#define IOV_NR 4 +#define IOV_SIZE (64 * 1024) + +#define ALIGN_SIZE 4096 + +#define LINKTO "trusted.glusterfs.dht.linkto" + +#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \ + if (!buff) { \ + op_errno = ENOMEM; \ + gf_log (this->name, GF_LOG_ERROR, "out of memory"); \ + goto label; \ + } + +#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label) \ + if (!local) { \ + op_errno = EINVAL; \ + goto label; \ + } + +#define BD_STACK_UNWIND(typ, frame, args ...) do { \ + bd_local_t *__local = frame->local; \ + xlator_t *__this = frame->this; \ + \ + frame->local = NULL; \ + STACK_UNWIND_STRICT (typ, frame, args); \ + if (__local) \ + bd_local_free (__this, __local); \ + } while (0) + +typedef char bd_gfid_t[GF_UUID_BUF_SIZE]; + +enum gf_bd_mem_types_ { + gf_bd_private = gf_common_mt_end + 1, + gf_bd_attr, + gf_bd_fd, + gf_bd_mt_end +}; + +/** + * bd_fd - internal structure + */ +typedef struct bd_fd { + int fd; + int32_t flag; + int odirect; +} bd_fd_t; + +typedef struct bd_priv { + lvm_t handle; + char *vg; + char *pool; + int caps; + gf_boolean_t aio_init_done; + gf_boolean_t aio_capable; + gf_boolean_t aio_configured; +#ifdef HAVE_LIBAIO + io_context_t ctxp; + pthread_t aiothread; +#endif +} bd_priv_t; + + +typedef enum bd_type { + BD_TYPE_NONE, + BD_TYPE_LV, +} bd_type_t; + +typedef struct { + struct iatt iatt; + char *type; +} bd_attr_t; + +typedef enum { + BD_OF_NONE, + BD_OF_CLONE, + BD_OF_SNAPSHOT, + BD_OF_MERGE, +} bd_offload_t; + +typedef struct { + dict_t *dict; + bd_attr_t *bdatt; + inode_t *inode; + loc_t loc; + fd_t *fd; + data_t *data; /* for setxattr */ + bd_offload_t offload; + uint64_t size; + loc_t *dloc; +} bd_local_t; + +/* Prototypes */ +int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx); +int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx); +int bd_scan_vg (xlator_t *this, bd_priv_t *priv); +bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this); +void bd_local_free (xlator_t *this, bd_local_t *local); +int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd); +char *page_aligned_alloc (size_t size, char **aligned_buf); +int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, + uint64_t *lv_size, uuid_t uuid); +uint64_t bd_get_default_extent (bd_priv_t *priv); +uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size); +int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv); +int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size); +int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno); +int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); +int bd_clone (bd_local_t *local, bd_priv_t *priv); + +int bd_merge (bd_priv_t *priv, uuid_t gfid); +int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); +inline void bd_update_amtime(struct iatt *iatt, int flag); +int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); +int bd_clone (bd_local_t *local, bd_priv_t *priv); +int bd_merge (bd_priv_t *priv, uuid_t gfid); +int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); + +#endif diff --git a/xlators/storage/bd_map/src/bd_map.c b/xlators/storage/bd_map/src/bd_map.c deleted file mode 100644 index 9c8f69c64..000000000 --- a/xlators/storage/bd_map/src/bd_map.c +++ /dev/null @@ -1,2580 +0,0 @@ -/* - BD translator - Exports Block devices on server side as regular - files to client - - Now only exporting Logical volumes supported. - - Copyright IBM, Corp. 2012 - - This file is part of GlusterFS. - - Author: - M. Mohan Kumar <mohan@in.ibm.com> - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <time.h> -#include <lvm2app.h> -#include <openssl/md5.h> - -#include "bd_map.h" -#include "bd_map_help.h" -#include "defaults.h" -#include "glusterfs3-xdr.h" -#include "run.h" -#include "protocol-common.h" - -/* Regular fops */ - -int -bd_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char path[PATH_MAX] = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - sprintf (path, "/dev/mapper/%s", loc->path); - op_ret = access (path, mask & 07); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, "access failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } - op_ret = 0; -out: - STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, NULL); - - return 0; -} - -#define LV_RENAME "/sbin/lvrename" - -int bd_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - char *new_path = NULL; - char *np = NULL; - struct iatt stbuf = {0, }; - struct iatt preoldparent = {0, }; - struct iatt postoldparent = {0, }; - struct iatt prenewparent = {0, }; - struct iatt postnewparent = {0, }; - bd_priv_t *priv = NULL; - bd_entry_t *lventry = NULL; - bd_entry_t *newp_entry = NULL; - char *path = NULL; - struct stat v_stat = {0, }; - runner_t runner = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (oldloc, out); - VALIDATE_OR_GOTO (newloc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, lventry, oldloc->path); - if (lventry->refcnt > 1) { - op_errno = EBUSY; - goto out; - } - - memcpy (&preoldparent, lventry->parent->attr, sizeof(preoldparent)); - - new_path = np = gf_strdup (newloc->path); - if (!new_path) - goto out; - new_path = strrchr (np, '/'); - if (!new_path) { - op_errno = EINVAL; - goto out; - } - - *new_path = '\0'; - BD_ENTRY (priv, newp_entry, np); - - memcpy (&prenewparent, newp_entry->parent->attr, sizeof(preoldparent)); - - runinit (&runner); - - runner_add_args (&runner, LV_RENAME, NULL); - runner_add_args (&runner, lventry->parent->name, NULL); - runner_add_args (&runner, oldloc->name, NULL); - runner_add_args (&runner, newloc->name, NULL); - - runner_start (&runner); - runner_end (&runner); - - /* verify */ - gf_asprintf (&path, "/dev/%s", newloc->path); - if (stat (path, &v_stat) < 0) { - op_errno = EIO; - goto out; - } - BD_ENTRY_UPDATE_MTIME (lventry); - BD_ENTRY_UPDATE_MTIME (newp_entry); - memcpy (&postoldparent, lventry->parent->attr, sizeof(postoldparent)); - memcpy (&postnewparent, newp_entry->parent->attr, - sizeof(postoldparent)); - BD_WR_LOCK (&priv->lock); - strncpy (lventry->name, newloc->name, sizeof(lventry->name)); - memcpy (&stbuf, lventry->attr, sizeof(stbuf)); - BD_UNLOCK (&priv->lock); - op_ret = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - if (newp_entry) - BD_PUT_ENTRY (priv, newp_entry); - if (np) - GF_FREE (np); - if (path) - GF_FREE (path); - - STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, &stbuf, - &preoldparent, &postoldparent, &prenewparent, - &postnewparent, NULL); - return 0; -} - -int32_t -bd_delete_lv (bd_priv_t *priv, bd_entry_t *p_entry, bd_entry_t *lventry, - const char *path, int *op_errno) -{ - vg_t vg = NULL; - lv_t lv = NULL; - int op_ret = -1; - - *op_errno = 0; - BD_WR_LOCK (&priv->lock); - vg = lvm_vg_open (priv->handle, p_entry->name, "w", 0); - if (!vg) { - *op_errno = ENOENT; - BD_UNLOCK (&priv->lock); - goto out; - } - - lv = lvm_lv_from_name (vg, lventry->name); - if (!lv) { - lvm_vg_close (vg); - *op_errno = ENOENT; - BD_UNLOCK (&priv->lock); - goto out; - } - op_ret = lvm_vg_remove_lv (lv); - if (op_ret < 0) { - *op_errno = errno; - lvm_vg_close (vg); - BD_UNLOCK (&priv->lock); - goto out; - } - lvm_vg_close (vg); - - op_ret = bd_entry_rm (path); - if (op_ret < 0) { - *op_errno = EIO; - BD_UNLOCK (&priv->lock); - goto out; - } - BD_ENTRY_UPDATE_MTIME (p_entry); - - op_ret = 0; - op_errno = 0; - - BD_UNLOCK (&priv->lock); - op_ret = 0; -out: - return op_ret; -} - -int32_t -bd_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = ENOENT; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - bd_priv_t *priv = NULL; - bd_entry_t *lventry = NULL; - bd_entry_t *p_entry = NULL; - char *vg_name = NULL; - char *volume = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - volume = vg_name = gf_strdup (loc->path); - if (!volume) - goto out; - volume = strrchr (volume, '/'); - if (!volume) { - op_errno = EINVAL; - goto out; - } - /* creating under non VG directory not permited */ - if (vg_name == volume) { - op_errno = EOPNOTSUPP; - goto out; - } - *volume = '\0'; - - BD_ENTRY (priv, p_entry, vg_name); - BD_ENTRY (priv, lventry, loc->path); - if (!p_entry || !lventry) - goto out; - - memcpy (&preparent, p_entry->attr, sizeof(preparent)); - op_ret = bd_delete_lv (priv, p_entry, lventry, loc->path, &op_errno); - memcpy (&postparent, p_entry->attr, sizeof(postparent)); -out: - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - if (lventry) - BD_PUT_ENTRY (priv, lventry); - if (vg_name) - GF_FREE (vg_name); - STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, - &preparent, &postparent, NULL); - - return 0; -} - -#define LVM_CREATE "/sbin/lvcreate" - -#define IOV_NR 4 -#define IOV_SIZE (4 * 1024) - -int bd_clone_lv (bd_priv_t *priv, bd_entry_t *p_entry, dict_t *output, - const char *vg_name, const char *lv_name, - const char *dest_lv_name, struct iatt *stbuf) -{ - int32_t ret = -1; - vg_t vg = NULL; - lv_t lv = NULL; - ssize_t size = 0; - uint64_t extent = 0; - int fd1 = -1; - int fd2 = -1; - struct iatt iattr = {0, }; - bd_entry_t *lventry = NULL; - char path[512] = {0, }; - struct iovec *vec = NULL; - int i = 0; - ssize_t bytes = 0; - int nr_iov = 0; - - vec = GF_CALLOC (IOV_NR, sizeof(struct iovec), gf_common_mt_iovec); - if (!vec) - goto out; - - for (i = 0; i < IOV_NR; i++) { - vec[i].iov_base = GF_MALLOC (IOV_SIZE, gf_common_mt_char); - if (!vec[i].iov_base) - goto out; - vec[i].iov_len = IOV_SIZE; - } - - vg = lvm_vg_open (priv->handle, vg_name, "w", 0); - if (!vg) { - gf_log (THIS->name, GF_LOG_ERROR, - "lvm_vg_open %s failed", vg_name); - ret = -1; - goto out; - } - lv = lvm_lv_from_name (vg, lv_name); - if (!lv) { - gf_log (THIS->name, GF_LOG_ERROR, "lvm_lv_from_name failed"); - ret = -1; - goto out; - } - - size = lvm_lv_get_size (lv); - extent = size / lvm_vg_get_extent_size (vg); - - if (lvm_vg_create_lv_linear (vg, dest_lv_name, size) == NULL) { - gf_log (THIS->name, GF_LOG_ERROR, "lv_create:%s", - lvm_errmsg(priv->handle)); - ret = -1; - goto out; - } - sprintf (path, "/dev/%s/%s", vg_name, lv_name); - fd1 = open (path, O_RDONLY); - if (fd1 < 0) { - gf_log (THIS->name, GF_LOG_ERROR, "opening %s failed", path); - goto out; - } - sprintf (path, "/dev/%s/%s", vg_name, dest_lv_name); - fd2 = open (path, O_WRONLY); - if (fd2 < 0) { - gf_log (THIS->name, GF_LOG_ERROR, "opening %s failed", path); - goto out; - } - - bd_entry_istat (path, &iattr, IA_IFREG); - iattr.ia_size = size; - - bytes = size; - while (bytes) { - size = readv(fd1, vec, IOV_NR); - if (size < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "read failed:%s", strerror(errno)); - goto out; - } - if (size < IOV_NR * IOV_SIZE) { - vec[size / IOV_SIZE].iov_len = size % IOV_SIZE; - nr_iov = (size / IOV_SIZE) + 1; - } else - nr_iov = IOV_NR; - bytes -= size; - size = writev (fd2, vec, nr_iov); - if (size < 0) { - gf_log (THIS->name, GF_LOG_DEBUG, - "write failed:%s", strerror(errno)); - goto out; - } - } - - lventry = bd_entry_add (p_entry, dest_lv_name, &iattr, IA_IFREG); - if (!lventry) { - ret = EAGAIN; - goto out; - } - - if (stbuf) - memcpy (stbuf, &iattr, sizeof(iattr)); - - ret = 0; - gf_log (THIS->name, GF_LOG_INFO, "Clone completed"); -out: - if (vg) - lvm_vg_close (vg); - if (fd1 != -1) - close (fd1); - if (fd2 != -1) - close (fd2); - if (vec) - iov_free (vec, IOV_NR); - return ret; -} - -int bd_snapshot_lv (bd_priv_t *priv, bd_entry_t *p_entry, dict_t *output, - const char *lv_name, const char *dest_lv, char *size, - struct iatt *stbuf) -{ - int32_t ret = -1; - struct iatt iattr = {0, }; - struct stat stat = {0, }; - bd_entry_t *lventry = NULL; - char *error = NULL; - int retval = -1; - runner_t runner = {0, }; - char *path = NULL; - vg_t vg = NULL; - lv_t lv = NULL; - - runinit (&runner); - - runner_add_args (&runner, LVM_CREATE, NULL); - runner_add_args (&runner, "--snapshot", NULL); - runner_argprintf (&runner, "/dev/%s/%s", p_entry->name, lv_name); - runner_add_args (&runner, "--name", NULL); - runner_argprintf (&runner, "%s", dest_lv); - runner_argprintf (&runner, "-L%s", size); - - runner_start (&runner); - runner_end (&runner); - - gf_asprintf (&path, "/dev/%s/%s", p_entry->name, dest_lv); - if (!path) { - ret = -ENOMEM; - goto out; - } - if (lstat (path, &stat) < 0) { - ret = -EAGAIN; - if (output) - gf_asprintf (&error, "try again"); - goto out; - } - - vg = lvm_vg_open (priv->handle, p_entry->name, "r", 0); - if (!vg) { - ret = -EIO; - if (output) - gf_asprintf (&error, "can't open vg %s", p_entry->name); - goto out; - } - lv = lvm_lv_from_name (vg, lv_name); - if (!lv) { - ret = -EIO; - if (output) - gf_asprintf (&error, "can't open lv %s", lv_name); - goto out; - } - bd_entry_istat (path, &iattr, IA_IFREG); - iattr.ia_size = lvm_lv_get_size (lv); - lventry = bd_entry_add (p_entry, dest_lv, &iattr, IA_IFREG); - if (!lventry) { - if (output) - gf_asprintf (&error, "try again"); - ret = -EAGAIN; - goto out; - } - if (stbuf) - memcpy (stbuf, &iattr, sizeof(iattr)); - ret = 0; -out: - if (vg) - lvm_vg_close (vg); - if (error && output) - retval = dict_set_str (output, "error", error); - GF_FREE (path); - return ret; -} - -/* - * Creates a snapshot of given LV - */ -int -bd_symlink (call_frame_t *frame, xlator_t *this, - const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - bd_entry_t *lventry = NULL; - char *name = NULL; - char *np = NULL; - char *volume = NULL; - char *vg_name = NULL; - char *path = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - if (strchr (loc->path, '/')) { - vg_name = gf_strdup (loc->path); - volume = strrchr (vg_name, '/'); - if (!volume) { - op_errno = EINVAL; - goto out; - } - /* creating under non VG directory not permited */ - if (vg_name == volume) { - op_errno = EOPNOTSUPP; - goto out; - } - GF_FREE (vg_name); - vg_name = NULL; - } - - /* - * symlink creation for BD xlator is different - * source (LV) has to exist for creation of symbolic link (snapshot) - */ - if (strchr (linkname, '/')) { - op_errno = EOPNOTSUPP; - goto out; - } - gf_asprintf (&path, "%s/%s", priv->vg, linkname); - if (!path) { - op_errno = -ENOMEM; - goto out; - } - BD_ENTRY (priv, lventry, path); - if (!lventry) { - op_errno = ENOENT; - goto out; - } - - name = np = gf_strdup (loc->path); - if (!name) - goto out; - - /* Get LV name from loc->path */ - name = strrchr (loc->path, '/'); - if (name != loc->path) - name++; - - memcpy (&preparent, lventry->parent->attr, sizeof(preparent)); - if (bd_snapshot_lv (priv, lventry->parent, NULL, lventry->name, - name, "1", &stbuf) < 0) { - op_errno = EAGAIN; - goto out; - } - BD_ENTRY_UPDATE_MTIME (lventry->parent); - memcpy (&postparent, lventry->parent->attr, sizeof (postparent)); - op_ret = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - if (np) - GF_FREE (np); - if (vg_name) - GF_FREE (vg_name); - if (path) - GF_FREE (path); - - STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, - (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -/* - * bd_link: Does full clone of given logical volume - * A new logical volume with source logical volume's size created - * and entire content copied - */ -int -bd_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - bd_entry_t *lventry = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (oldloc, out); - VALIDATE_OR_GOTO (newloc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, lventry, oldloc->path); - if (!lventry) { - op_errno = ENOENT; - goto out; - } - memcpy (&postparent, lventry->parent->attr, sizeof (postparent)); - if (bd_clone_lv (priv, lventry->parent, NULL, lventry->parent->name, - lventry->name, newloc->name, &stbuf) < 0) { - op_errno = EAGAIN; - goto out; - } - BD_ENTRY_UPDATE_MTIME (lventry->parent); - memcpy (&preparent, lventry->parent->attr, sizeof (preparent)); - op_ret = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - - - STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, - (oldloc)?oldloc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -int32_t -bd_open (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t _fd = -1; - bd_fd_t *bd_fd = NULL; - bd_entry_t *lventry = NULL; - bd_priv_t *priv = NULL; - char *devpath = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, lventry, loc->path); - if (!lventry) { - op_errno = ENOENT; - goto out; - } - - gf_asprintf (&devpath, "/dev/%s/%s", lventry->parent->name, - lventry->name); - _fd = open (devpath, flags, 0); - if (_fd == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "open on %s: %s", devpath, strerror (op_errno)); - goto out; - } - - bd_fd = GF_CALLOC (1, sizeof(*bd_fd), gf_bd_fd); - if (!bd_fd) { - op_errno = errno; - goto out; - } - bd_fd->entry = lventry; - bd_fd->fd = _fd; - - op_ret = fd_ctx_set (fd, this, (uint64_t)(long)bd_fd); - if (op_ret) { - gf_log (this->name, GF_LOG_WARNING, - "failed to set the fd context path=%s fd=%p", - loc->name, fd); - goto out; - } - - op_ret = 0; -out: - if (op_ret == -1) { - if (_fd != -1) - close (_fd); - /* FIXME: Should we call fd_ctx_set with NULL? */ - if (bd_fd) - GF_FREE (bd_fd); - if (lventry) - BD_PUT_ENTRY (priv, lventry); - } - if (devpath) - GF_FREE (devpath); - - STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL); - - return 0; -} - -int -bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) -{ - uint64_t tmp_bd_fd = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - bd_priv_t *priv = NULL; - struct iobuf *iobuf = NULL; - struct iobref *iobref = NULL; - struct iovec vec = {0, }; - bd_fd_t *bd_fd = NULL; - int ret = -1; - struct iatt stbuf = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - op_errno = -EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL from fd=%p", fd); - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - if (!size) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); - goto out; - } - iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); - if (!iobuf) { - op_errno = ENOMEM; - goto out; - } - _fd = bd_fd->fd; - op_ret = pread (_fd, iobuf->ptr, size, offset); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "read failed on fd=%p: %s", fd, - strerror (op_errno)); - goto out; - } - - vec.iov_base = iobuf->ptr; - vec.iov_len = op_ret; - - iobref = iobref_new (); - iobref_add (iobref, iobuf); - BD_ENTRY_UPDATE_ATIME (bd_fd->entry); - - memcpy (&stbuf, bd_fd->entry->attr, sizeof(stbuf)); - - /* Hack to notify higher layers of EOF. */ - if (bd_fd->entry->size == 0) - op_errno = ENOENT; - else if ((offset + vec.iov_len) >= bd_fd->entry->size) - op_errno = ENOENT; - op_ret = vec.iov_len; -out: - STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, - &vec, 1, &stbuf, iobref, NULL); - - if (iobref) - iobref_unref (iobref); - if (iobuf) - iobuf_unref (iobuf); - return 0; -} - -#define LVM_RESIZE "/sbin/lvresize" - -int32_t -bd_resize (bd_priv_t *priv, bd_entry_t *lventry, off_t *size) -{ - bd_entry_t *vgentry = NULL; - uint64_t extent = 0; - int32_t op_ret = -1; - vg_t vg = NULL; - uint32_t nr_ex = 0; - lv_t lv = NULL; - uint64_t new_size = 0; - runner_t runner = {0, }; - - BD_ENTRY (priv, vgentry, lventry->parent->name); - if (!vgentry) { - op_ret = ENOENT; - goto out; - } - - BD_WR_LOCK (&priv->lock); - vg = lvm_vg_open (priv->handle, vgentry->name, "w", 0); - if (!vg) { - op_ret = lvm_errno (priv->handle); - BD_UNLOCK (&priv->lock); - goto out; - } - - extent = lvm_vg_get_extent_size (vg); - lvm_vg_close (vg); - BD_UNLOCK (&priv->lock); - - nr_ex = *size / extent; - if (*size % extent) - nr_ex++; - *size = extent * nr_ex; - - runinit (&runner); - - runner_add_args (&runner, LVM_RESIZE, NULL); - runner_argprintf (&runner, "/dev/%s/%s", lventry->parent->name, - lventry->name); - runner_argprintf (&runner, "-l%ld", nr_ex); - runner_add_args (&runner, "-f", NULL); - - runner_start (&runner); - runner_end (&runner); - - BD_WR_LOCK (&priv->lock); - vg = lvm_vg_open (priv->handle, vgentry->name, "w", 0); - if (!vg) { - op_ret = lvm_errno (priv->handle); - BD_UNLOCK (&priv->lock); - goto out; - } - - lv = lvm_lv_from_name (vg, lventry->name); - if (!lv) { - op_ret = lvm_errno (priv->handle); - lvm_vg_close (vg); - BD_UNLOCK (&priv->lock); - goto out; - } - new_size = lvm_lv_get_size (lv); - lvm_vg_close (vg); - if (new_size != *size) { - op_ret = EIO; - BD_UNLOCK (&priv->lock); - goto out; - } - - BD_UNLOCK (&priv->lock); - op_ret = 0; - -out: - if (vgentry) - BD_PUT_ENTRY (priv, vgentry); - - return op_ret; -} - - int32_t -bd_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - struct iatt preop = {0, }; - struct iatt postop = {0, }; - bd_fd_t *bd_fd = NULL; - int ret = -1; - uint64_t tmp_bd_fd = 0; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - - memcpy (&preop, bd_fd->entry->attr, sizeof(preop)); - if (offset > bd_fd->entry->size) { - op_errno = bd_resize (priv, bd_fd->entry, &offset); - if (op_errno) - goto out; - if (offset > bd_fd->entry->size) { - bd_fd->entry->attr->ia_size = offset; - bd_fd->entry->size = offset; - } - } - /* If the requested size is less then current size - * we will not update that in bd_fd->entry->attr - * because it will result in showing size of this file less - * instead we will return 0 for less size truncation - */ - BD_ENTRY_UPDATE_MTIME (bd_fd->entry); - memcpy (&postop, bd_fd->entry->attr, sizeof(postop)); - - op_ret = 0; -out: - STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, &preop, - &postop, NULL); - return 0; -} - -int32_t -bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, - off_t offset, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - struct iatt prebuf = {0, }; - struct iatt postbuf = {0, }; - bd_entry_t *lventry = NULL; - bd_priv_t *priv = NULL; - off_t size = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - BD_ENTRY (priv, lventry, loc->path); - if (!lventry) { - op_errno = ENOENT; - gf_log (this->name, GF_LOG_ERROR, - "pre-operation lstat on %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - memcpy (&prebuf, lventry->attr, sizeof(prebuf)); - if (offset > lventry->size) { - op_errno = bd_resize (priv, lventry, &size); - if (op_errno) - goto out; - if (lventry->size < offset) { - lventry->attr->ia_size = offset; - lventry->size = size; - } - } - BD_ENTRY_UPDATE_MTIME (lventry); - memcpy (&postbuf, lventry->attr, sizeof(postbuf)); - BD_PUT_ENTRY (priv, lventry); - op_ret = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, - &prebuf, &postbuf, NULL); - return 0; -} - -int32_t -__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset, - uint64_t bd_size) -{ - int32_t op_ret = 0; - int index = 0; - int retval = 0; - off_t internal_offset = 0; - int no_space = 0; - - if (!vector) - return -EFAULT; - - internal_offset = offset; - for (index = 0; index < count; index++) { - if (internal_offset >= bd_size) { - op_ret = -ENOSPC; - goto err; - } - if (internal_offset + vector[index].iov_len >= bd_size) { - vector[index].iov_len = bd_size - internal_offset; - no_space = 1; - } - - retval = pwrite (fd, vector[index].iov_base, - vector[index].iov_len, internal_offset); - if (retval == -1) { - gf_log (THIS->name, GF_LOG_WARNING, - "base %p, length %ld, offset %ld, message %s", - vector[index].iov_base, vector[index].iov_len, - internal_offset, strerror (errno)); - op_ret = -errno; - goto err; - } - op_ret += retval; - internal_offset += retval; - if (no_space) - break; - } -err: - return op_ret; -} - -int bd_create_lv (bd_priv_t *priv, bd_entry_t *p_entry, const char *vg_name, - const char *lv_name, char *size, mode_t mode) -{ - vg_t vg = NULL; - int ret = -1; - char *path = NULL; - struct iatt iattr = {0, }; - bd_entry_t *lventry = NULL; - uint64_t extent = 0; - - BD_WR_LOCK (&priv->lock); - vg = lvm_vg_open (priv->handle, vg_name, "w", 0); - if (!vg) { - ret = -1; - goto out; - } - extent = lvm_vg_get_extent_size (vg); - if (size) - gf_string2bytesize (size, &extent); - - if (lvm_vg_create_lv_linear (vg, lv_name, extent) == NULL) { - ret = -EAGAIN; - lvm_vg_close (vg); - goto out; - } - lvm_vg_close (vg); - - gf_asprintf (&path, "/dev/%s/%s", vg_name, lv_name); - if (!path) { - ret = -ENOMEM; - lvm_vg_close (vg); - goto out; - } - bd_entry_istat (path, &iattr, IA_IFREG); - iattr.ia_size = extent; - if (!mode) - mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; - - iattr.ia_type = ia_type_from_st_mode (mode); - iattr.ia_prot = ia_prot_from_st_mode (mode); - lventry = bd_entry_add (p_entry, lv_name, &iattr, IA_IFREG); - if (!lventry) { - ret = -EAGAIN; - goto out; - } - ret = 0; -out: - BD_UNLOCK (&priv->lock); - if (path) - GF_FREE (path); - return ret; -} - -int bd_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int32_t _fd = -1; - bd_priv_t *priv = NULL; - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - bd_entry_t *p_entry = NULL; - bd_entry_t *lventry = NULL; - bd_fd_t *pfd = NULL; - char *vg_name = NULL; - char *volume = NULL; - char *path = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - volume = vg_name = gf_strdup (loc->path); - if (!volume) - goto out; - volume = strrchr (volume, '/'); - if (!volume) { - op_errno = EINVAL; - goto out; - } - /* creating under non VG directory not permited */ - if (vg_name == volume) { - op_errno = EOPNOTSUPP; - goto out; - } - *volume = '\0'; - - BD_ENTRY (priv, p_entry, vg_name); - if (!p_entry) { - op_errno = ENOENT; - goto out; - } - - memcpy (&preparent, p_entry->attr, sizeof(preparent)); - - op_errno = bd_create_lv (priv, p_entry, p_entry->name, loc->name, 0, - mode); - if (op_errno) - goto out; - - BD_ENTRY (priv, lventry, loc->path); - if (!lventry) { - gf_log (this->name, GF_LOG_WARNING, - "newly created LV not available %s", loc->path); - op_errno = EAGAIN; - goto out; - } - - /* Mask O_CREATE since we created LV */ - flags &= ~(O_CREAT | O_EXCL); - - gf_asprintf (&path, "/dev/%s/%s", p_entry->name, loc->name); - if (!path) { - op_errno = ENOMEM; - goto out; - } - _fd = open (path, flags, 0); - if (_fd == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "open on %s: %s", path, strerror (op_errno)); - goto out; - } - - memcpy (&stbuf, lventry->attr, sizeof(stbuf)); - - pfd = GF_CALLOC (1, sizeof(*pfd), gf_bd_fd); - if (!pfd) { - op_errno = errno; - goto out; - } - pfd->flag = flags; - pfd->fd = _fd; - pfd->entry = lventry; - - if (fd_ctx_set (fd, this, (uint64_t)(long)pfd)) { - gf_log (this->name, GF_LOG_WARNING, - "failed to set the fd context path=%s fd=%p", - loc->name, fd); - goto out; - } - - op_ret = 0; - - memcpy (&postparent, p_entry->attr, sizeof(postparent)); -out: - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - if (path) - GF_FREE (path); - if (op_ret < 0 && lventry) - BD_PUT_ENTRY (priv, lventry); - if (vg_name) - GF_FREE (vg_name); - - STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, - (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -/* - * We don't do actual setattr on devices on the host side, we just update - * the entries in server process & they are not persistent - */ -int bd_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iatt *stbuf, int32_t valid, dict_t *xdata) -{ - struct iatt statpre = {0, }; - struct iatt statpost = {0, }; - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - bd_fd_t *pfd = NULL; - int ret = 0; - uint64_t tmp_pfd = 0; - int _fd = -1; - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp_pfd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "pfd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - pfd = (bd_fd_t *)(long)tmp_pfd; - - _fd = pfd->fd; - memcpy (&statpre, pfd->entry->attr, sizeof(statpre)); - op_ret = 0; - - if (valid & GF_SET_ATTR_MODE) - pfd->entry->attr->ia_prot = stbuf->ia_prot; - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { - if (valid & GF_SET_ATTR_UID) - pfd->entry->attr->ia_uid = stbuf->ia_uid; - if (valid & GF_SET_ATTR_GID) - pfd->entry->attr->ia_gid = stbuf->ia_gid; - } - if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { - pfd->entry->attr->ia_atime = stbuf->ia_atime; - pfd->entry->attr->ia_atime_nsec = stbuf->ia_atime_nsec; - pfd->entry->attr->ia_mtime = stbuf->ia_mtime; - pfd->entry->attr->ia_mtime_nsec = stbuf->ia_mtime_nsec; - } - memcpy (&statpost, pfd->entry->attr, sizeof(statpost)); - op_errno = 0; -out: - STACK_UNWIND_STRICT (setattr, frame, 0, 0, &statpre, &statpost, NULL); - return 0; -} - -int bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, - struct iatt *stbuf, int32_t valid, dict_t *xdata) -{ - struct iatt statpre = {0, }; - struct iatt statpost = {0, }; - bd_entry_t *lventry = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - char path[PATH_MAX] = {0, }; - - priv = this->private; - - /* - * We don't allow to do setattr on / on host side - * ie /dev - */ - if (!strcmp (loc->path, "/")) { - op_ret = 0; - goto out; - } - - BD_ENTRY (priv, lventry, loc->path); - if (!lventry) { - op_errno = ENOENT; - goto out; - } - sprintf (path, "/dev/%s/%s", lventry->parent->name, lventry->name); - - memcpy (&statpre, lventry->attr, sizeof(statpre)); - if (valid & GF_SET_ATTR_MODE) - lventry->attr->ia_prot = stbuf->ia_prot; - if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) { - if (valid & GF_SET_ATTR_UID) - lventry->attr->ia_uid = stbuf->ia_uid; - if (valid & GF_SET_ATTR_GID) - lventry->attr->ia_gid = stbuf->ia_gid; - } - if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) { - lventry->attr->ia_atime = stbuf->ia_atime; - lventry->attr->ia_atime_nsec = stbuf->ia_atime_nsec; - lventry->attr->ia_mtime = stbuf->ia_mtime; - lventry->attr->ia_mtime_nsec = stbuf->ia_mtime_nsec; - } - memcpy (&statpost, lventry->attr, sizeof(statpost)); - op_errno = 0; -out: - if (lventry) - BD_PUT_ENTRY (priv, lventry); - STACK_UNWIND_STRICT (setattr, frame, 0, 0, &statpre, &statpost, NULL); - return 0; -} - -int -bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int _fd = -1; - bd_priv_t *priv = NULL; - bd_fd_t *bd_fd = NULL; - int ret = -1; - struct iatt preop = {0, }; - struct iatt postop = {0, }; - uint64_t tmp_bd_fd = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (vector, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - op_errno = -ret; - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL from fd=%p", fd); - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - _fd = bd_fd->fd; - - memcpy (&preop, bd_fd->entry->attr, sizeof(preop)); - op_ret = __bd_pwritev (_fd, vector, count, offset, bd_fd->entry->size); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64 - ", %s", offset, strerror (op_errno)); - goto out; - } - BD_ENTRY_UPDATE_MTIME (bd_fd->entry); - memcpy (&postop, bd_fd->entry->attr, sizeof(postop)); - -out: - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, - &postop, NULL); - - return 0; -} - -int32_t -bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) -{ - struct iatt buf = {0, }; - int32_t op_ret = -1; - int32_t entry_ret = 0; - int32_t op_errno = 0; - char *pathdup = NULL; - bd_entry_t *bdentry = NULL; - struct iatt postparent = {0, }; - bd_priv_t *priv = NULL; - char *p = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, bdentry, loc->path); - if (!bdentry) { - op_errno = ENOENT; - entry_ret = -1; - goto parent; - } - memcpy (&buf, bdentry->attr, sizeof(buf)); - BD_PUT_ENTRY (priv, bdentry); - -parent: - if (loc->parent) { - pathdup = p = gf_strdup (loc->path); - if (!pathdup) { - op_errno = ENOMEM; - entry_ret = -1; - goto out; - } - p = strrchr (pathdup, '/'); - if (p == pathdup) - *(p+1) = '\0'; - else - *p = '\0'; - BD_ENTRY (priv, bdentry, pathdup); - if (!bdentry) { - op_errno = ENOENT; - gf_log (this->name, GF_LOG_ERROR, - "post-operation lookup on parent of %s " - "failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - memcpy (&postparent, bdentry->attr, sizeof(postparent)); - BD_PUT_ENTRY (priv, bdentry); - } - - op_ret = entry_ret; -out: - if (pathdup) - GF_FREE (pathdup); - - STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, - (loc)?loc->inode:NULL, &buf, NULL, &postparent); - - return 0; -} - -int32_t -bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) -{ - struct iatt buf = {0,}; - int32_t op_ret = -1; - int32_t op_errno = 0; - bd_entry_t *bdentry = NULL; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, bdentry, loc->path); - if (!bdentry) { - op_errno = ENOENT; - gf_log (this->name, GF_LOG_ERROR, "stat on %s failed: %s", - loc->path, strerror (op_errno)); - goto out; - } - memcpy (&buf, bdentry->attr, sizeof(buf)); - BD_PUT_ENTRY (priv, bdentry); - op_ret = 0; - -out: - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL); - - return 0; -} - -int32_t -bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - uint64_t tmp_bd_fd = 0; - struct iatt buf = {0, }; - bd_fd_t *bd_fd = NULL; - int _fd = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL, fd=%p", fd); - op_errno = -EINVAL; - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - _fd = bd_fd->fd; - - memcpy (&buf, bd_fd->entry->attr, sizeof(buf)); - op_ret = 0; - -out: - STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL); - return 0; -} - -int32_t -bd_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t op_errno = EINVAL; - bd_fd_t *bd_fd = NULL; - bd_entry_t *bdentry = NULL; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (loc, out); - VALIDATE_OR_GOTO (loc->path, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - BD_ENTRY (priv, bdentry, loc->path); - if (!bdentry) { - op_errno = ENOENT; - gf_log (this->name, GF_LOG_ERROR, "opendir failed on %s: %s", - loc->path, strerror (op_errno)); - goto out; - } - bd_fd = GF_CALLOC (1, sizeof(*bd_fd), gf_bd_fd); - if (!bd_fd) { - op_errno = errno; - BD_PUT_ENTRY (priv, bdentry); - goto out; - } - - bd_fd->p_entry = bdentry; - - bdentry = list_entry ((&bdentry->child)->next, typeof(*bdentry), child); - if (!bdentry) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_ERROR, "bd_entry NULL"); - goto out; - } - bdentry = list_entry ((&bdentry->sibling), typeof(*bdentry), sibling); - if (!bdentry) { - op_errno = EINVAL; - gf_log (this->name, GF_LOG_ERROR, "bd_entry NULL"); - goto out; - } - - bd_fd->entry = bdentry; - - op_ret = fd_ctx_set (fd, this, (uint64_t) (long)bd_fd); - if (op_ret) { - gf_log (this->name, GF_LOG_ERROR, - "failed to set the fd context path=%s fd=%p", - loc->path, fd); - goto out; - } - - op_ret = 0; -out: - if (op_ret == -1) { - BD_PUT_ENTRY (priv, bd_fd->p_entry); - if (bd_fd) - GF_FREE (bd_fd); - } - - STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, NULL); - return 0; -} - -int32_t -bd_releasedir (xlator_t *this, fd_t *fd) -{ - bd_fd_t *bd_fd = NULL; - uint64_t tmp_bd_fd = 0; - int ret = 0; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_del (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "bd_fd from fd=%p is NULL", - fd); - goto out; - } - bd_fd = (bd_fd_t *) (long)tmp_bd_fd; - BD_PUT_ENTRY (priv, bd_fd->p_entry); - - bd_fd = (bd_fd_t *) (long)tmp_bd_fd; - GF_FREE (bd_fd); -out: - return 0; -} - -/* - * bd_statfs: Mimics statfs by returning used/free extents in the VG - * TODO: IF more than one VG allowed per volume, this functions needs some - * change - */ -int32_t -bd_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata) -{ - int32_t op_ret = -1; - int32_t ret = -1; - int32_t op_errno = 0; - bd_priv_t *priv = NULL; - struct statvfs buf = {0, }; - vg_t vg = NULL; - char *vg_name = NULL; - uint64_t size = 0; - uint64_t fr_size = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = dict_get_str (this->options, "export", &vg_name); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/bd does not specify volume groups"); - op_errno = EINVAL; - goto out; - } - - BD_RD_LOCK (&priv->lock); - - vg = lvm_vg_open (priv->handle, vg_name, "r", 0); - size += lvm_vg_get_size (vg); - fr_size += lvm_vg_get_free_size (vg); - lvm_vg_close (vg); - - BD_UNLOCK (&priv->lock); - - if (statvfs ("/", &buf) < 0) { - op_errno = errno; - goto out; - } - op_ret = 0; - buf.f_blocks = size / buf.f_frsize; - buf.f_bfree = fr_size / buf.f_frsize; - buf.f_bavail = fr_size / buf.f_frsize; -out: - STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, &buf, NULL); - return 0; -} - -int32_t -bd_release (xlator_t *this, fd_t *fd) -{ - bd_fd_t *bd_fd = NULL; - int ret = -1; - uint64_t tmp_bd_fd = 0; - bd_priv_t *priv = NULL; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "bd_fd is NULL from fd=%p", - fd); - goto out; - } - bd_fd = (bd_fd_t *) (long)tmp_bd_fd; - close (bd_fd->fd); - BD_PUT_ENTRY (priv, bd_fd->entry); - - GF_FREE (bd_fd); -out: - return 0; -} - -int32_t -bd_fsync (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t datasync, dict_t *xdata) -{ - int _fd = -1; - int ret = -1; - int32_t op_ret = -1; - int32_t op_errno = 0; - uint64_t tmp_bd_fd = 0; - bd_fd_t *bd_fd = NULL; - struct iatt preop = {0, }; - struct iatt postop = {0, }; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL, fd=%p", fd); - op_errno = -ret; - goto out; - } - bd_fd = (bd_fd_t *)(long)tmp_bd_fd; - - _fd = bd_fd->fd; - memcpy (&preop, &bd_fd->entry->attr, sizeof(preop)); - if (datasync) { - ; -#ifdef HAVE_FDATASYNC - op_ret = fdatasync (_fd); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "fdatasync on fd=%p failed: %s", - fd, strerror (errno)); - } -#endif - } else { - op_ret = fsync (_fd); - if (op_ret == -1) { - op_errno = errno; - gf_log (this->name, GF_LOG_ERROR, - "fsync on fd=%p failed: %s", - fd, strerror (op_errno)); - goto out; - } - } - - memcpy (&postop, bd_fd->entry->attr, sizeof(postop)); - op_ret = 0; - -out: - STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop, - &postop, NULL); - - return 0; -} - -int32_t -bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdict) -{ - int32_t op_ret = -1; - int32_t op_errno = 0; - int ret = -1; - uint64_t tmp_bd_fd = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - op_errno = -EINVAL; - gf_log (this->name, GF_LOG_WARNING, - "bd_fd is NULL on fd=%p", fd); - goto out; - } - op_ret = 0; -out: - STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL); - - return 0; -} - -int -__bd_fill_readdir (pthread_rwlock_t *bd_lock, bd_fd_t *bd_fd, off_t off, - size_t size, gf_dirent_t *entries) -{ - size_t filled = 0; - int count = 0; - struct dirent entry = {0, }; - int32_t this_size = -1; - gf_dirent_t *this_entry = NULL; - bd_entry_t *bdentry = NULL; - bd_entry_t *cur_entry = NULL; - bd_entry_t *n_entry = NULL; - - BD_RD_LOCK (bd_lock); - - bdentry = list_entry ((&bd_fd->p_entry->child)->next, typeof(*n_entry), - child); - - if (off) { - int i = 0; - list_for_each_entry (n_entry, &bd_fd->entry->sibling, sibling) { - if (i == off && strcmp (n_entry->name, "")) { - bd_fd->entry = n_entry; - break; - } - } - } else - bd_fd->entry = list_entry ((&bdentry->sibling), - typeof(*n_entry), sibling); - - while (filled <= size) { - cur_entry = bd_fd->entry; - - n_entry = list_entry ((&bd_fd->entry->sibling)->next, - typeof (*cur_entry), sibling); - if (&n_entry->sibling == (&bdentry->sibling)) - break; - - strcpy (entry.d_name, n_entry->name); - entry.d_ino = n_entry->attr->ia_ino; - entry.d_off = off; - if (n_entry->attr->ia_type == IA_IFDIR) - entry.d_type = DT_DIR; - else - entry.d_type = DT_REG; - - this_size = max (sizeof(gf_dirent_t), - sizeof (gfs3_dirplist)) - + strlen (entry.d_name) + 1; - - if (this_size + filled > size) - break; - - bd_fd->entry = n_entry; - - this_entry = gf_dirent_for_name (entry.d_name); - if (!this_entry) { - gf_log (THIS->name, GF_LOG_ERROR, - "could not create gf_dirent for entry %s", - entry.d_name); - goto out; - } - this_entry->d_off = off; - this_entry->d_ino = entry.d_ino; - this_entry->d_type = entry.d_type; - off++; - - list_add_tail (&this_entry->list, &entries->list); - - filled += this_size; - count++; - } -out: - BD_UNLOCK (bd_lock); - return count; -} - -int32_t -bd_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, int whichop) -{ - uint64_t tmp_bd_fd = 0; - bd_fd_t *bd_fd = NULL; - int ret = -1; - int count = 0; - int32_t op_ret = -1; - int32_t op_errno = 0; - gf_dirent_t entries; - gf_dirent_t *tmp_entry = NULL; - bd_entry_t *bdentry = NULL; - bd_priv_t *priv = NULL; - char *devpath = NULL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - INIT_LIST_HEAD (&entries.list); - - ret = fd_ctx_get (fd, this, &tmp_bd_fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, "bd_fd is NULL, fd=%p", fd); - op_errno = -EINVAL; - goto out; - } - bd_fd = (bd_fd_t *) (long)tmp_bd_fd; - LOCK (&fd->lock); - { - count = __bd_fill_readdir (&priv->lock, bd_fd, off, - size, &entries); - } - UNLOCK (&fd->lock); - - /* pick ENOENT to indicate EOF */ - op_errno = errno; - op_ret = count; - - if (whichop != GF_FOP_READDIRP) - goto out; - - BD_RD_LOCK (&priv->lock); - list_for_each_entry (tmp_entry, &entries.list, list) { - char path[PATH_MAX]; - sprintf (path, "%s/%s", bd_fd->p_entry->name, - tmp_entry->d_name); - bdentry = bd_entry_get (path); - if (!bdentry) { - gf_log (this->name, GF_LOG_WARNING, - "entry failed %s\n", tmp_entry->d_name); - continue; - } - if (bdentry->attr->ia_ino) - tmp_entry->d_ino = bdentry->attr->ia_ino; - memcpy (&tmp_entry->d_stat, - bdentry->attr, sizeof (tmp_entry->d_stat)); - bd_entry_put (bdentry); - GF_FREE (devpath); - } - BD_UNLOCK (&priv->lock); - -out: - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL); - - gf_dirent_free (&entries); - - return 0; -} - -int32_t -bd_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *dict) -{ - bd_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR); - return 0; -} - - -int32_t -bd_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *dict) -{ - bd_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP); - return 0; -} - -int32_t -bd_priv (xlator_t *this) -{ - return 0; -} - -int32_t -bd_inode (xlator_t *this) -{ - return 0; -} - -/* unsupported interfaces */ -int32_t -bd_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size, dict_t *xdata) -{ - struct iatt stbuf = {0, }; - char *dest = NULL; - - dest = alloca (size + 1); - STACK_UNWIND_STRICT (readlink, frame, -1, ENOSYS, dest, &stbuf, NULL); - return 0; -} - -int -bd_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t dev, mode_t umask, dict_t *xdata) -{ - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - - STACK_UNWIND_STRICT (mknod, frame, -1, ENOSYS, - (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -int -bd_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) -{ - struct iatt stbuf = {0, }; - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - - STACK_UNWIND_STRICT (mkdir, frame, -1, ENOSYS, - (loc)?loc->inode:NULL, &stbuf, &preparent, - &postparent, NULL); - return 0; -} - -int -bd_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - dict_t *xdata) -{ - struct iatt preparent = {0, }; - struct iatt postparent = {0, }; - - STACK_UNWIND_STRICT (rmdir, frame, -1, ENOSYS, - &preparent, &postparent, NULL); - return 0; -} - -int32_t -bd_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int flags, dict_t *xdata) -{ - STACK_UNWIND_STRICT (setxattr, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int flags, dict_t *xdata) -{ - STACK_UNWIND_STRICT (setxattr, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) -{ - STACK_UNWIND_STRICT (getxattr, frame, -1, ENOSYS, NULL, NULL); - return 0; -} - -int32_t -bd_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOSYS, NULL, NULL); - - return 0; -} - -int32_t -bd_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) -{ - STACK_UNWIND_STRICT (removexattr, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_fsyncdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, int datasync, dict_t *xdata) -{ - STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOSYS, NULL); - return 0; -} - -static int gf_bd_lk_log; -int32_t -bd_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata) -{ - struct gf_flock nullock = {0, }; - - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock, NULL); - return 0; -} - -int32_t -bd_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, - struct gf_flock *lock, dict_t *xdata) -{ - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, - struct gf_flock *lock, dict_t *xdata) -{ - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS, NULL); - return 0; -} - - -int32_t -bd_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, const char *basename, - entrylk_cmd cmd, entrylk_type type, dict_t *xdata) -{ - GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL, - "\"features/locks\" translator is " - "not loaded. You need to use it for proper " - "functioning of your application."); - - STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS, NULL); - return 0; -} - -int32_t -bd_rchecksum (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, int32_t len, dict_t *xdata) -{ - int32_t weak_checksum = 0; - unsigned char strong_checksum[MD5_DIGEST_LENGTH]; - - STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOSYS, - weak_checksum, strong_checksum, NULL); - return 0; -} - -int -bd_xattrop (call_frame_t *frame, xlator_t *this, - loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr, - dict_t *xdata) -{ - STACK_UNWIND_STRICT (xattrop, frame, -1, ENOSYS, xattr, NULL); - return 0; -} - - -int -bd_fxattrop (call_frame_t *frame, xlator_t *this, - fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr, - dict_t *xdata) -{ - STACK_UNWIND_STRICT (xattrop, frame, -1, ENOSYS, xattr, NULL); - return 0; -} - -int bd_xl_op_create (bd_priv_t *priv, dict_t *input, dict_t *output) -{ - char *vg = NULL; - char *lv = NULL; - char *path = NULL; - bd_entry_t *p_entry = NULL; - bd_entry_t *lventry = NULL; - char *size = 0; - int ret = -1; - char *error = NULL; - int retval = -1; - char *buff = NULL; - char *buffp = NULL; - char *save = NULL; - - ret = dict_get_str (input, "size", &size); - if (ret) { - gf_asprintf (&error, "no size specified"); - goto out; - } - ret = dict_get_str (input, "path", &path); - if (ret) { - gf_asprintf (&error, "no path specified"); - goto out; - } - - buff = buffp = gf_strdup (path); - - vg = strtok_r (buff, "/", &save); - lv = strtok_r (NULL, "/", &save); - - if (!vg || !lv) { - gf_asprintf (&error, "invalid path %s", path); - ret = -1; - goto out; - } - - BD_ENTRY (priv, p_entry, vg); - if (!p_entry) { - ret = -ENOENT; - goto out; - } - BD_ENTRY (priv, lventry, path); - if (lventry) { - ret = -EEXIST; - gf_asprintf (&error, "%s already exists", lv); - BD_PUT_ENTRY (priv, lventry); - goto out; - } - - ret = bd_create_lv (priv, p_entry, vg, lv, size, 0); - if (ret < 0) { - gf_asprintf (&error, "bd_create_lv error %d", -ret); - goto out; - } - ret = 0; -out: - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - - if (buffp) - GF_FREE (buffp); - - if (error) - retval = dict_set_dynstr (output, "error", error); - return ret; -} - -int bd_xl_op_delete (bd_priv_t *priv, dict_t *input, dict_t *output) -{ - char *vg = NULL; - char *path = NULL; - bd_entry_t *p_entry = NULL; - bd_entry_t *lventry = NULL; - int ret = -1; - char *error = NULL; - int retval = -1; - char *buff = NULL; - char *buffp = NULL; - char *save = NULL; - int op_errno = 0; - - ret = dict_get_str (input, "path", &path); - if (ret) { - gf_asprintf (&error, "no path specified"); - goto out; - } - - buff = buffp = gf_strdup (path); - - vg = strtok_r (buff, "/", &save); - if (!vg) { - gf_asprintf (&error, "invalid path %s", path); - op_errno = EINVAL; - ret = -1; - goto out; - } - - BD_ENTRY (priv, p_entry, vg); - BD_ENTRY (priv, lventry, path); - if (!p_entry || !lventry) { - op_errno = -ENOENT; - gf_asprintf (&error, "%s not found", path); - ret = -1; - goto out; - } - ret = bd_delete_lv (priv, p_entry, lventry, path, &op_errno); - if (ret < 0) { - gf_asprintf (&error, "bd_delete_lv error, error:%d", op_errno); - goto out; - } - ret = 0; -out: - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - if (lventry) - BD_PUT_ENTRY (priv, lventry); - if (buffp) - GF_FREE (buffp); - if (error) - retval = dict_set_dynstr (output, "error", error); - return ret; -} - -int bd_xl_op_clone(bd_priv_t *priv, int subop, dict_t *input, dict_t *output) -{ - bd_entry_t *p_entry = NULL; - bd_entry_t *lventry = NULL; - int ret = -1; - char *error = NULL; - int retval = -1; - char *vg = NULL; - char *lv = NULL; - char *dest_lv = NULL; - char *size = NULL; - char *buff = NULL; - char *buffp = NULL; - char *path = NULL; - char *save = NULL; - char *npath = NULL; - - ret = dict_get_str (input, "path", &path); - ret = dict_get_str (input, "dest_lv", &dest_lv); - ret = dict_get_str (input, "size", &size); - - if (!path || !dest_lv) { - gf_asprintf (&error, "invalid arguments"); - ret = -1; - goto out; - } - - buff = buffp = gf_strdup (path); - - vg = strtok_r (buff, "/", &save); - lv = strtok_r (NULL, "/", &save); - if (!lv) { - gf_asprintf (&error, "lv not given %s", path); - ret = -1; - goto out; - } - - BD_ENTRY (priv, p_entry, vg); - if (!p_entry) { - gf_asprintf (&error, "%s does not exist", vg); - retval = dict_set_str (output, "error", error); - goto out; - } - - BD_ENTRY (priv, lventry, path); - if (!lventry) { - gf_asprintf (&error, "%s does not exist", path); - ret = -1; - goto out; - } - BD_PUT_ENTRY (priv, lventry); - lventry = NULL; - gf_asprintf (&npath, "/%s/%s", vg, dest_lv); - BD_ENTRY (priv, lventry, npath); - if (lventry) { - gf_asprintf (&error, "%s already exists", dest_lv); - BD_PUT_ENTRY (priv, lventry); - ret = -1; - goto out; - } - - if (subop == GF_BD_OP_SNAPSHOT_BD) { - if (!size) { - gf_asprintf (&error, "size not given"); - ret = -1; - goto out; - } - ret = bd_snapshot_lv (priv, p_entry, output, lv, dest_lv, - size, NULL); - } else - ret = bd_clone_lv (priv, p_entry, output, vg, lv, dest_lv, - NULL); - - if (ret) - goto out; - ret = 0; -out: - if (error) - retval = dict_set_dynstr (output, "error", error); - if (p_entry) - BD_PUT_ENTRY (priv, p_entry); - if (npath) - GF_FREE (npath); - if (buffp) - GF_FREE (buffp); - return ret; -} - -int32_t -bd_notify (xlator_t *this, dict_t *input, dict_t *output) -{ - int ret = -1; - int retval = -1; - int32_t bdop = -1; - bd_priv_t *priv = NULL; - char *error = NULL; - - priv = this->private; - VALIDATE_OR_GOTO (priv, out); - - ret = dict_get_int32 (input, "bd-op", (int32_t *)&bdop); - if (ret) { - gf_asprintf (&error, "no sub-op specified"); - goto out; - } - - switch (bdop) - { - case GF_BD_OP_NEW_BD: - ret = bd_xl_op_create (priv, input, output); - break; - case GF_BD_OP_DELETE_BD: - ret = bd_xl_op_delete (priv, input, output); - break; - case GF_BD_OP_CLONE_BD: - case GF_BD_OP_SNAPSHOT_BD: - ret = bd_xl_op_clone (priv, bdop, input, output); - break; - default: - gf_asprintf (&error, "invalid bd-op %d specified", bdop); - retval = dict_set_dynstr (output, "error", error); - goto out; - } - -out: - return ret; -} - -/** - * notify - when parent sends PARENT_UP, send CHILD_UP event from here - */ -int32_t -notify (xlator_t *this, - int32_t event, - void *data, - ...) -{ - va_list ap; - int ret = 0; - void *data2 = NULL; - dict_t *input = NULL; - dict_t *output = NULL; - - va_start (ap, data); - data2 = va_arg (ap, dict_t *); - va_end (ap); - - switch (event) - { - case GF_EVENT_PARENT_UP: - { - /* Tell the parent that bd xlator is up */ - default_notify (this, GF_EVENT_CHILD_UP, data); - } - break; - case GF_EVENT_TRANSLATOR_OP: - input = data; - output = data2; - if (!output) - output = dict_new (); - ret = bd_notify (this, input, output); - break; - - default: - break; - } - return ret; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - - -/** - * init - Constructs lists of LVs in the given VG - */ -int -init (xlator_t *this) -{ - bd_priv_t *_private = NULL; - int ret = 0; - char *vg = NULL; - char *device = NULL; - - LOCK_INIT (&inode_lk); - - bd_rootp = bd_entry_add_root (); - if (!bd_rootp) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: adding root entry failed"); - return -1; - } - - if (this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/bd cannot have subvolumes"); - ret = -1; - goto out; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "Volume is dangling. Please check the volume file."); - } - - ret = dict_get_str (this->options, "device", &device); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/bd does not specify backend"); - return -1; - } - - /* Now we support only LV device */ - if (strcasecmp (device, BACKEND_VG)) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: unknown %s backend %s", BD_XLATOR, device); - return -1; - } - - ret = dict_get_str (this->options, "export", &vg); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "FATAL: storage/bd does not specify volume groups"); - return -1; - } - - ret = 0; - _private = GF_CALLOC (1, sizeof(*_private), gf_bd_private); - if (!_private) - goto error; - - pthread_rwlock_init (&_private->lock, NULL); - this->private = (void *)_private; - _private->handle = NULL; - _private->vg = gf_strdup (vg); - if (!_private->vg) { - goto error; - } - - if (bd_build_lv_list (this->private, vg) < 0) - goto error; - -out: - return 0; -error: - BD_WR_LOCK (&_private->lock); - bd_entry_cleanup (); - lvm_quit (_private->handle); - if (_private->vg) - GF_FREE (_private->vg); - GF_FREE (_private); - return -1; -} - -void -fini (xlator_t *this) -{ - bd_priv_t *priv = this->private; - if (!priv) - return; - lvm_quit (priv->handle); - BD_WR_LOCK (&priv->lock); - bd_entry_cleanup (); - BD_UNLOCK (&priv->lock); - GF_FREE (priv->vg); - this->private = NULL; - GF_FREE (priv); - return; -} - -struct xlator_dumpops dumpops = { - .priv = bd_priv, - .inode = bd_inode, -}; - -struct xlator_fops fops = { - /* Not supported */ - .readlink = bd_readlink, - .mknod = bd_mknod, - .mkdir = bd_mkdir, - .rmdir = bd_rmdir, - .setxattr = bd_setxattr, - .fsetxattr = bd_fsetxattr, - .getxattr = bd_getxattr, - .fgetxattr = bd_fgetxattr, - .removexattr = bd_removexattr, - .fremovexattr= bd_fremovexattr, - .fsyncdir = bd_fsyncdir, - .lk = bd_lk, - .inodelk = bd_inodelk, - .finodelk = bd_finodelk, - .entrylk = bd_entrylk, - .fentrylk = bd_fentrylk, - .rchecksum = bd_rchecksum, - .xattrop = bd_xattrop, - - /* Supported */ - .lookup = bd_lookup, - .opendir = bd_opendir, - .readdir = bd_readdir, - .readdirp = bd_readdirp, - .stat = bd_stat, - .statfs = bd_statfs, - .open = bd_open, - .access = bd_access, - .flush = bd_flush, - .readv = bd_readv, - .fstat = bd_fstat, - .truncate = bd_truncate, - .ftruncate = bd_ftruncate, - .fsync = bd_fsync, - .writev = bd_writev, - .fstat = bd_fstat, - .create = bd_create, - .setattr = bd_setattr, - .fsetattr = bd_fsetattr, - .unlink = bd_unlink, - .link = bd_link, - .symlink = bd_symlink, - .rename = bd_rename, -}; - -struct xlator_cbks cbks = { - .releasedir = bd_releasedir, - .release = bd_release, -}; - -struct volume_options options[] = { - { .key = {"export"}, - .type = GF_OPTION_TYPE_STR}, - { .key = {"device"}, - .type = GF_OPTION_TYPE_STR}, - { .key = {NULL} } -}; diff --git a/xlators/storage/bd_map/src/bd_map.h b/xlators/storage/bd_map/src/bd_map.h deleted file mode 100644 index 1a0f4248e..000000000 --- a/xlators/storage/bd_map/src/bd_map.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - BD translator - Exports Block devices on server side as regular - files to client - - Copyright IBM, Corp. 2012 - - This file is part of GlusterFS. - - Author: - M. Mohan Kumar <mohan@in.ibm.com> - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _BD_MAP_H -#define _BD_MAP_H - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "xlator.h" -#include "mem-types.h" - -#define BD_XLATOR "block device mapper xlator" - -#define BACKEND_VG "vg" - -enum gf_bd_mem_types_ { - gf_bd_fd = gf_common_mt_end + 1, - gf_bd_private, - gf_bd_entry, - gf_bd_attr, - gf_bd_mt_end -}; - -/* - * Each BD/LV is represented by this data structure - * Usually root entry will have only children and there is no sibling for that - * All other entries may have children and/or sibling entries - * If an entry is a Volume Group it will have child (. & .. and Logical - * Volumes) and also other Volume groups will be a sibling for this - */ -typedef struct bd_entry { - struct list_head child; /* List to child */ - struct list_head sibling; /* List of siblings */ - struct bd_entry *parent;/* Parent of this node */ - struct bd_entry *link; /* Link to actual entry, if its . or .. */ - char name[NAME_MAX]; - struct iatt *attr; - int refcnt; - uint64_t size; - pthread_rwlock_t lock; -} bd_entry_t; - -/** - * bd_fd - internal structure common to file and directory fd's - */ -typedef struct bd_fd { - bd_entry_t *entry; - bd_entry_t *p_entry; /* Parent entry */ - int fd; - int32_t flag; -} bd_fd_t; - -typedef struct bd_priv { - lvm_t handle; - pthread_rwlock_t lock; - char *vg; -} bd_priv_t; - -#endif diff --git a/xlators/storage/bd_map/src/bd_map_help.c b/xlators/storage/bd_map/src/bd_map_help.c deleted file mode 100644 index 0613aa383..000000000 --- a/xlators/storage/bd_map/src/bd_map_help.c +++ /dev/null @@ -1,501 +0,0 @@ -/* - BD translator - Exports Block devices on server side as regular - files to client - - Copyright IBM, Corp. 2012 - - This file is part of GlusterFS. - - Author: - M. Mohan Kumar <mohan@in.ibm.com> - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#define __XOPEN_SOURCE 500 - -#include <libgen.h> -#include <time.h> -#include <lvm2app.h> - -#include "bd_map.h" -#include "bd_map_help.h" -#include "defaults.h" -#include "glusterfs3-xdr.h" - -#define CHILD_ENTRY(node) list_entry ((&node->child)->next, typeof(*node), \ - child) - -bd_entry_t *bd_rootp; -gf_lock_t inode_lk; -static uint64_t bd_entry_ino = 5000; /* Starting inode */ - -static void bd_entry_get_ino (uint64_t *inode) -{ - LOCK (&inode_lk); - { - *inode = bd_entry_ino++; - } - UNLOCK (&inode_lk); -} - -void bd_update_time (bd_entry_t *entry, int type) -{ - struct timespec ts; - - clock_gettime (CLOCK_REALTIME, &ts); - if (type == 0) { - entry->attr->ia_mtime = ts.tv_sec; - entry->attr->ia_mtime_nsec = ts.tv_nsec; - entry->attr->ia_atime = ts.tv_sec; - entry->attr->ia_atime_nsec = ts.tv_nsec; - } else if (type == 1) { - entry->attr->ia_mtime = ts.tv_sec; - entry->attr->ia_mtime_nsec = ts.tv_nsec; - } else { - entry->attr->ia_atime = ts.tv_sec; - entry->attr->ia_atime_nsec = ts.tv_nsec; - } -} - -static bd_entry_t *bd_entry_init (const char *name) -{ - bd_entry_t *bdentry; - - bdentry = GF_MALLOC (sizeof(bd_entry_t), gf_bd_entry); - if (!bdentry) - return NULL; - - bdentry->attr = GF_MALLOC (sizeof(struct iatt), gf_bd_attr); - if (!bdentry->attr) { - GF_FREE (bdentry); - return NULL; - } - - strcpy (bdentry->name, name); - INIT_LIST_HEAD (&bdentry->sibling); - INIT_LIST_HEAD (&bdentry->child); - bdentry->link = NULL; - bdentry->refcnt = 0; - return bdentry; -} - -static bd_entry_t *bd_entry_clone (bd_entry_t *orig, char *name) -{ - bd_entry_t *bdentry; - - bdentry = GF_MALLOC (sizeof(bd_entry_t), gf_bd_entry); - if (!bdentry) - return NULL; - - bdentry->attr = orig->attr; - - strcpy (bdentry->name, name); - INIT_LIST_HEAD (&bdentry->sibling); - INIT_LIST_HEAD (&bdentry->child); - bdentry->link = orig; - bdentry->refcnt = 0; - return bdentry; -} - -static void bd_entry_init_iattr (struct iatt *attr, int type) -{ - struct timespec ts = {0, }; - - clock_gettime (CLOCK_REALTIME, &ts); - attr->ia_dev = ia_makedev (0, 0); /* FIXME: */ - attr->ia_type = type; - attr->ia_prot = ia_prot_from_st_mode (0750); - attr->ia_nlink = 2; - attr->ia_uid = 0; - attr->ia_gid = 0; - attr->ia_rdev = ia_makedev (0, 0); - - attr->ia_size = 4096; /* FIXME */ - attr->ia_blksize = 4096; - attr->ia_blocks = 0; - - attr->ia_atime = ts.tv_sec; - attr->ia_atime_nsec = ts.tv_nsec; - attr->ia_mtime = ts.tv_sec; - attr->ia_mtime_nsec = ts.tv_nsec; - attr->ia_ctime = ts.tv_sec; - attr->ia_ctime_nsec = ts.tv_nsec; -} - -/* - * bd_entry_istat: Initialize iatt strucutre for a given path on success - */ -void bd_entry_istat (const char *path, struct iatt *attr, int type) -{ - struct stat stbuf = {0, }; - - if (stat (path, &stbuf) < 0) - bd_entry_init_iattr (attr, type); - else - iatt_from_stat (attr, &stbuf); - sprintf ((char *)attr->ia_gfid, "%lx", stbuf.st_ino); -} - -/* - * Adds the root entry and required entries - * ie header entry followed by . and .. entries - */ -bd_entry_t *bd_entry_add_root (void) -{ - bd_entry_t *bdentry = NULL; - bd_entry_t *h_entry = NULL; - bd_entry_t *d_entry = NULL; - bd_entry_t *dd_entry = NULL; - - bdentry = bd_entry_init ("/"); - if (!bdentry) - return NULL; - - bdentry->parent = bdentry; - - bd_entry_get_ino (&bdentry->attr->ia_ino); - sprintf ((char *)bdentry->attr->ia_gfid, "%ld", - bdentry->attr->ia_ino << 2); - bd_entry_init_iattr (bdentry->attr, IA_IFDIR); - - h_entry = bd_entry_clone (bdentry, ""); - bdentry->child.next = &h_entry->child; - bdentry->child.prev = &h_entry->child; - - d_entry = bd_entry_clone (bdentry, "."); - dd_entry = bd_entry_clone (bdentry, ".."); - - list_add_tail (&d_entry->sibling, &h_entry->sibling); - list_add_tail (&dd_entry->sibling, &h_entry->sibling); - return bdentry; -} - -bd_entry_t *bd_entry_add (bd_entry_t *parent, const char *name, - struct iatt *iattr, ia_type_t type) -{ - bd_entry_t *bdentry = NULL; - bd_entry_t *h_entry = NULL; - bd_entry_t *d_entry = NULL; - bd_entry_t *dd_entry = NULL; - bd_entry_t *sentry = NULL; - struct timespec ts = { 0, }; - - if (!parent) - parent = bd_rootp; - - if (type != IA_IFREG && type != IA_IFDIR) - return NULL; - - bdentry = bd_entry_init (name); - if (!bdentry) - return NULL; - - bdentry->parent = parent; - - iattr->ia_type = type; - - bd_entry_get_ino (&iattr->ia_ino); - if (IA_ISDIR(type)) { - h_entry = bd_entry_clone (bdentry, ""); - parent->attr->ia_nlink++; - bdentry->child.next = &h_entry->child; - bdentry->child.prev = &h_entry->child; - - d_entry = bd_entry_clone (bdentry, "."); - dd_entry = bd_entry_clone (bdentry, ".."); - - list_add_tail (&d_entry->sibling, &h_entry->sibling); - list_add_tail (&dd_entry->sibling, &h_entry->sibling); - } - memcpy (bdentry->attr, iattr, sizeof(*iattr)); - - clock_gettime (CLOCK_REALTIME, &ts); - parent->attr->ia_mtime = ts.tv_sec; - parent->attr->ia_mtime_nsec = ts.tv_nsec; - bdentry->size = iattr->ia_size; - - sentry = CHILD_ENTRY (parent); - list_add_tail (&bdentry->sibling, &sentry->sibling); - return bdentry; -} - -bd_entry_t *bd_entry_get_list (const char *name, bd_entry_t *parent) -{ - bd_entry_t *centry = NULL; - bd_entry_t *bdentry = NULL; - - if (!parent) - parent = bd_rootp; - - if (parent->child.next == &parent->child) - return NULL; - - centry = CHILD_ENTRY (parent); - if (!strcmp (centry->name, name)) - return centry; - - list_for_each_entry (bdentry, ¢ry->sibling, sibling) { - if (!strcmp (bdentry->name, name)) - return bdentry; - } - return NULL; -} - -/* FIXME: Do we need hashing here? */ -bd_entry_t *bd_entry_find_by_gfid (const char *path) -{ - bd_entry_t *h = NULL; - bd_entry_t *tmp = NULL; - bd_entry_t *tmp2 = NULL; - bd_entry_t *node = NULL; - bd_entry_t *cnode = NULL; - bd_entry_t *leaf = NULL; - char *gfid = NULL; - char *cp = NULL; - char *bgfid = NULL; - bd_entry_t *entry = NULL; - - gfid = GF_MALLOC (strlen(path) + 1, gf_common_mt_char); - sscanf (path, "<gfid:%s", gfid); - if (!gfid) - return NULL; - - cp = strchr(gfid, '>'); - *cp = '\0'; - - node = CHILD_ENTRY (bd_rootp); - - bgfid = GF_MALLOC (GF_UUID_BUF_SIZE, gf_common_mt_char); - if (!bgfid) - return NULL; - - list_for_each_entry_safe (h, tmp, &node->sibling, sibling) { - uuid_utoa_r (h->attr->ia_gfid, bgfid); - if (!h->link && !strcmp (gfid, bgfid)) { - entry = h; - goto out; - } - - /* if we have children for this node */ - if (h->child.next != &h->child) { - cnode = CHILD_ENTRY (h); - uuid_utoa_r (cnode->attr->ia_gfid, bgfid); - if (!cnode->link && !strcmp (gfid, bgfid)) { - entry = cnode; - goto out; - } - - list_for_each_entry_safe (leaf, tmp2, (&cnode->sibling), - sibling) { - uuid_utoa_r (leaf->attr->ia_gfid, bgfid); - if (!leaf->link && !strcmp (gfid, bgfid)) { - entry = leaf; - goto out; - } - - } - } - } -out: - if (bgfid) - GF_FREE (bgfid); - - return entry; -} - -/* Called with priv->bd_lock held */ -bd_entry_t *bd_entry_get (const char *name) -{ - bd_entry_t *pentry = NULL; - char *path = NULL; - char *comp = NULL; - char *save = NULL; - - if (!strncmp (name, "<gfid:", 5)) { - pentry = bd_entry_find_by_gfid (name); - if (pentry) - pentry->refcnt++; - return pentry; - } - - if (!strcmp (name, "/")) { - bd_rootp->refcnt++; - return bd_rootp; - } - - path = gf_strdup (name); - comp = strtok_r (path, "/", &save); - pentry = bd_entry_get_list (comp, NULL); - if (!pentry) - goto out; - while (comp) { - comp = strtok_r (NULL, "/", &save); - if (!comp) - break; - pentry = bd_entry_get_list (comp, pentry); - if (!pentry) - goto out; - } - - pentry->refcnt++; -out: - GF_FREE (path); - return pentry; -} - -int bd_entry_rm (const char *path) -{ - bd_entry_t *bdentry = NULL; - int ret = -1; - - bdentry = bd_entry_get (path); - if (!bdentry) - goto out; - - list_del_init (&bdentry->sibling); - list_del_init (&bdentry->child); - GF_FREE (bdentry); - - ret = 0; -out: - return ret; -} - - - -/* Called with priv->bd_lock held */ -void bd_entry_put (bd_entry_t *entry) -{ - entry->refcnt--; -} - -int bd_build_lv_list (bd_priv_t *priv, char *vg_name) -{ - struct dm_list *lv_dm_list = NULL; - struct lvm_lv_list *lv_list = NULL; - struct iatt iattr = {0, }; - char path[PATH_MAX] = {0, }; - vg_t vg = NULL; - bd_entry_t *vg_map = NULL; - bd_entry_t *bd = NULL; - int ret = -1; - const char *lv_name = NULL; - - priv->handle = lvm_init (NULL); - if (!priv->handle) { - gf_log (THIS->name, GF_LOG_CRITICAL, "FATAL: bd_init failed"); - return -1; - } - - BD_WR_LOCK (&priv->lock); - - vg = lvm_vg_open (priv->handle, vg_name, "r", 0); - if (!vg) { - gf_log (THIS->name, GF_LOG_CRITICAL, - "opening vg %s failed", vg_name); - goto out; - } - /* get list of LVs associated with this VG */ - lv_dm_list = lvm_vg_list_lvs (vg); - sprintf (path, "/dev/%s", vg_name); - bd_entry_istat (path, &iattr, IA_IFDIR); - vg_map = bd_entry_add (bd_rootp, vg_name, &iattr, - IA_IFDIR); - if (!vg_map) { - gf_log (THIS->name, GF_LOG_CRITICAL, - "bd_add_entry failed"); - goto out; - } - ret = 0; - if (!lv_dm_list) /* no lvs for this VG */ - goto out; - - dm_list_iterate_items (lv_list, lv_dm_list) { - if (!lv_list) - continue; - lv_name = lvm_lv_get_name (lv_list->lv); - /* snapshot%d is reserved name */ - if (!strncmp (lv_name, "snapshot", 8)) - continue; - /* get symbolic path for this LV */ - sprintf (path, "/dev/%s/%s", vg_name, lv_name); - bd_entry_istat (path, &iattr, IA_IFREG); - /* Make the file size equivalant to BD size */ - iattr.ia_size = lvm_lv_get_size (lv_list->lv); - /* got LV, add it to our tree */ - bd = bd_entry_add (vg_map, - lvm_lv_get_name (lv_list->lv), - &iattr, IA_IFREG); - if (bd == NULL) { - gf_log (THIS->name, GF_LOG_ERROR, - "bd_add_entry failed"); - goto out; - } - } -out: - if (vg) - lvm_vg_close (vg); - - BD_UNLOCK (&priv->lock); - return ret; -} - -/* - * Called with bd_lock held to cleanup entire list. If there was a - * reference to any one of the entry, nothing cleared. - * Return 0 on success -1 in case if there is a reference to the entry - */ -int bd_entry_cleanup (void) -{ - bd_entry_t *node = NULL; - bd_entry_t *tmp = NULL; - bd_entry_t *tmp2 = NULL; - bd_entry_t *cnode = NULL; - bd_entry_t *h = NULL; - bd_entry_t *leaf = NULL; - - if (!bd_rootp) - return 0; - - node = CHILD_ENTRY (bd_rootp); - if (node->refcnt) { - gf_log (THIS->name, GF_LOG_WARNING, - "entry %s is inuse\n", node->name); - return -1; - } - list_for_each_entry_safe (h, tmp, &node->sibling, sibling) { - /* if we have children for this node */ - if (h->child.next != &h->child) { - cnode = CHILD_ENTRY (h); - list_for_each_entry_safe (leaf, tmp2, (&cnode->sibling), - sibling) { - list_del_init (&leaf->sibling); - list_del_init (&leaf->child); - if (!leaf->link) - GF_FREE (leaf->attr); - GF_FREE (leaf); - } - list_del_init (&cnode->sibling); - list_del_init (&cnode->child); - if (!cnode->link) - GF_FREE (cnode->attr); - GF_FREE (cnode); - } - if (!h->link) - GF_FREE (h->attr); - GF_FREE (h); - } - GF_FREE (h); - GF_FREE (bd_rootp->attr); - GF_FREE (bd_rootp); - return 0; -} diff --git a/xlators/storage/bd_map/src/bd_map_help.h b/xlators/storage/bd_map/src/bd_map_help.h deleted file mode 100644 index 9fafa2d13..000000000 --- a/xlators/storage/bd_map/src/bd_map_help.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - BD translator - Exports Block devices on server side as regular - files to client. - - Copyright IBM, Corp. 2012 - - This file is part of GlusterFS. - - Author: - M. Mohan Kumar <mohan@in.ibm.com> - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ -#ifndef _BD_MAP_HELP_H -#define _BD_MAP_HELP_H - -#define BD_RD_LOCK(lock) \ - pthread_rwlock_rdlock (lock); - -#define BD_WR_LOCK(lock) \ - pthread_rwlock_wrlock (lock); - -#define BD_UNLOCK(lock) \ - pthread_rwlock_unlock (lock); - -#define BD_WR_ENTRY(priv, bdentry, path) \ - do { \ - BD_WR_LOCK (&priv->lock); \ - bdentry = bd_entry_get (path); \ - BD_UNLOCK (&priv->lock); \ - } while (0) - -#define BD_ENTRY(priv, bdentry, path) \ - do { \ - BD_RD_LOCK (&priv->lock); \ - bdentry = bd_entry_get (path); \ - BD_UNLOCK (&priv->lock); \ - } while (0) - -#define BD_PUT_ENTRY(priv, bdentry) \ - do { \ - BD_RD_LOCK (&priv->lock); \ - bd_entry_put (bdentry); \ - BD_UNLOCK (&priv->lock); \ - } while (0) - -#define BD_ENTRY_UPDATE_TIME(bdentry) bd_update_time (bdentry, 0) -#define BD_ENTRY_UPDATE_ATIME(bdentry) bd_update_time (bdentry, 2) -#define BD_ENTRY_UPDATE_MTIME(bdentry) bd_update_time (bdentry, 1) - -extern bd_entry_t *bd_rootp; -extern gf_lock_t inode_lk; - -void bd_entry_istat (const char *path, struct iatt *attr, int type); -bd_entry_t *bd_entry_add_root (void); -bd_entry_t *bd_entry_add (bd_entry_t *parent, const char *name, - struct iatt *iattr, ia_type_t type); -bd_entry_t *bd_entry_get_list (const char *name, bd_entry_t *parent); -bd_entry_t *bd_entry_get (const char *name); -void bd_entry_put (bd_entry_t *entry); -int bd_build_lv_list (bd_priv_t *priv, char *vg); -int bd_entry_cleanup (void); -void bd_update_time (bd_entry_t *entry, int type); -int bd_entry_rm (const char *path); - -#endif diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c index fad4a7df3..c3bbddd67 100644 --- a/xlators/storage/posix/src/posix-aio.c +++ b/xlators/storage/posix/src/posix-aio.c @@ -136,11 +136,7 @@ posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2) /* Hack to notify higher layers of EOF. */ - if (postbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + iov.iov_len) == postbuf.ia_size) - op_errno = ENOENT; - else if (offset > postbuf.ia_size) + if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size) op_errno = ENOENT; LOCK (&priv->lock); @@ -490,8 +486,8 @@ posix_aio_init (xlator_t *this) goto out; } - ret = pthread_create (&priv->aiothread, NULL, - posix_aio_thread, this); + ret = gf_thread_create (&priv->aiothread, NULL, + posix_aio_thread, this); if (ret != 0) { io_destroy (priv->ctxp); goto out; diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index f8284d335..e295f8850 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -22,6 +22,7 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -44,6 +45,7 @@ #include "timer.h" #include "glusterfs3-xdr.h" #include "hashfn.h" +#include "glusterfs-acl.h" #include <fnmatch.h> char *marker_xattrs[] = {"trusted.glusterfs.quota.*", @@ -883,8 +885,8 @@ posix_spawn_janitor_thread (xlator_t *this) LOCK (&priv->lock); { if (!priv->janitor_present) { - ret = pthread_create (&priv->janitor, NULL, - posix_janitor_thread_proc, this); + ret = gf_thread_create (&priv->janitor, NULL, + posix_janitor_thread_proc, this); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, @@ -900,6 +902,74 @@ unlock: UNLOCK (&priv->lock); } +static int +is_fresh_file (struct stat *stat) +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + + if ((stat->st_ctime >= (tv.tv_sec - 1)) + && (stat->st_ctime <= tv.tv_sec)) + return 1; + + return 0; +} + + +int +posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req) +{ + /* The purpose of this function is to prevent a race + where an inode creation FOP (like mkdir/mknod/create etc) + races with lookup in the following way: + + {create thread} | {lookup thread} + | + t0 + mkdir ("name") | + t1 + | posix_gfid_set ("name", 2); + t2 + posix_gfid_set ("name", 1); | + t3 + lstat ("name"); | lstat ("name"); + + In the above case mkdir FOP would have resulted with GFID 2 while + it should have been GFID 1. It matters in the case where GFID would + have gotten set to 1 on other subvolumes of replciate/distribute + + The "solution" here is that, if we detect lookup is attempting to + set a GFID on a file which is created very recently, but does not + yet have a GFID (i.e, between t1 and t2), then "fake" it as though + posix_gfid_heal was called at t0 instead. + */ + + uuid_t uuid_curr; + int ret = 0; + struct stat stat = {0, }; + + if (!xattr_req) + goto out; + + if (sys_lstat (path, &stat) != 0) + goto out; + + ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16); + if (ret != 16) { + if (is_fresh_file (&stat)) { + ret = -1; + errno = ENOENT; + goto out; + } + } + + ret = posix_gfid_set (this, path, loc, xattr_req); +out: + return ret; +} + + int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) { @@ -913,17 +983,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req) if (sys_lstat (path, &stat) != 0) goto out; - data = dict_get (xattr_req, "system.posix_acl_access"); + data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_access", + ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR, data->data, data->len, 0); if (ret != 0) goto out; } - data = dict_get (xattr_req, "system.posix_acl_default"); + data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR); if (data) { - ret = sys_lsetxattr (path, "system.posix_acl_default", + ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR, data->data, data->len, 0); if (ret != 0) goto out; @@ -944,8 +1014,8 @@ _handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v, if (!strcmp (GFID_XATTR_KEY, k) || !strcmp ("gfid-req", k) || - !strcmp ("system.posix_acl_default", k) || - !strcmp ("system.posix_acl_access", k) || + !strcmp (POSIX_ACL_DEFAULT_XATTR, k) || + !strcmp (POSIX_ACL_ACCESS_XATTR, k) || ZR_FILE_CONTENT_REQUEST(k)) { return 0; } @@ -1063,3 +1133,259 @@ posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd) return ret; } + +static void * +posix_health_check_thread_proc (void *data) +{ + xlator_t *this = NULL; + struct posix_private *priv = NULL; + uint32_t interval = 0; + int ret = -1; + struct stat sb = {0, }; + + this = data; + priv = this->private; + + /* prevent races when the interval is updated */ + interval = priv->health_check_interval; + if (interval == 0) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, " + "interval = %d seconds", interval); + + while (1) { + /* aborting sleep() is a request to exit this thread, sleep() + * will normally not return when cancelled */ + ret = sleep (interval); + if (ret > 0) + break; + + /* prevent thread errors while doing the health-check(s) */ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + + /* Do the health-check, it should be moved to its own function + * in case it gets more complex. */ + ret = stat (priv->base_path, &sb); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "stat() on %s returned: %s", priv->base_path, + strerror (errno)); + goto abort; + } + + pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL); + } + +out: + gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting"); + + LOCK (&priv->lock); + { + priv->health_check_active = _gf_false; + } + UNLOCK (&priv->lock); + + return NULL; + +abort: + /* health-check failed */ + gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down"); + xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this); + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM"); + kill (getpid(), SIGTERM); + } + + ret = sleep (30); + if (ret == 0) { + gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL"); + kill (getpid(), SIGKILL); + } + + return NULL; +} + +void +posix_spawn_health_check_thread (xlator_t *xl) +{ + struct posix_private *priv = NULL; + int ret = -1; + + priv = xl->private; + + LOCK (&priv->lock); + { + /* cancel the running thread */ + if (priv->health_check_active == _gf_true) { + pthread_cancel (priv->health_check); + priv->health_check_active = _gf_false; + } + + /* prevent scheduling a check in a tight loop */ + if (priv->health_check_interval == 0) + goto unlock; + + ret = gf_thread_create (&priv->health_check, NULL, + posix_health_check_thread_proc, xl); + if (ret < 0) { + priv->health_check_interval = 0; + priv->health_check_active = _gf_false; + gf_log (xl->name, GF_LOG_ERROR, + "unable to setup health-check thread: %s", + strerror (errno)); + goto unlock; + } + + /* run the thread detached, resources will be freed on exit */ + pthread_detach (priv->health_check); + priv->health_check_active = _gf_true; + } +unlock: + UNLOCK (&priv->lock); +} + +int +posix_fsyncer_pick (xlator_t *this, struct list_head *head) +{ + struct posix_private *priv = NULL; + int count = 0; + + priv = this->private; + pthread_mutex_lock (&priv->fsync_mutex); + { + while (list_empty (&priv->fsyncs)) + pthread_cond_wait (&priv->fsync_cond, + &priv->fsync_mutex); + + count = priv->fsync_queue_count; + priv->fsync_queue_count = 0; + list_splice_init (&priv->fsyncs, head); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return count; +} + + +void +posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync) +{ + struct posix_fd *pfd = NULL; + int ret = -1; + struct posix_private *priv = NULL; + + priv = this->private; + + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not get fdctx for fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, EINVAL); + return; + } + + if (do_fsync) { +#ifdef HAVE_FDATASYNC + if (stub->args.datasync) + ret = fdatasync (pfd->fd); + else +#endif + ret = fsync (pfd->fd); + } else { + ret = 0; + } + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "could not fstat fd(%s)", + uuid_utoa (stub->args.fd->inode->gfid)); + call_unwind_error (stub, -1, errno); + return; + } + + call_unwind_error (stub, 0, 0); +} + + +static void +posix_fsyncer_syncfs (xlator_t *this, struct list_head *head) +{ + call_stub_t *stub = NULL; + struct posix_fd *pfd = NULL; + int ret = -1; + + stub = list_entry (head->prev, call_stub_t, list); + ret = posix_fd_ctx_get (stub->args.fd, this, &pfd); + if (ret) + return; + +#ifdef GF_LINUX_HOST_OS + /* syncfs() is not "declared" in RHEL's glibc even though + the kernel has support. + */ +#include <sys/syscall.h> +#include <unistd.h> +#ifdef SYS_syncfs + syscall (SYS_syncfs, pfd->fd); +#else + sync(); +#endif +#else + sync(); +#endif +} + + +void * +posix_fsyncer (void *d) +{ + xlator_t *this = d; + struct posix_private *priv = NULL; + call_stub_t *stub = NULL; + call_stub_t *tmp = NULL; + struct list_head list; + int count = 0; + gf_boolean_t do_fsync = _gf_true; + + priv = this->private; + + for (;;) { + INIT_LIST_HEAD (&list); + + count = posix_fsyncer_pick (this, &list); + + usleep (priv->batch_fsync_delay_usec); + + gf_log (this->name, GF_LOG_DEBUG, + "picked %d fsyncs", count); + + switch (priv->batch_fsync_mode) { + case BATCH_NONE: + case BATCH_REVERSE_FSYNC: + break; + case BATCH_SYNCFS: + case BATCH_SYNCFS_SINGLE_FSYNC: + case BATCH_SYNCFS_REVERSE_FSYNC: + posix_fsyncer_syncfs (this, &list); + break; + } + + if (priv->batch_fsync_mode == BATCH_SYNCFS) + do_fsync = _gf_false; + else + do_fsync = _gf_true; + + list_for_each_entry_safe_reverse (stub, tmp, &list, list) { + list_del_init (&stub->list); + + posix_fsyncer_process (this, stub, do_fsync); + + if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC) + do_fsync = _gf_false; + } + } +} diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 93577cf54..fb45c7a67 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -23,6 +23,8 @@ #include <pthread.h> #include <ftw.h> #include <sys/stat.h> +#include <signal.h> +#include <sys/uio.h> #ifndef GF_BSD_HOST_OS #include <alloca.h> @@ -50,6 +52,7 @@ #include "glusterfs3-xdr.h" #include "hashfn.h" #include "posix-aio.h" +#include "glusterfs-acl.h" extern char *marker_xattrs[]; #define ALIGN_SIZE 4096 @@ -128,7 +131,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this, MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); if (uuid_is_null (loc->inode->gfid)) { - posix_gfid_set (this, real_path, loc, xdata); + posix_gfid_heal (this, real_path, loc, xdata); MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf); } @@ -561,6 +564,289 @@ out: return 0; } +static int32_t +posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + + ret = sys_fallocate(pfd->fd, flags, offset, len); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "fallocate (fstat) failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +char* +_page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); + if (!alloc_buf) + goto out; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; +out: + return alloc_buf; +} + +static int32_t +_posix_do_zerofill(int fd, off_t offset, size_t len, int o_direct) +{ + size_t num_vect = 0; + int32_t num_loop = 1; + int32_t idx = 0; + int32_t op_ret = -1; + int32_t vect_size = VECTOR_SIZE; + size_t remain = 0; + size_t extra = 0; + struct iovec *vector = NULL; + char *iov_base = NULL; + char *alloc_buf = NULL; + + if (len == 0) + return 0; + if (len < VECTOR_SIZE) + vect_size = len; + + num_vect = len / (vect_size); + remain = len % vect_size ; + if (num_vect > MAX_NO_VECT) { + extra = num_vect % MAX_NO_VECT; + num_loop = num_vect / MAX_NO_VECT; + num_vect = MAX_NO_VECT; + } + + vector = GF_CALLOC (num_vect, sizeof(struct iovec), + gf_common_mt_iovec); + if (!vector) + return -1; + if (o_direct) { + alloc_buf = _page_aligned_alloc(vect_size, &iov_base); + if (!alloc_buf) { + gf_log ("_posix_do_zerofill", GF_LOG_DEBUG, + "memory alloc failed, vect_size %d: %s", + vect_size, strerror(errno)); + GF_FREE(vector); + return -1; + } + } else { + iov_base = GF_CALLOC (vect_size, sizeof(char), + gf_common_mt_char); + if (!iov_base) { + GF_FREE(vector); + return -1; + } + } + + for (idx = 0; idx < num_vect; idx++) { + vector[idx].iov_base = iov_base; + vector[idx].iov_len = vect_size; + } + lseek(fd, offset, SEEK_SET); + for (idx = 0; idx < num_loop; idx++) { + op_ret = writev(fd, vector, num_vect); + if (op_ret < 0) + goto err; + } + if (extra) { + op_ret = writev(fd, vector, extra); + if (op_ret < 0) + goto err; + } + if (remain) { + vector[0].iov_len = remain; + op_ret = writev(fd, vector , 1); + if (op_ret < 0) + goto err; + } +err: + if (o_direct) + GF_FREE(alloc_buf); + else + GF_FREE(iov_base); + GF_FREE(vector); + return op_ret; +} + +static int32_t +posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, struct iatt *statpre, + struct iatt *statpost) +{ + struct posix_fd *pfd = NULL; + int32_t ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = posix_fd_ctx_get (fd, this, &pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "pfd is NULL from fd=%p", fd); + goto out; + } + + ret = posix_fdstat (this, pfd->fd, statpre); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "pre-operation fstat failed on fd = %p: %s", fd, + strerror (errno)); + goto out; + } + ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT); + if (ret < 0) { + ret = -errno; + gf_log(this->name, GF_LOG_ERROR, + "zerofill failed on fd %d length %ld %s", + pfd->fd, len, strerror(errno)); + goto out; + } + if (pfd->flags & (O_SYNC|O_DSYNC)) { + ret = fsync (pfd->fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + pfd->fd, strerror (errno)); + ret = -errno; + goto out; + } + } + + ret = posix_fdstat (this, pfd->fd, statpost); + if (ret == -1) { + ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "post operation fstat failed on fd=%p: %s", fd, + strerror (errno)); + goto out; + } + +out: + SET_TO_OLD_FS_ID (); + + return ret; +} + +static int32_t +_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size, + off_t offset, size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + if (keep_size) + flags = FALLOC_FL_KEEP_SIZE; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL); + return 0; +} + +static int32_t +posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret; + int32_t flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_fallocate(frame, this, fd, flags, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + +static int32_t +posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int32_t ret = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + + ret = posix_do_zerofill(frame, this, fd, offset, len, + &statpre, &statpost); + if (ret < 0) + goto err; + + STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL); + return 0; + +} + int32_t posix_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata) @@ -1969,11 +2255,7 @@ posix_readv (call_frame_t *frame, xlator_t *this, } /* Hack to notify higher layers of EOF. */ - if (stbuf.ia_size == 0) - op_errno = ENOENT; - else if ((offset + vec.iov_len) == stbuf.ia_size) - op_errno = ENOENT; - else if (offset > stbuf.ia_size) + if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size) op_errno = ENOENT; op_ret = vec.iov_len; @@ -2018,22 +2300,6 @@ err: return op_ret; } -char* -_page_aligned_alloc (size_t size, char **aligned_buf) -{ - char *alloc_buf = NULL; - char *buf = NULL; - - alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char); - if (!alloc_buf) - goto out; - /* page aligned buffer */ - buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); - *aligned_buf = buf; -out: - return alloc_buf; -} - int32_t __posix_writev (int fd, struct iovec *vector, int count, off_t startoff, int odirect) @@ -2082,6 +2348,48 @@ err: return op_ret; } +dict_t* +_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + inode_t *inode = NULL; + + if (fd) + inode = fd->inode; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: " + "fd: %p inode: %p gfid:%s", fd, inode?inode:0, + inode?uuid_utoa(inode->gfid):"N/A"); + goto out; + } + + if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_OPEN_FD_COUNT); + } + + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set " + "dictionary value for %s", uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } +out: + return rsp_xdata; +} int32_t posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, @@ -2096,6 +2404,9 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt preop = {0,}; struct iatt postop = {0,}; int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2117,6 +2428,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = pfd->fd; + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + /* The write_is_append check and write must happen + atomically. Else another write can overtake this + write after the check and get written earlier. + + So lock before preop-stat and unlock after write. + */ + locked = _gf_true; + LOCK(&fd->inode->lock); + } + op_ret = posix_fdstat (this, _fd, &preop); if (op_ret == -1) { op_errno = errno; @@ -2126,8 +2448,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } + if (locked) { + if (preop.ia_size == offset || (fd->flags & O_APPEND)) + is_append = 1; + } + op_ret = __posix_writev (_fd, vector, count, offset, (pfd->flags & O_DIRECT)); + + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + if (op_ret < 0) { op_errno = -op_ret; op_ret = -1; @@ -2143,6 +2476,7 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, UNLOCK (&priv->lock); if (op_ret >= 0) { + rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append); /* wiretv successful, we also need to get the stat of * the file we wrote to */ @@ -2172,9 +2506,16 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, out: + if (locked) { + UNLOCK (&fd->inode->lock); + locked = _gf_false; + } + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop, - NULL); + rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); return 0; } @@ -2301,6 +2642,33 @@ out: } +int +posix_batch_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync, dict_t *xdata) +{ + call_stub_t *stub = NULL; + struct posix_private *priv = NULL; + + priv = this->private; + + stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata); + if (!stub) { + STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + pthread_mutex_lock (&priv->fsync_mutex); + { + list_add_tail (&stub->list, &priv->fsyncs); + priv->fsync_queue_count++; + pthread_cond_signal (&priv->fsync_cond); + } + pthread_mutex_unlock (&priv->fsync_mutex); + + return 0; +} + + int32_t posix_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, dict_t *xdata) @@ -2312,6 +2680,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this, int ret = -1; struct iatt preop = {0,}; struct iatt postop = {0,}; + struct posix_private *priv = NULL; DECLARE_OLD_FS_ID_VAR; @@ -2327,6 +2696,12 @@ posix_fsync (call_frame_t *frame, xlator_t *this, goto out; #endif + priv = this->private; + if (priv->batch_fsync_mode && xdata && dict_get (xdata, "batch-fsync")) { + posix_batch_fsync (frame, this, fd, datasync, xdata); + return 0; + } + ret = posix_fd_ctx_get (fd, this, &pfd); if (ret < 0) { op_errno = -ret; @@ -2552,8 +2927,9 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, if (ret < 0) { op_ret = -1; op_errno = -ret; - gf_log (this->name, GF_LOG_WARNING, - "Failed to get rea filename (%s, %s): %s", + gf_log (this->name, (op_errno == ENOENT) ? + GF_LOG_DEBUG : GF_LOG_WARNING, + "Failed to get real filename (%s, %s): %s", loc->path, name, strerror (op_errno)); goto out; } @@ -2667,7 +3043,8 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, "supported (try remounting" " brick with 'user_xattr' " "flag)"); - } else if (op_errno == ENOATTR) { + } else if (op_errno == ENOATTR || + op_errno == ENODATA) { gf_log (this->name, GF_LOG_DEBUG, "No such attribute:%s for file %s", key, real_path); @@ -4187,6 +4564,27 @@ posix_set_owner (xlator_t *this, uid_t uid, gid_t gid) return ret; } + +static int +set_batch_fsync_mode (struct posix_private *priv, const char *str) +{ + if (strcmp (str, "none") == 0) + priv->batch_fsync_mode = BATCH_NONE; + else if (strcmp (str, "syncfs") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS; + else if (strcmp (str, "syncfs-single-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_SINGLE_FSYNC; + else if (strcmp (str, "syncfs-reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_SYNCFS_REVERSE_FSYNC; + else if (strcmp (str, "reverse-fsync") == 0) + priv->batch_fsync_mode = BATCH_REVERSE_FSYNC; + else + return -1; + + return 0; +} + + int reconfigure (xlator_t *this, dict_t *options) { @@ -4194,6 +4592,7 @@ reconfigure (xlator_t *this, dict_t *options) struct posix_private *priv = NULL; uid_t uid = -1; gid_t gid = -1; + char *batch_fsync_mode_str = NULL; priv = this->private; @@ -4201,6 +4600,18 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("brick-gid", gid, options, uint32, out); posix_set_owner (this, uid, gid); + GF_OPTION_RECONF ("batch-fsync-delay-usec", priv->batch_fsync_delay_usec, + options, uint32, out); + + GF_OPTION_RECONF ("batch-fsync-mode", batch_fsync_mode_str, + options, str, out); + + if (set_batch_fsync_mode (priv, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + GF_OPTION_RECONF ("linux-aio", priv->aio_configured, options, bool, out); @@ -4219,6 +4630,10 @@ reconfigure (xlator_t *this, dict_t *options) " fallback to <hostname>:<export>"); } + GF_OPTION_RECONF ("health-check-interval", priv->health_check_interval, + options, uint32, out); + posix_spawn_health_check_thread (this); + ret = 0; out: return ret; @@ -4248,6 +4663,7 @@ init (xlator_t *this) char *guuid = NULL; uid_t uid = -1; gid_t gid = -1; + char *batch_fsync_mode_str; dir_data = dict_get (this->options, "directory"); @@ -4400,7 +4816,7 @@ init (xlator_t *this) } } - size = sys_lgetxattr (dir_data->data, "system.posix_acl_access", + size = sys_lgetxattr (dir_data->data, POSIX_ACL_ACCESS_XATTR, NULL, 0); if ((size < 0) && (errno == ENOTSUP)) gf_log (this->name, GF_LOG_WARNING, @@ -4589,11 +5005,39 @@ init (xlator_t *this) " fallback to <hostname>:<export>"); } + _private->health_check_active = _gf_false; + GF_OPTION_INIT ("health-check-interval", + _private->health_check_interval, uint32, out); + if (_private->health_check_interval) + posix_spawn_health_check_thread (this); + pthread_mutex_init (&_private->janitor_lock, NULL); pthread_cond_init (&_private->janitor_cond, NULL); INIT_LIST_HEAD (&_private->janitor_fds); posix_spawn_janitor_thread (this); + + pthread_mutex_init (&_private->fsync_mutex, NULL); + pthread_cond_init (&_private->fsync_cond, NULL); + INIT_LIST_HEAD (&_private->fsyncs); + + ret = gf_thread_create (&_private->fsyncer, NULL, posix_fsyncer, this); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fsyncer thread" + " creation failed (%s)", strerror (errno)); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-mode", batch_fsync_mode_str, str, out); + + if (set_batch_fsync_mode (_private, batch_fsync_mode_str) != 0) { + gf_log (this->name, GF_LOG_ERROR, "Unknown mode string: %s", + batch_fsync_mode_str); + goto out; + } + + GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, + uint32, out); out: return ret; } @@ -4659,6 +5103,9 @@ struct xlator_fops fops = { .fxattrop = posix_fxattrop, .setattr = posix_setattr, .fsetattr = posix_fsetattr, + .fallocate = _posix_fallocate, + .discard = posix_discard, + .zerofill = posix_zerofill, }; struct xlator_cbks cbks = { @@ -4712,5 +5159,34 @@ struct volume_options options[] = { .description = "return glusterd's node-uuid in pathinfo xattr" " string instead of hostname" }, + { + .key = {"health-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "30", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds for a filesystem health check, " + "set to 0 to disable" + }, + { .key = {"batch-fsync-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "reverse-fsync", + .description = "Possible values:\n" + "\t- syncfs: Perform one syncfs() on behalf oa batch" + "of fsyncs.\n" + "\t- syncfs-single-fsync: Perform one syncfs() on behalf of a batch" + " of fsyncs and one fsync() per batch.\n" + "\t- syncfs-reverse-fsync: Preform one syncfs() on behalf of a batch" + " of fsyncs and fsync() each file in the batch in reverse order.\n" + " in reverse order.\n" + "\t- reverse-fsync: Perform fsync() of each file in the batch in" + " reverse order." + }, + { .key = {"batch-fsync-delay-usec"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "0", + .description = "Num of usecs to wait for aggregating fsync" + " requests", + }, { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 2cee1905f..3121db271 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -43,12 +43,15 @@ #include "timer.h" #include "posix-mem-types.h" #include "posix-handle.h" +#include "call-stub.h" #ifdef HAVE_LIBAIO #include <libaio.h> #include "posix-aio.h" #endif +#define VECTOR_SIZE 64 * 1024 /* vector size 64KB*/ +#define MAX_NO_VECT 1024 /** * posix_fd - internal structure common to file and directory fd's */ @@ -127,6 +130,27 @@ struct posix_private { /* node-uuid in pathinfo xattr */ gf_boolean_t node_uuid_pathinfo; + + pthread_t fsyncer; + struct list_head fsyncs; + pthread_mutex_t fsync_mutex; + pthread_cond_t fsync_cond; + int fsync_queue_count; + + enum { + BATCH_NONE = 0, + BATCH_SYNCFS, + BATCH_SYNCFS_SINGLE_FSYNC, + BATCH_REVERSE_FSYNC, + BATCH_SYNCFS_REVERSE_FSYNC + } batch_fsync_mode; + + uint32_t batch_fsync_delay_usec; + + /* seconds to sleep between health checks */ + uint32_t health_check_interval; + pthread_t health_check; + gf_boolean_t health_check_active; }; typedef struct { @@ -166,7 +190,7 @@ int posix_get_file_contents (xlator_t *this, uuid_t pargfid, int posix_set_file_contents (xlator_t *this, const char *path, char *key, data_t *value, int flags); int posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req); -int posix_gfid_heal (xlator_t *this, const char *path, dict_t *xattr_req); +int posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req); int posix_entry_create_xattr_set (xlator_t *this, const char *path, dict_t *dict); @@ -178,4 +202,7 @@ gf_boolean_t posix_special_xattr (char **pattern, char *key); void __posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags, off_t offset, size_t size); +void posix_spawn_health_check_thread (xlator_t *this); + +void *posix_fsyncer (void *); #endif /* _POSIX_H */ diff --git a/xlators/system/posix-acl/src/posix-acl-xattr.c b/xlators/system/posix-acl/src/posix-acl-xattr.c index 460daf985..cc0937c5e 100644 --- a/xlators/system/posix-acl/src/posix-acl-xattr.c +++ b/xlators/system/posix-acl/src/posix-acl-xattr.c @@ -65,7 +65,7 @@ posix_acl_from_xattr (xlator_t *this, const char *xattr_buf, int xattr_size) count = size / sizeof (*entry); header = (struct posix_acl_xattr_header *) (xattr_buf); - entry = (struct posix_acl_xattr_entry *) (header + 1); + entry = (struct posix_acl_xattr_entry *) (header + 1); if (header->version != htole32 (POSIX_ACL_VERSION)) return NULL; @@ -126,7 +126,7 @@ posix_acl_to_xattr (xlator_t *this, struct posix_acl *acl, char *xattr_buf, return size; header = (struct posix_acl_xattr_header *) (xattr_buf); - entry = (struct posix_acl_xattr_entry *) (header + 1); + entry = (struct posix_acl_xattr_entry *) (header + 1); ace = acl->entries; header->version = htole32 (POSIX_ACL_VERSION); diff --git a/xlators/system/posix-acl/src/posix-acl-xattr.h b/xlators/system/posix-acl/src/posix-acl-xattr.h index c4e90f5f9..2933c2057 100644 --- a/xlators/system/posix-acl/src/posix-acl-xattr.h +++ b/xlators/system/posix-acl/src/posix-acl-xattr.h @@ -11,24 +11,10 @@ #ifndef _POSIX_ACL_XATTR_H #define _POSIX_ACL_XATTR_H -#include <stdint.h> - #include "common-utils.h" #include "posix-acl.h" #include "glusterfs.h" - -#define POSIX_ACL_VERSION 2 - -struct posix_acl_xattr_entry { - uint16_t tag; - uint16_t perm; - uint32_t id; -}; - -struct posix_acl_xattr_header { - uint32_t version; - struct posix_acl_xattr_entry entries[]; -}; +#include "glusterfs-acl.h" struct posix_acl *posix_acl_from_xattr (xlator_t *this, const char *buf, int size); diff --git a/xlators/system/posix-acl/src/posix-acl.c b/xlators/system/posix-acl/src/posix-acl.c index 3e2f7f212..4658cad49 100644 --- a/xlators/system/posix-acl/src/posix-acl.c +++ b/xlators/system/posix-acl/src/posix-acl.c @@ -186,7 +186,7 @@ acl_permits (call_frame_t *frame, inode_t *inode, int want) ace = acl->entries; - if (acl->count > 3) + if (acl->count > POSIX_ACL_MINIMAL_ACE_COUNT) acl_present = 1; for (i = 0; i < acl->count; i++) { @@ -663,7 +663,12 @@ int posix_acl_ctx_update (inode_t *inode, xlator_t *this, struct iatt *buf) { struct posix_acl_ctx *ctx = NULL; + struct posix_acl *acl = NULL; + struct posix_ace *ace = NULL; + struct posix_ace *mask_ce = NULL; + struct posix_ace *group_ce = NULL; int ret = 0; + int i = 0; ctx = posix_acl_ctx_get (inode, this); if (!ctx) { @@ -676,7 +681,46 @@ posix_acl_ctx_update (inode_t *inode, xlator_t *this, struct iatt *buf) ctx->uid = buf->ia_uid; ctx->gid = buf->ia_gid; ctx->perm = st_mode_from_ia (buf->ia_prot, buf->ia_type); + + acl = ctx->acl_access; + if (!acl || !(acl->count > POSIX_ACL_MINIMAL_ACE_COUNT)) + goto unlock; + + /* This is an extended ACL (not minimal acl). In case we + are only refreshing from iatt and not ACL xattrs (for + e.g. from postattributes of setattr() call, we need to + update the corresponding ACEs as well. + */ + ace = acl->entries; + for (i = 0; i < acl->count; i++) { + switch (ace->tag) { + case POSIX_ACL_USER_OBJ: + ace->perm = (ctx->perm & S_IRWXU) >> 6; + break; + case POSIX_ACL_USER: + case POSIX_ACL_GROUP: + break; + case POSIX_ACL_GROUP_OBJ: + group_ce = ace; + break; + case POSIX_ACL_MASK: + mask_ce = ace; + break; + case POSIX_ACL_OTHER: + ace->perm = (ctx->perm & S_IRWXO); + break; + } + ace++; + } + + if (mask_ce) + mask_ce->perm = (ctx->perm & S_IRWXG) >> 3; + else if (group_ce) + group_ce->perm = (ctx->perm & S_IRWXG) >> 3; + else + ret = -1; } +unlock: UNLOCK(&inode->lock); out: return ret; diff --git a/xlators/system/posix-acl/src/posix-acl.h b/xlators/system/posix-acl/src/posix-acl.h index 6ac2c6a84..c5e01967a 100644 --- a/xlators/system/posix-acl/src/posix-acl.h +++ b/xlators/system/posix-acl/src/posix-acl.h @@ -11,57 +11,10 @@ #ifndef _POSIX_ACL_H #define _POSIX_ACL_H -#include <stdint.h> - #include "xlator.h" #include "common-utils.h" #include "byte-order.h" - - -#define POSIX_ACL_READ (0x04) -#define POSIX_ACL_WRITE (0x02) -#define POSIX_ACL_EXECUTE (0x01) - -#define POSIX_ACL_UNDEFINED_TAG (0x00) -#define POSIX_ACL_USER_OBJ (0x01) -#define POSIX_ACL_USER (0x02) -#define POSIX_ACL_GROUP_OBJ (0x04) -#define POSIX_ACL_GROUP (0x08) -#define POSIX_ACL_MASK (0x10) -#define POSIX_ACL_OTHER (0x20) - -#define POSIX_ACL_UNDEFINED_ID ((id_t)-1) - - -struct posix_ace { - uint16_t tag; - uint16_t perm; - uint32_t id; -}; - - -struct posix_acl { - int refcnt; - int count; - struct posix_ace entries[]; -}; - - -struct posix_acl_ctx { - uid_t uid; - gid_t gid; - mode_t perm; - struct posix_acl *acl_access; - struct posix_acl *acl_default; -}; - - -struct posix_acl_conf { - gf_lock_t acl_lock; - uid_t super_uid; - struct posix_acl *minimal_acl; -}; - +#include "glusterfs-acl.h" struct posix_acl *posix_acl_new (xlator_t *this, int entry_count); struct posix_acl *posix_acl_ref (xlator_t *this, struct posix_acl *acl); |
