diff options
author | Jeff Darcy <jdarcy@redhat.com> | 2013-12-11 16:26:25 -0500 |
---|---|---|
committer | Jeff Darcy <jdarcy@redhat.com> | 2013-12-11 16:26:25 -0500 |
commit | ef171ff2bfd114e46442441fbdeb692a416cc951 (patch) | |
tree | 27ac663045954c8efb145fbbae3df87d7bbfe5b3 /xlators/cluster/nsr-server/src/nsr.c | |
parent | 4bbbda2017be3cfae57c122d70d11c9470364f63 (diff) |
Roll-up patch for NSR so far.
Previous history:
https://forge.gluster.org/~jdarcy/glusterfs-core/glusterfs-nsr
Change-Id: I2b56328788753c6a74d9589815f2dd705ac9ce6a
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Diffstat (limited to 'xlators/cluster/nsr-server/src/nsr.c')
-rw-r--r-- | xlators/cluster/nsr-server/src/nsr.c | 682 |
1 files changed, 682 insertions, 0 deletions
diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c new file mode 100644 index 000000000..3707b3003 --- /dev/null +++ b/xlators/cluster/nsr-server/src/nsr.c @@ -0,0 +1,682 @@ +/* + Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "defaults.h" +#include "xlator.h" +#include "api/src/glfs.h" +#include "api/src/glfs-internal.h" +#include "run.h" +#include "common-utils.h" + + +#include "etcd-api.h" +#include "nsr-internal.h" +#include "../../nsr-recon/src/recon_driver.h" +#include "../../nsr-recon/src/recon_xlator.h" + + +#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd" +#define GLUSTERD_VOLUME_DIR_PREFIX "vols" +#define GLUSTERD_BRICK_INFO_DIR "bricks" + +#define NSR_FLUSH_INTERVAL 5 + +nsr_inode_ctx_t * +nsr_get_inode_ctx (xlator_t *this, inode_t *inode) +{ + uint64_t ctx_int = 0LL; + nsr_inode_ctx_t *ctx_ptr; + + if (__inode_ctx_get(inode,this,&ctx_int) == 0) { + ctx_ptr = (nsr_inode_ctx_t *)(long)ctx_int; + } + else { + ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), + gf_mt_nsr_inode_ctx_t); + if (ctx_ptr) { + ctx_int = (uint64_t)(long)ctx_ptr; + if (__inode_ctx_set(inode,this,&ctx_int) == 0) { + LOCK_INIT(&ctx_ptr->lock); + INIT_LIST_HEAD(&ctx_ptr->aqueue); + INIT_LIST_HEAD(&ctx_ptr->pqueue); + } + else { + GF_FREE(ctx_ptr); + ctx_ptr = NULL; + } + } + + } + + return ctx_ptr; +} + +nsr_fd_ctx_t * +nsr_get_fd_ctx (xlator_t *this, fd_t *fd) +{ + uint64_t ctx_int = 0LL; + nsr_fd_ctx_t *ctx_ptr; + + if (__fd_ctx_get(fd,this,&ctx_int) == 0) { + ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int; + } + else { + ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_nsr_fd_ctx_t); + if (ctx_ptr) { + if (__fd_ctx_set(fd,this,(uint64_t)ctx_ptr) == 0) { + INIT_LIST_HEAD(&ctx_ptr->dirty_list); + INIT_LIST_HEAD(&ctx_ptr->fd_list); + } + else { + GF_FREE(ctx_ptr); + ctx_ptr = NULL; + } + } + + } + + return ctx_ptr; +} + +void +nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local) +{ + fd_t *fd = local->fd; + nsr_fd_ctx_t *ctx_ptr; + nsr_dirty_list_t *dirty; + nsr_private_t *priv = this->private; + + /* + * TBD: don't do any of this for O_SYNC/O_DIRECT writes. + * Unfortunately, that optimization requires that we distinguish + * between writev and other "write" calls, saving the original flags + * and checking them in the callback. Too much work for too little + * gain right now. + */ + + LOCK(&fd->lock); + ctx_ptr = nsr_get_fd_ctx(this,fd); + dirty = GF_CALLOC(1,sizeof(*dirty),gf_mt_nsr_dirty_t); + if (ctx_ptr && dirty) { + gf_log (this->name, GF_LOG_TRACE, + "marking fd %p as dirty (%p)", fd, dirty); + /* TBD: fill dirty->id from what changelog gave us */ + list_add_tail(&dirty->links,&ctx_ptr->dirty_list); + if (list_empty(&ctx_ptr->fd_list)) { + /* Add a ref so _release doesn't get called. */ + ctx_ptr->fd = fd_ref(fd); + LOCK(&priv->dirty_lock); + list_add_tail (&ctx_ptr->fd_list, + &priv->dirty_fds); + UNLOCK(&priv->dirty_lock); + } + } + else { + gf_log (this->name, GF_LOG_ERROR, + "could not mark %p dirty", fd); + if (ctx_ptr) { + GF_FREE(ctx_ptr); + } + if (dirty) { + GF_FREE(dirty); + } + } + UNLOCK(&fd->lock); +} + +#define NSR_TERM_XATTR "trusted.nsr.term" +#define RECON_TERM_XATTR "trusted.nsr.recon-term" +#define RECON_INDEX_XATTR "trusted.nsr.recon-index" +#define NSR_REP_COUNT_XATTR "trusted.nsr.rep-count" +#include "nsr-cg.c" + +uint8_t +nsr_count_up_kids (nsr_private_t *priv) +{ + uint8_t retval = 0; + uint8_t i; + + for (i = 0; i < priv->n_children; ++i) { + if (priv->kid_state & (1 << i)) { + ++retval; + } + } + + return retval; +} + +/* + * The fsync machinery looks a lot like that for any write call, but there are + * some important differences that are easy to miss. First, we don't care + * about the xdata that shows whether the call came from a leader or + * reconciliation process. If we're the leader we fan out; if we're not we + * don't. Second, we don't wait for followers before we issue the local call. + * The code generation system could be updated to handle this, and still might + * if we need to implement other "almost identical" paths (e.g. for open), but + * a copy is more readable as long as it's just one. + */ + +int32_t +nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + nsr_local_t *local = frame->local; + gf_boolean_t unwind; + + LOCK(&frame->lock); + unwind = !--(local->call_count); + UNLOCK(&frame->lock); + + if (unwind) { + STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + } + return 0; +} + +int32_t +nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + nsr_dirty_list_t *dirty; + nsr_dirty_list_t *dtmp; + nsr_local_t *local = frame->local; + + list_for_each_entry_safe (dirty, dtmp, &local->qlinks, links) { + gf_log (this->name, GF_LOG_TRACE, + "sending post-op on %p (%p)", local->fd, dirty); + GF_FREE(dirty); + } + + return nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} + +int32_t +nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, + dict_t *xdata) +{ + nsr_private_t *priv = this->private; + nsr_local_t *local; + uint64_t ctx_int = 0LL; + nsr_fd_ctx_t *ctx_ptr; + xlator_list_t *trav; + + local = mem_get0(this->local_pool); + if (!local) { + STACK_UNWIND_STRICT(fsync,frame,-1,ENOMEM,NULL,NULL,xdata); + return 0; + } + INIT_LIST_HEAD(&local->qlinks); + frame->local = local; + + /* Move the dirty list from the fd to the fsync request. */ + LOCK(&fd->lock); + if (__fd_ctx_get(fd,this,&ctx_int) == 0) { + ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int; + list_splice_init (&ctx_ptr->dirty_list, + &local->qlinks); + } + UNLOCK(&fd->lock); + + /* Issue the local call. */ + local->call_count = priv->leader ? priv->n_children : 1; + STACK_WIND (frame, nsr_fsync_local_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, + fd, flags, xdata); + + /* Issue remote calls if we're the leader. */ + if (priv->leader) { + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, nsr_fsync_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, + fd, flags, xdata); + } + } + + return 0; +} + +int32_t +nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) +{ + dict_t *result; + uint8_t up; + nsr_private_t *priv = this->private; + + if (!priv->leader) { + STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL); + return 0; + } + + if (!name || (strcmp(name,NSR_REP_COUNT_XATTR) != 0)) { + STACK_WIND_TAIL (frame, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + return 0; + } + + result = dict_new(); + if (!result) { + goto dn_failed; + } + + up = nsr_count_up_kids(this->private); + if (dict_set_uint32(result,NSR_REP_COUNT_XATTR,up) != 0) { + goto dsu_failed; + } + + STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL); + dict_destroy(result); + return 0; + +dsu_failed: + dict_destroy(result); +dn_failed: + STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + +void +nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx) +{ + nsr_dirty_list_t *dirty; + nsr_dirty_list_t *dtmp; + + list_for_each_entry_safe (dirty, dtmp, &fd_ctx->dirty_list, links) { + gf_log (this->name, GF_LOG_TRACE, + "sending post-op on %p (%p)", fd_ctx->fd, dirty); + GF_FREE(dirty); + } + + INIT_LIST_HEAD(&fd_ctx->dirty_list); +} + +void * +nsr_flush_thread (void *ctx) +{ + xlator_t *this = ctx; + nsr_private_t *priv = this->private; + struct list_head dirty_fds; + nsr_fd_ctx_t *fd_ctx; + nsr_fd_ctx_t *fd_tmp; + + for (;;) { + /* + * We have to be very careful to avoid lock inversions here, so + * we can't just hold priv->dirty_lock while we take and + * release locks for each fd. Instead, we only hold dirty_lock + * at the beginning of each iteration, as we (effectively) make + * a copy of the current list head and then clear the original. + * This leads to four scenarios for adding the first entry to + * an fd and potentially putting it on the global list. + * + * (1) While we're asleep. No lock contention, it just gets + * added and will be processed on the next iteration. + * + * (2) After we've made a local copy, but before we've started + * processing that fd. The new entry will be added to the + * fd (under its lock), and we'll process it on the current + * iteration. + * + * (3) While we're processing the fd. They'll block on the fd + * lock, then see that the list is empty and put it on the + * global list. We'll process it here on the next + * iteration. + * + * (4) While we're working, but after we've processed that fd. + * Same as (1) as far as that fd is concerned. + */ + INIT_LIST_HEAD(&dirty_fds); + LOCK(&priv->dirty_lock); + list_splice_init(&priv->dirty_fds,&dirty_fds); + UNLOCK(&priv->dirty_lock); + + list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) { + LOCK(&fd_ctx->fd->lock); + nsr_flush_fd(this,fd_ctx); + list_del_init(&fd_ctx->fd_list); + UNLOCK(&fd_ctx->fd->lock); + fd_unref(fd_ctx->fd); + } + + sleep(NSR_FLUSH_INTERVAL); + } + + return NULL; +} + +int32_t +nsr_forget (xlator_t *this, inode_t *inode) +{ + uint64_t ctx = 0LL; + + if ((inode_ctx_del(inode,this,&ctx) == 0) && ctx) { + GF_FREE((void *)(long)ctx); + } + + return 0; +} + +int32_t +nsr_release (xlator_t *this, fd_t *fd) +{ + uint64_t ctx = 0LL; + + if ((fd_ctx_del(fd,this,&ctx) == 0) && ctx) { + GF_FREE((void *)(long)ctx); + } + + return 0; +} + +struct xlator_cbks cbks = { + .forget = nsr_forget, + .release = nsr_release, +}; + +int +nsr_reconfigure (xlator_t *this, dict_t *options) +{ + nsr_private_t *priv = this->private; + + GF_OPTION_RECONF ("leader", priv->leader, options, bool, err); + return 0; + +err: + return -1; +} + +int +nsr_get_child_index (xlator_t *this, xlator_t *kid) +{ + xlator_list_t *trav; + int retval = -1; + + for (trav = this->children; trav; trav = trav->next) { + ++retval; + if (trav->xlator == kid) { + return retval; + } + } + + return -1; +} + +/* + * Child notify handling is unreasonably FUBAR. Sometimes we'll get a + * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it. + * Other times we won't. Because it's effectively random (probably racy), we + * can't just maintain a count. We actually have to keep track of the state + * for each child separately, to filter out the bogus CHILD_DOWN events, and + * then generate counts on demand. + */ +int +nsr_notify (xlator_t *this, int event, void *data, ...) +{ + nsr_private_t *priv = this->private; + int index; + + switch (event) { + case GF_EVENT_CHILD_UP: + index = nsr_get_child_index(this,data); + if (index >= 0) { + priv->kid_state |= (1 << index); + gf_log (this->name, GF_LOG_INFO, + "got CHILD_UP for %s, now %u kids", + ((xlator_t *)data)->name, + nsr_count_up_kids(priv)); + } + break; + case GF_EVENT_CHILD_DOWN: + index = nsr_get_child_index(this,data); + if (index >= 0) { + priv->kid_state &= ~(1 << index); + gf_log (this->name, GF_LOG_INFO, + "got CHILD_DOWN for %s, now %u kids", + ((xlator_t *)data)->name, + nsr_count_up_kids(priv)); + } + break; + default: + ; + } + + return default_notify(this,event,data); +} + + +extern void *nsr_leader_thread (void *); + +int32_t +nsr_init (xlator_t *this) +{ + xlator_list_t *remote; + xlator_list_t *local; + nsr_private_t *priv = NULL; + xlator_list_t *trav; + pthread_t kid; + uuid_t tmp_uuid; + char *my_name = NULL, *recon_file = NULL, *recon_pid_file = NULL, *ptr = NULL; + char *volname; + extern xlator_t global_xlator; + glusterfs_ctx_t *oldctx = global_xlator.ctx; + runner_t runner = {0,}; + int32_t ret = -1; + struct stat buf; + + /* + * Any fop that gets special treatment has to be patched in here, + * because the compiled-in table is produced by the code generator and + * only contains generated functions. Note that we have to go through + * this->fops because of some dynamic-linking strangeness; modifying + * the static table doesn't work. + */ + this->fops->getxattr = nsr_getxattr_special; + this->fops->fsync = nsr_fsync; + + local = this->children; + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "no local subvolume"); + goto err; + } + + remote = local->next; + if (!remote) { + gf_log (this->name, GF_LOG_ERROR, "no remote subvolumes"); + goto err; + } + + this->local_pool = mem_pool_new (nsr_local_t, 128); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create nsr_local_t pool"); + goto err; + } + + priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t); + if (!priv) { + gf_log (this->name, GF_LOG_ERROR, "could not allocate priv"); + goto err; + } + + // set this so that unless leader election is done, IO is fenced + priv->fence_io = 1; + + for (trav = this->children; trav; trav = trav->next) { + ++(priv->n_children); + } + + LOCK_INIT(&priv->dirty_lock); + INIT_LIST_HEAD(&priv->dirty_fds); + + this->private = priv; + + GF_OPTION_INIT ("etcd-servers", priv->etcd_servers, str, err); + if (!priv->etcd_servers) { + gf_log (this->name, GF_LOG_ERROR, "etcd servers not generated. ???"); + goto err; + } + priv->vol_uuid = "temporary"; + uuid_generate(tmp_uuid); + priv->brick_uuid = strdup(uuid_utoa(tmp_uuid)); + priv->term_uuid = "nsr-term"; + gf_log (this->name, GF_LOG_INFO, + "brick_uuid = %s\n", priv->brick_uuid); + + GF_OPTION_INIT ("my-name", my_name, str, err); + if (!my_name) { + gf_log (this->name, GF_LOG_ERROR, "brick name not generated. ???"); + goto err; + } + GF_OPTION_INIT ("vol-name", volname, str, err); + if (!volname) { + gf_log (this->name, GF_LOG_ERROR, "vol name not generated. ???"); + goto err; + } + + recon_file = GF_CALLOC (1,PATH_MAX + strlen(my_name) + strlen("con") +1, gf_mt_nsr_private_t); + recon_pid_file = GF_CALLOC (1,PATH_MAX + strlen(my_name) + strlen("recon") +1, gf_mt_nsr_private_t); + if ((!recon_file) || (!recon_pid_file)) { + gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name"); + goto err; + } + ptr = strchr (my_name, '/'); + while (ptr) { + *ptr = '-'; + ptr = strchr (my_name, '/'); + } + + sprintf(recon_file,"/%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + volname, + GLUSTERD_BRICK_INFO_DIR); + strcat(recon_file, my_name); + strcat(recon_file, "-nsr-recon.vol"); + + sprintf(recon_pid_file,"/%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + volname, + "run"); + strcat(recon_pid_file, my_name); + strcat(recon_pid_file, "-recon.pid"); + + priv->vol_file = GF_CALLOC (1,PATH_MAX + strlen(my_name) + strlen("con") +1, gf_mt_nsr_private_t); + if (!priv->vol_file) { + gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name"); + goto err; + } + sprintf(priv->vol_file,"%s/%s/%s/%s/", + GLUSTERD_DEFAULT_WORKDIR, + GLUSTERD_VOLUME_DIR_PREFIX, + volname, + GLUSTERD_BRICK_INFO_DIR); + strcat(priv->vol_file, "con:"); + strcat(priv->vol_file, my_name); + + if (pthread_create(&kid,NULL,nsr_flush_thread,this) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not start flush thread"); + /* TBD: treat this as a fatal error? */ + } + + // Start the recon process. Then start the leader thread. + /* + * REVIEW + * Logs belong in /var/log not /tmp. + */ + if (!stat(priv->vol_file, &buf)) { + runinit (&runner); + runner_add_args(&runner, "/usr/local/sbin/glusterfs", + "-f", recon_file, + "-p", recon_pid_file, + "-l", "/tmp/reconciliation.log", + NULL); + ret = runner_run (&runner); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "could not exec reconciliation process " ); + goto err; + } + + // TBD - convert this to make sure recon process runs + sleep(2); + priv->nsr_recon_start = _gf_true; + } + + + (void)pthread_create(&kid,NULL,nsr_leader_thread,this); + while (priv->leader_inited == 0) { + sleep(1); + } + /* + * Calling glfs_new changes old->ctx, even if THIS still points + * to global_xlator. That causes problems later in the main + * thread, when gf_log_dump_graph tries to use the FILE after + * we've mucked with it and gets a segfault in __fprintf_chk. + * We can avoid all that by undoing the damage before we + * continue. + */ + global_xlator.ctx = oldctx; + + return 0; + +err: + if (priv) { + GF_FREE(priv); + } + return -1; +} + + +void +nsr_fini (xlator_t *this) +{ +} + +class_methods_t class_methods = { + .init = nsr_init, + .fini = nsr_fini, + .reconfigure = nsr_reconfigure, + .notify = nsr_notify, +}; + +struct volume_options options[] = { + { .key = {"leader"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "false", + .description = "Start in the leader role. This is only for " + "bootstrapping the code, and should go away when we " + "have real leader election." + }, + { .key ={"vol-name"}, + .type = GF_OPTION_TYPE_STR, + .description = "volume name" + }, + { .key = {"my-name"}, + .type = GF_OPTION_TYPE_STR, + .description = "brick name in form of host:/path" + }, + { .key = {"etcd-servers"}, + .type = GF_OPTION_TYPE_STR, + .description = "list of comma seperated etc servers" + }, + { .key = {NULL} }, +}; |