summary | refs | log | tree | commit | diff | stats
path: root/xlators/mgmt/glusterd
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/mgmt/glusterd')
-rw-r--r-- xlators/mgmt/glusterd/src/Makefile.am 4
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-etcd.c 87
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-etcd.h 23
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-handler.c 12
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-sm.c 11
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-utils.c 2
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volgen.c 351
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volgen.h 4
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-volume-set.c 19
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd.c 16
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd.h 21
11 files changed, 537 insertions, 13 deletions
diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am
index b89ec6ddc..9b33edf4d 100644
--- a/xlators/mgmt/glusterd/src/Makefile.am
+++ b/xlators/mgmt/glusterd/src/Makefile.am
@@ -13,7 +13,7 @@ glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \
glusterd-volume-ops.c glusterd-brick-ops.c glusterd-mountbroker.c \
glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c \
glusterd-locks.c glusterd-snapshot.c glusterd-mgmt-handler.c \
- glusterd-mgmt.c
+ glusterd-mgmt.c glusterd-etcd.c
glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/rpc/xdr/src/libgfxdr.la \
@@ -24,7 +24,7 @@ noinst_HEADERS = glusterd.h glusterd-utils.h glusterd-op-sm.h \
glusterd-sm.h glusterd-store.h glusterd-mem-types.h \
glusterd-pmap.h glusterd-volgen.h glusterd-mountbroker.h \
glusterd-syncop.h glusterd-hooks.h glusterd-locks.h \
- glusterd-mgmt.h
+ glusterd-mgmt.h glusterd-etcd.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(rpclibdir) -I$(CONTRIBDIR)/rbtree \
diff --git a/xlators/mgmt/glusterd/src/glusterd-etcd.c b/xlators/mgmt/glusterd/src/glusterd-etcd.c
new file mode 100644
index 000000000..656ea3b9b
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-etcd.c
@@ -0,0 +1,87 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "glusterfs.h"
+#include "run.h"
+#include "glusterd-etcd.h"
+
+#define GLUSTERD_ETCD_DIR "/var/lib/glusterd/etcd"
+#define GLUSTERD_ETCD_CMD "/root/etcd/bin/etcd"
+
+/*
+ * Launch an etcd child process for this node.
+ *
+ * this_host  - value handed to etcd as "-name" (the callers in this patch
+ *              pass uuid_utoa(MY_UUID)).  Must not be NULL.
+ * other_host - an existing member to join through; it is passed as
+ *              "-peers <other_host>:7001".  NULL starts etcd standalone.
+ *
+ * etcd is told to bind its client address to <hostname>:4001 and its peer
+ * address to <hostname>:7001, where <hostname> comes from gethostname(),
+ * and to keep its data in GLUSTERD_ETCD_DIR (created here if missing).
+ *
+ * Returns the PID of the child on success, -1 on failure.
+ */
+pid_t
+start_etcd (char *this_host, char *other_host)
+{
+        runner_t        runner;
+        char            me[256];
+
+        /*
+         * runner_add_args() below takes a NULL-terminated argument list,
+         * so a NULL name would silently cut the list short right after
+         * "-name" and shift every following option.
+         */
+        if (!this_host) {
+                gf_log (__func__, GF_LOG_ERROR, "no etcd node name given");
+                return -1;
+        }
+
+        if (gethostname (me, sizeof (me) - 1) != 0) {
+                gf_log (__func__, GF_LOG_ERROR, "gethostname failed?!?");
+                return -1;
+        }
+        /* gethostname need not terminate a truncated name. */
+        me[sizeof (me) - 1] = '\0';
+
+        if ((mkdir (GLUSTERD_ETCD_DIR, 0700) < 0) && (errno != EEXIST)) {
+                gf_log (__func__, GF_LOG_ERROR,
+                        "failed to create %s", GLUSTERD_ETCD_DIR);
+                return -1;
+        }
+
+        runinit (&runner);
+        runner_add_args (&runner, GLUSTERD_ETCD_CMD,
+                         "-name", this_host,
+                         "-data-dir", GLUSTERD_ETCD_DIR,
+                         "-bind-addr", NULL);
+        runner_argprintf (&runner, "%s:4001", me);
+        runner_add_arg (&runner, "-peer-addr");
+        runner_argprintf (&runner, "%s:7001", me);
+        if (other_host) {
+                runner_add_arg (&runner, "-peers");
+                runner_argprintf (&runner, "%s:7001", other_host);
+                gf_log (__func__, GF_LOG_INFO, "starting etcd via %s",
+                        other_host);
+        } else {
+                gf_log (__func__, GF_LOG_INFO, "starting etcd standalone");
+        }
+
+        /*
+         * Runner_run would wait for it.  Runner_run_nowait would not wait,
+         * but would detach it so thoroughly that it won't die when we do.
+         * Also, runner->chpid would be the PID of the transient middle
+         * process, not the one we might actually need to kill later.  This
+         * seems to do exactly what we need.
+         */
+        if (runner_start (&runner) != 0) {
+                gf_log (__func__, GF_LOG_ERROR,
+                        "failed to start %s", GLUSTERD_ETCD_CMD);
+                /*
+                 * Finish the runner so that a child whose exec failed is
+                 * reaped and the argument vector is released.
+                 * NOTE(review): this relies on runner_end() from run.h
+                 * being safe after a failed runner_start(), as it is in
+                 * libglusterfs' own runner_run(); confirm for this tree.
+                 */
+                (void) runner_end (&runner);
+                return -1;
+        }
+
+        /*
+         * On success the runner is deliberately left unfinished: the child
+         * has to keep running and its PID is handed back to the caller,
+         * who stops it later with stop_etcd().
+         */
+        return runner.chpid;
+}
+
+/*
+ * Kill the etcd child identified by pid with SIGKILL and wait for it to be
+ * collected.  A non-positive pid (such as the -1 start_etcd returns on
+ * failure) is ignored.
+ */
+void
+stop_etcd (pid_t pid)
+{
+        if (pid <= 0) {
+                return;
+        }
+
+        gf_log (__func__, GF_LOG_INFO, "killing etcd %d", pid);
+        /* Best effort: neither result is examined. */
+        (void) kill (pid, SIGKILL);
+        (void) waitpid (pid, NULL, 0);
+}
+
+/*
+ * Remove etcd's data directory (GLUSTERD_ETCD_DIR) and everything in it by
+ * running "rm -rf".  Still best effort, but a failure is now logged: a
+ * directory that survives would let the next etcd start from a stale
+ * configuration.
+ */
+void
+nuke_etcd_dir (void)
+{
+        if (runcmd ("rm", "-rf", GLUSTERD_ETCD_DIR, NULL) != 0) {
+                gf_log (__func__, GF_LOG_WARNING,
+                        "failed to remove %s", GLUSTERD_ETCD_DIR);
+        }
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-etcd.h b/xlators/mgmt/glusterd/src/glusterd-etcd.h
new file mode 100644
index 000000000..9459f6bbd
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-etcd.h
@@ -0,0 +1,23 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_ETCD_H_
+#define _GLUSTERD_ETCD_H_
+
+#include <sys/types.h>
+#include "glusterfs.h"
+
+/*
+ * Start an etcd child process named this_host.  When other_host is non-NULL
+ * the new instance is pointed at <other_host>:7001 as its peer; when it is
+ * NULL etcd is started standalone.  Returns the child's PID, or -1 on
+ * failure.
+ */
+pid_t start_etcd (char *this_host, char *other_host);
+
+/*
+ * Kill (SIGKILL) and reap the etcd process with the given PID.  Does
+ * nothing when pid is not positive.
+ */
+void stop_etcd (pid_t pid);
+
+/* Recursively remove etcd's data directory. */
+void nuke_etcd_dir (void);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index b8202b233..e0373c774 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -50,6 +50,7 @@
#include "globals.h"
#include "glusterd-syncop.h"
+#include "glusterd-etcd.h"
#ifdef HAVE_BD_XLATOR
#include <lvm2app.h>
@@ -2631,7 +2632,18 @@ __glusterd_handle_probe_query (rpcsvc_request_t *req)
gf_log ("", GF_LOG_ERROR, "Failed to add peer %s",
remote_hostname);
rsp.op_errno = GF_PROBE_ADD_FAILED;
+ goto respond;
}
+ gf_log (THIS->name, GF_LOG_INFO,
+ "joining, should point etcd at %s", remote_hostname);
+ /*
+ * We should have started a standalone etcd before. Now we
+ * need a new one, with a new config.
+ */
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
+ conf->etcd_pid = start_etcd (uuid_utoa(MY_UUID),
+ remote_hostname);
}
respond:
diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.c b/xlators/mgmt/glusterd/src/glusterd-sm.c
index e6ad34335..7a8b2c94f 100644
--- a/xlators/mgmt/glusterd/src/glusterd-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-sm.c
@@ -34,6 +34,7 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "glusterd-store.h"
+#include "glusterd-etcd.h"
static struct list_head gd_friend_sm_queue;
@@ -605,6 +606,9 @@ glusterd_ac_handle_friend_remove_req (glusterd_friend_sm_event_t *event,
"Peer detach cleanup was not successful");
ret = 0;
}
+ gf_log (THIS->name, GF_LOG_INFO, "detached, stopping etcd");
+ stop_etcd(priv->etcd_pid);
+ nuke_etcd_dir();
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
@@ -733,6 +737,13 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
peerinfo->hostname, ev_ctx->port,
op_ret, op_errno);
+ // apply a deterministic function to decide via whom we should join the cluster
+ if (strcmp(peerinfo->hostname, ev_ctx->hostname) > 0) {
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
+ conf->etcd_pid = start_etcd (uuid_utoa(MY_UUID), peerinfo->hostname);
+ }
+
out:
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 896827244..7883a98bf 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -1905,6 +1905,8 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
if (ret == 0) {
glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
(void) glusterd_brick_unlink_socket_file (volinfo, brickinfo);
+ GLUSTERD_GET_BRICK_RECON_PIDFILE (pidfile, volinfo, brickinfo, priv);
+ ret = glusterd_service_stop ("recon", pidfile, SIGTERM, _gf_false);
}
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index ae095bf7c..f42d596ba 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -103,7 +103,6 @@ xlator_instantiate_va (const char *type, const char *format, va_list arg)
return NULL;
}
-#ifdef __not_used_as_of_now_
static xlator_t *
xlator_instantiate (const char *type, const char *format, ...)
{
@@ -116,7 +115,6 @@ xlator_instantiate (const char *type, const char *format, ...)
return xl;
}
-#endif
static int
volgen_xlator_link (xlator_t *pxl, xlator_t *cxl)
@@ -1445,6 +1443,312 @@ server_spec_extended_option_handler (volgen_graph_t *graph,
static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo);
+/*
+ * Add an unlinked protocol/client xlator named "<volname>-client-<index>"
+ * to graph and point it at the brick described by peer (remote-host is the
+ * peer's hostname, remote-subvolume its path).
+ *
+ * Returns the new xlator, or NULL if it could not be created or any of its
+ * options could not be set.
+ */
+xlator_t *
+add_one_peer (volgen_graph_t *graph, glusterd_brickinfo_t *peer,
+              char *volname, uint16_t index)
+{
+        xlator_t        *client = NULL;
+
+        client = volgen_graph_add_nolink (graph, "protocol/client",
+                                          "%s-client-%u", volname,
+                                          index);
+        if (client == NULL) {
+                return NULL;
+        }
+
+        /*
+         * TBD: figure out where to get the proper transport list
+         * TBD: deal with RDMA, SSL
+         *
+         * The options are applied in order; the first failure aborts.
+         */
+        if (xlator_set_option (client, "transport-type", "socket") ||
+            xlator_set_option (client, "remote-host", peer->hostname) ||
+            xlator_set_option (client, "remote-subvolume", peer->path)) {
+                return NULL;
+        }
+
+        return client;
+}
+
+/*
+ * Walk the volume's bricks in list order and partition them into
+ * consecutive groups of volinfo->replica_count bricks.  Every brick gets
+ * its group number in brickinfo->group, and all bricks of one group share
+ * a freshly generated UUID in brickinfo->nsr_uuid.
+ *
+ * NOTE(review): the UUIDs come from uuid_generate() on every call, so they
+ * presumably differ between calls - confirm that is acceptable for the
+ * consumers of "subvol-uuid".
+ */
+void
+assign_groups (glusterd_volinfo_t *volinfo)
+{
+        glusterd_brickinfo_t    *brick = NULL;
+        uuid_t                   set_uuid;
+        uint16_t                 set_no = 0;
+        int                      filled = 0;
+
+        list_for_each_entry (brick, &volinfo->bricks, brick_list) {
+                /* First brick of a new group: mint the group's UUID. */
+                if (filled == 0) {
+                        uuid_generate (set_uuid);
+                }
+
+                brick->group = set_no;
+                uuid_copy (brick->nsr_uuid, set_uuid);
+
+                filled++;
+                if (filled < volinfo->replica_count) {
+                        continue;
+                }
+
+                /* Group complete; the next brick opens a new one. */
+                filled = 0;
+                set_no++;
+        }
+}
+
+/*
+ * Append src to the NUL-terminated string held in dst, whose capacity is
+ * size bytes.  Returns 0 on success, or -1 (leaving dst untouched) when
+ * the result would not fit.
+ */
+static int
+nsr_append (char *dst, size_t size, const char *src)
+{
+        size_t  used = strlen (dst);
+        size_t  need = strlen (src);
+
+        if (used >= size || need >= size - used)
+                return -1;
+
+        memcpy (dst + used, src, need + 1);
+        return 0;
+}
+
+/*
+ * Record one replica-group peer in the two comma-separated lists:
+ * "host:path," is appended to remote_names and ",host" to hosts.
+ * Returns 0 on success, -1 if either list has no room left.
+ */
+static int
+nsr_record_peer (char *remote_names, size_t names_size,
+                 char *hosts, size_t hosts_size,
+                 glusterd_brickinfo_t *peer)
+{
+        if (nsr_append (remote_names, names_size, peer->hostname) ||
+            nsr_append (remote_names, names_size, ":") ||
+            nsr_append (remote_names, names_size, peer->path) ||
+            nsr_append (remote_names, names_size, ",") ||
+            nsr_append (hosts, hosts_size, ",") ||
+            nsr_append (hosts, hosts_size, peer->hostname)) {
+                gf_log ("glusterd", GF_LOG_ERROR,
+                        "too many or too long peers for the NSR peer lists "
+                        "(at %s:%s)", peer->hostname, peer->path);
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * Put a "cluster/nsr" xlator on top of a brick graph and, when the
+ * "cluster.nsr.recon" volume option is enabled, write the extra volfiles
+ * used for reconciliation.
+ *
+ * graph              - brick graph being built; the NSR xlator becomes its
+ *                      first xlator.
+ * volname            - volume name, used for xlator names and "vol-name".
+ * brickinfo          - the brick this graph is for.
+ * volinfo            - the volume; its brick list is searched for the
+ *                      bricks that share brickinfo->group.
+ * changelog_basepath - value for the recon xlator's "changelog-dir".
+ *
+ * The NSR xlator is named "<volname>-nsr", gets the graph's current first
+ * xlator as a child plus one protocol/client per other brick of the same
+ * group, and is marked "leader" when the brick preceding this one in the
+ * list is absent or belongs to another group.  "etcd-servers" is set to
+ * the comma-separated hostnames of all group members.
+ *
+ * With reconciliation enabled three volfiles are written under the
+ * volume's brick-info directory: "<host>:<path>-nsr-recon.vol" (nsr_recon
+ * plus a protocol/server on a port taken from a process-wide counter that
+ * starts at 27000), "con:<host>:<path>" (client for that server) and
+ * "data:<host>:<path>" (client for the brick itself), where <path> is the
+ * brick path with '/' replaced by '-'.
+ *
+ * Returns 0 on success, -1 on any failure.
+ *
+ * NOTE(review): the temporary graphs built in ng are not released here;
+ * confirm whether anything else is responsible for that.
+ */
+int
+add_nsr_stuff (volgen_graph_t *graph, char *volname,
+               glusterd_brickinfo_t *brickinfo, glusterd_volinfo_t *volinfo,
+               char *changelog_basepath)
+{
+        static uint32_t          nsr_port = 27000;
+        xlator_t                *me = NULL;
+        xlator_t                *kid = NULL;
+        xlator_t                *xl = NULL;
+        glusterd_brickinfo_t    *peer = NULL;
+        glusterd_conf_t         *priv = NULL;
+        volgen_graph_t           ng = {0,};
+        gf_boolean_t             enable_recon = _gf_false;
+        uint16_t                 index = 0;
+        uint32_t                 replica_group_size = 1;
+        size_t                   names_len = 0;
+        int                      len = 0;
+        int                      ret = -1;
+        char                    *leader_opt = NULL;
+        char                    *username = NULL;
+        char                    *password = NULL;
+        char                    *slash = NULL;
+        char                    *host_tag = NULL;
+        char                    *path_tag = NULL;
+        char                     num[32];
+        char                     transt[16] = {0,};
+        char                     path[PATH_MAX] = {0,};
+        char                     filepath[PATH_MAX] = {0,};
+        char                     lp[PATH_MAX] = {0,};
+        /* "auth.addr." + lp + ".allow" */
+        char                     auth[PATH_MAX + 32];
+        char                     dst[NSR_MAX_PATH_SIZE];
+        char                     c_d[NSR_MAX_PATH_SIZE];
+        char                     local_name[NSR_MAX_PATH_SIZE];
+        char                     hosts[NSR_MAX_PATH_SIZE *
+                                       NSR_MAX_REPLICA_GROUP_SIZE];
+        char                     remote_names[NSR_MAX_REPLICA_GROUP_SIZE *
+                                              NSR_MAX_PATH_SIZE];
+
+        if (glusterd_volinfo_get_boolean (volinfo, "cluster.nsr.recon") > 0) {
+                enable_recon = _gf_true;
+        }
+
+        priv = THIS->private;
+        remote_names[0] = '\0';
+
+        /* Host and flattened brick path, used to name the volfiles. */
+        host_tag = gf_strdup (brickinfo->hostname);
+        path_tag = gf_strdup (brickinfo->path);
+        if (!host_tag || !path_tag)
+                goto out;
+        for (slash = strchr (path_tag, '/'); slash;
+             slash = strchr (slash + 1, '/')) {
+                *slash = '-';
+        }
+
+        GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
+        snprintf (dst, PATH_MAX,
+                  "%s/%s/%s:%s",
+                  path,
+                  GLUSTERD_BRICK_INFO_DIR,
+                  host_tag,
+                  path_tag);
+
+        /* Create the NSR xlator, but defer linkage for now. */
+        me = xlator_instantiate ("cluster/nsr", "%s-nsr", volname);
+        if (!me || volgen_xlator_link (me, first_of (graph)))
+                goto out;
+
+        len = snprintf (local_name, sizeof (local_name), "%s:%s",
+                        brickinfo->hostname, brickinfo->path);
+        if (len < 0 || (size_t) len >= sizeof (local_name))
+                goto out;
+        len = snprintf (hosts, sizeof (hosts), "%s", brickinfo->hostname);
+        if (len < 0 || (size_t) len >= sizeof (hosts))
+                goto out;
+
+        peer = list_prev (brickinfo, &volinfo->bricks,
+                          glusterd_brickinfo_t, brick_list);
+        /* Check leader status while we have this pointer in hand. */
+        leader_opt = (!peer || (peer->group != brickinfo->group)) ? "yes"
+                                                                  : "no";
+        if (xlator_set_option (me, "vol-name", volname) ||
+            xlator_set_option (me, "my-name", local_name) ||
+            xlator_set_option (me, "leader", leader_opt) ||
+            xlator_set_option (me, "subvol-uuid",
+                               uuid_utoa (brickinfo->nsr_uuid)))
+                goto out;
+
+        /* Now get on with the show: group members before us ... */
+        while (peer && (peer->group == brickinfo->group)) {
+                gf_log ("glusterd", GF_LOG_INFO,
+                        "%s:%s needs client for %s:%s",
+                        brickinfo->hostname, brickinfo->path,
+                        peer->hostname, peer->path);
+                kid = add_one_peer (graph, peer, volname, index++);
+                if (!kid || volgen_xlator_link (me, kid))
+                        goto out;
+                if (nsr_record_peer (remote_names, sizeof (remote_names),
+                                     hosts, sizeof (hosts), peer))
+                        goto out;
+                replica_group_size++;
+                peer = list_prev (peer, &volinfo->bricks,
+                                  glusterd_brickinfo_t, brick_list);
+        }
+
+        /* ... and group members after us. */
+        peer = list_next (brickinfo, &volinfo->bricks,
+                          glusterd_brickinfo_t, brick_list);
+        while (peer && (peer->group == brickinfo->group)) {
+                gf_log ("glusterd", GF_LOG_INFO,
+                        "%s:%s needs client for %s:%s",
+                        brickinfo->hostname, brickinfo->path,
+                        peer->hostname, peer->path);
+                kid = add_one_peer (graph, peer, volname, index++);
+                if (!kid || volgen_xlator_link (me, kid))
+                        goto out;
+                if (nsr_record_peer (remote_names, sizeof (remote_names),
+                                     hosts, sizeof (hosts), peer))
+                        goto out;
+                replica_group_size++;
+                peer = list_next (peer, &volinfo->bricks,
+                                  glusterd_brickinfo_t, brick_list);
+        }
+
+        /* Remove the final ",". */
+        names_len = strlen (remote_names);
+        if (names_len) {
+                remote_names[names_len - 1] = '\0';
+        }
+        if (xlator_set_option (me, "etcd-servers", hosts))
+                goto out;
+
+        /* Finish linkage to client file. */
+        glusterfs_graph_set_first (&graph->graph, me);
+
+        if (enable_recon == _gf_false) {
+                ret = 0;
+                goto out;
+        }
+
+        /* Now fill in the various files required for reconciliation. */
+        snprintf (filepath, PATH_MAX,
+                  "%s-nsr-recon.vol",
+                  dst);
+        gf_log ("glusterd", GF_LOG_INFO,
+                "writing nsr recon volfile in %s\n",
+                filepath);
+
+        len = snprintf (lp, sizeof (lp), "%s/recon", brickinfo->path);
+        if (len < 0 || (size_t) len >= sizeof (lp))
+                goto out;
+
+        /* Volfile 1: the recon xlator under a protocol/server. */
+        memset (&ng, 0, sizeof (ng));
+        xl = volgen_graph_add_as (&ng, "cluster/nsr_recon", lp);
+        if (!xl)
+                goto out;
+        snprintf (num, sizeof (num), "%u", replica_group_size);
+        if (xlator_set_option (xl, "replica-group-size", num) ||
+            xlator_set_option (xl, "local-member", local_name) ||
+            xlator_set_option (xl, "replica-group-members", remote_names) ||
+            xlator_set_option (xl, "vol-name", volname) ||
+            xlator_set_option (xl, "changelog-dir", changelog_basepath) ||
+            xlator_set_option (xl, "base-dir", brickinfo->path))
+                goto out;
+
+        xl = volgen_graph_add (&ng, "protocol/server", lp);
+        if (!xl)
+                goto out;
+        get_vol_transport_type (volinfo, transt);
+        if (xlator_set_option (xl, "transport-type", transt))
+                goto out;
+        snprintf (num, sizeof (num), "%u", nsr_port);
+        if (xlator_set_option (xl, "transport.socket.listen-port", num))
+                goto out;
+        len = snprintf (auth, sizeof (auth), "auth.addr.%s.allow", lp);
+        if (len < 0 || (size_t) len >= sizeof (auth))
+                goto out;
+        if (xlator_set_option (xl, auth, "*") ||
+            xlator_set_option (xl, "rpc-auth.auth-null", "off") ||
+            xlator_set_option (xl, "rpc-auth.auth-unix", "off") ||
+            xlator_set_option (xl, "rpc-auth.auth-glusterfs", "off"))
+                goto out;
+        if (volgen_write_volfile (&ng, filepath))
+                goto out;
+
+        /* Volfile 2: client for the recon server written above. */
+        memset (&ng, 0, sizeof (ng));
+        kid = volgen_graph_add_nolink (&ng, "protocol/client",
+                                       "%s-client-%u", lp, 0U);
+        if (!kid)
+                goto out;
+        if (xlator_set_option (kid, "remote-host", brickinfo->hostname) ||
+            xlator_set_option (kid, "remote-subvolume", lp) ||
+            xlator_set_option (kid, "transport-type", transt))
+                goto out;
+        /* Same port as the server above; the next brick gets the next one. */
+        snprintf (num, sizeof (num), "%u", nsr_port++);
+        if (xlator_set_option (kid, "remote-port", num))
+                goto out;
+        snprintf (c_d, PATH_MAX,
+                  "%s/%s/con:%s:%s",
+                  path,
+                  GLUSTERD_BRICK_INFO_DIR,
+                  host_tag, path_tag);
+        if (volgen_write_volfile (&ng, c_d))
+                goto out;
+
+        /* Volfile 3: client for the brick's own data path. */
+        memset (&ng, 0, sizeof (ng));
+        kid = volgen_graph_add_nolink (&ng, "protocol/client",
+                                       "%s-client-%u", lp, 0U);
+        if (!kid)
+                goto out;
+        if (xlator_set_option (kid, "remote-host", brickinfo->hostname) ||
+            xlator_set_option (kid, "remote-subvolume", brickinfo->path) ||
+            xlator_set_option (kid, "transport-type", transt))
+                goto out;
+        username = glusterd_auth_get_username (volinfo);
+        password = glusterd_auth_get_password (volinfo);
+        if (xlator_set_option (kid, "username", username) ||
+            xlator_set_option (kid, "password", password))
+                goto out;
+        snprintf (c_d, PATH_MAX,
+                  "%s/%s/data:%s:%s",
+                  path,
+                  GLUSTERD_BRICK_INFO_DIR, host_tag,
+                  path_tag);
+        if (volgen_write_volfile (&ng, c_d))
+                goto out;
+
+        ret = 0;
+
+out:
+        /* The two name copies are only needed while this function runs. */
+        if (host_tag)
+                GF_FREE (host_tag);
+        if (path_tag)
+                GF_FREE (path_tag);
+        return ret;
+}
+
static int
server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
dict_t *set_dict, void *param)
@@ -1561,10 +1865,17 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
if (ret)
return -1;
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ ret = xlator_set_option (xl, "encoding", "ascii");
+ if (ret)
+ return -1;
+ }
+
ret = check_and_add_debug_xl (graph, set_dict, volname, "changelog");
if (ret)
return -1;
+
xl = volgen_graph_add (graph, "features/access-control", volname);
if (!xl)
return -1;
@@ -1643,9 +1954,19 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
return -1;
}
- xl = volgen_graph_add (graph, "features/index", volname);
- if (!xl)
- return -1;
+ /* TBD: conditionalize on NSR being enabled */
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ ret = add_nsr_stuff (graph, volname, brickinfo, volinfo,
+ changelog_basepath);
+ if (ret) {
+ return -1;
+ }
+ }
+ else {
+ xl = volgen_graph_add (graph, "features/index", volname);
+ if (!xl)
+ return -1;
+ }
snprintf (index_basepath, sizeof (index_basepath), "%s/%s",
path, ".glusterfs/indices");
@@ -2470,8 +2791,8 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
glusterd_volinfo_t *volinfo,
gf_boolean_t is_quotad)
{
- char *replicate_args[] = {"cluster/replicate",
- "%s-replicate-%d"};
+ char *replicate_type = "cluster/replicate";
+ char *replicate_fmt = "%s-replicate-%d";
char *stripe_args[] = {"cluster/stripe",
"%s-stripe-%d"};
int rclusters = 0;
@@ -2485,12 +2806,16 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
if (volinfo->dist_leaf_count == 1)
goto build_distribute;
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ replicate_type = "cluster/nsrc";
+ }
+
/* All other cases, it will have one or the other cluster type */
switch (volinfo->type) {
case GF_CLUSTER_TYPE_REPLICATE:
clusters = volgen_graph_build_clusters (graph, volinfo,
- replicate_args[0],
- replicate_args[1],
+ replicate_type,
+ replicate_fmt,
volinfo->brick_count,
volinfo->replica_count);
if (clusters < 0)
@@ -2510,8 +2835,8 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
if (volinfo->replica_count == 0)
goto out;
clusters = volgen_graph_build_clusters (graph, volinfo,
- replicate_args[0],
- replicate_args[1],
+ replicate_type,
+ replicate_fmt,
volinfo->brick_count,
volinfo->replica_count);
if (clusters < 0)
@@ -3539,6 +3864,10 @@ generate_brick_volfiles (glusterd_volinfo_t *volinfo)
}
}
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ assign_groups(volinfo);
+ }
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
gf_log ("", GF_LOG_DEBUG,
"Found a brick - %s:%s", brickinfo->hostname,
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h
index ef92087fc..f4703c288 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.h
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h
@@ -35,6 +35,10 @@
#define AUTH_REJECT_OPT_KEY "auth.addr.*.reject"
#define NFS_DISABLE_OPT_KEY "nfs.*.disable"
+// TBD - bring this from a common conf file
+#define NSR_MAX_REPLICA_GROUP_SIZE 8
+#define NSR_MAX_PATH_SIZE (1024 + PATH_MAX)
+#define NSR_CONF_PATH "/var/lib/glusterd/nsr/"
typedef enum {
GF_CLIENT_TRUSTED,
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index e3af4e7f5..1374e82cd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -898,6 +898,25 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = NO_DOC,
.op_version = 2
},
+ { .key = "cluster.nsr",
+ .voltype = "cluster/nsr",
+ .option = "!nsr",
+ .op_version = 3,
+ .description = "enable NSR instead of AFR for replication",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "cluster.nsr.recon",
+ .voltype = "cluster/nsr",
+ .op_version = 3,
+ .description = "enable NSR reconciliation",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "cluster.nsr.quorum-percent",
+ .voltype = "cluster/nsr",
+ .option = "quorum-percent",
+ .op_version = 3,
+ .description = "percent of rep_count-1 bricks that must be up"
+ },
/* Performance xlators enable/disbable options */
{ .key = "performance.write-behind",
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
index 1a6aa81d3..4d09d7fd9 100644
--- a/xlators/mgmt/glusterd/src/glusterd.c
+++ b/xlators/mgmt/glusterd/src/glusterd.c
@@ -1475,7 +1475,21 @@ init (xlator_t *this)
if (list_empty (&conf->peers)) {
glusterd_launch_synctask (glusterd_spawn_daemons, NULL);
+ gf_log (this->name, GF_LOG_INFO,
+ "no peers, should start FRESH etcd");
+ /*
+ * We might not have any peers now, but if we did once before
+ * then we don't want to start up with a config that still has
+ * references to them.
+ */
+ nuke_etcd_dir();
}
+ else {
+ gf_log (this->name, GF_LOG_INFO,
+ "have peers, should start etcd with old config");
+ }
+ conf->etcd_pid = start_etcd(uuid_utoa(MY_UUID),NULL);
+
ret = glusterd_options_init (this);
if (ret < 0)
goto out;
@@ -1521,6 +1535,8 @@ fini (xlator_t *this)
conf = this->private;
glusterd_stop_uds_listener (this);
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
FREE (conf->pmap);
if (conf->handle)
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index b53d8e412..7157bee64 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -38,6 +38,7 @@
#include "cli1-xdr.h"
#include "syncop.h"
#include "store.h"
+#include "glusterd-etcd.h"
#define GLUSTERD_MAX_VOLUME_NAME 1000
#define GLUSTERD_TR_LOG_SIZE 50
@@ -178,6 +179,7 @@ typedef struct {
char *snap_bricks_directory;
gf_store_handle_t *missed_snaps_list_shandle;
struct list_head missed_snaps_list;
+ pid_t etcd_pid;
} glusterd_conf_t;
@@ -204,6 +206,16 @@ struct glusterd_brickinfo {
char vg[PATH_MAX]; /* FIXME: Use max size for length of vg */
int caps; /* Capability */
int32_t snap_status;
+ /*
+ * The group is used to identify which bricks are part of the same
+ * replica set during brick-volfile generation, so that NSR volfiles
+ * can "cross-connect" the bricks to one another. This same approach
+ * could be used to make client-volfile generation much simpler and
+ * more efficient too, though it would require some further adaptation
+ * to support more than one layer of hierarchy.
+ */
+ uint16_t group;
+ uuid_t nsr_uuid;
};
typedef struct glusterd_brickinfo glusterd_brickinfo_t;
@@ -532,6 +544,15 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
volpath, brickinfo->hostname, exp_path); \
} while (0)
+/*
+ * Build the pidfile path of a brick's "recon" process into pidfile:
+ *   <volume dir>/run/<brick hostname>:-<exp_path>-recon.pid
+ * where <volume dir> comes from GLUSTERD_GET_VOLUME_DIR and exp_path is
+ * brickinfo->path as transformed by GLUSTERD_REMOVE_SLASH_FROM_PATH.
+ * The result is written with snprintf bounded by PATH_MAX, so pidfile must
+ * provide at least PATH_MAX bytes.
+ */
+#define GLUSTERD_GET_BRICK_RECON_PIDFILE(pidfile,volinfo,brickinfo, priv) do { \
+ char exp_path[PATH_MAX] = {0,}; \
+ char volpath[PATH_MAX] = {0,}; \
+ GLUSTERD_GET_VOLUME_DIR (volpath, volinfo, priv); \
+ GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); \
+ snprintf (pidfile, PATH_MAX, "%s/run/%s:-%s-recon.pid", \
+ volpath, brickinfo->hostname, exp_path); \
+ } while (0)
+
#define GLUSTERD_GET_NFS_PIDFILE(pidfile,nfspath) { \
snprintf (pidfile, PATH_MAX, "%s/run/nfs.pid", \
nfspath); \