diff options
-rwxr-xr-x | tests/basic/nsr/nsr.t | 33 | ||||
-rw-r--r-- | tests/volume.rc | 18 | ||||
-rw-r--r-- | xlators/experimental/nsr-client/src/nsrc.c | 110 | ||||
-rw-r--r-- | xlators/experimental/nsr-client/src/nsrc.h | 3 | ||||
-rw-r--r-- | xlators/experimental/nsr-server/src/all-templates.c | 12 | ||||
-rw-r--r-- | xlators/experimental/nsr-server/src/nsr-internal.h | 2 | ||||
-rw-r--r-- | xlators/experimental/nsr-server/src/nsr.c | 90 |
7 files changed, 248 insertions, 20 deletions
diff --git a/tests/basic/nsr/nsr.t b/tests/basic/nsr/nsr.t
new file mode 100755
index 00000000000..b5a4aaf1058
--- /dev/null
+++ b/tests/basic/nsr/nsr.t
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../cluster.rc
+. $(dirname $0)/../../snapshot.rc
+
+cleanup;
+
+TEST verify_lvm_version;
+#Create cluster with 3 nodes
+TEST launch_cluster 3;
+TEST setup_lvm 3
+
+TEST $CLI_1 peer probe $H2;
+TEST $CLI_1 peer probe $H3;
+EXPECT_WITHIN $PROBE_TIMEOUT 2 peer_count;
+
+TEST $CLI_1 volume create $V0 replica 3 $H1:$L1 $H2:$L2 $H3:$L3
+TEST $CLI_1 volume set $V0 cluster.nsr on
+#TEST $CLI_1 volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI_1 volume start $V0
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H1 --entry-timeout=0 $M0;
+
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" nsrc_child_up_status $V0 0
+
+echo "file" > $M0/file1
+TEST stat $L1/file1
+TEST stat $L2/file1
+TEST stat $L3/file1
+
+cleanup;
diff --git a/tests/volume.rc b/tests/volume.rc
index e488aa73b1c..71b40b72d66 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -110,6 +110,24 @@ function snap_client_connected_status {
         echo "$up"
 }
 
+function _nsrc_child_up_status {
+        local vol=$1
+        #brick_id is (brick-num in volume info - 1); up_children is dumped brick_id+1 lines above child_<brick_id>
+        local brick_id=$2
+        local gen_state_dump=$3
+        local fpath=$($gen_state_dump $vol)
+        up=$(grep -a -B$((brick_id + 1)) child_$brick_id=$vol-client-$brick_id $fpath | head -1 | cut -f2 -d'=')
+        rm -f $fpath
+        echo "$up"
+}
+
+function nsrc_child_up_status {
+        local vol=$1
+        #brick_id is (brick-num in volume info - 1)
+        local brick_id=$2
+        _nsrc_child_up_status $vol $brick_id generate_mount_statedump
+}
+
 function _afr_child_up_status {
         local vol=$1
         #brick_id is (brick-num in volume info - 1)
diff --git a/xlators/experimental/nsr-client/src/nsrc.c b/xlators/experimental/nsr-client/src/nsrc.c
index dd3ad20544e..13f1a2d38c5 100644
--- a/xlators/experimental/nsr-client/src/nsrc.c
+++ b/xlators/experimental/nsr-client/src/nsrc.c
@@ -18,6 +18,7 @@
 #include "xlator.h"
 #include "nsr-messages.h"
 #include "nsrc.h"
+#include "statedump.h"
 
 #define SCAR_LIMIT 20
 #define HILITE(x) ("[1;33m"x"[0m")
@@ -168,6 +169,7 @@ int32_t
 nsrc_init (xlator_t *this)
 {
         nsrc_private_t *priv = NULL;
+        xlator_list_t *trav = NULL;
 
         this->local_pool = mem_pool_new (nsrc_local_t, 128);
         if (!this->local_pool) {
@@ -181,6 +183,10 @@ nsrc_init (xlator_t *this)
                 goto err;
         }
 
+        for (trav = this->children; trav; trav = trav->next) {
+                ++(priv->n_children);
+        }
+
         priv->active = FIRST_CHILD(this);
         this->private = priv;
         return 0;
@@ -198,33 +204,111 @@ nsrc_fini (xlator_t *this)
         GF_FREE(this->private);
 }
 
+int
+nsrc_get_child_index (xlator_t *this, xlator_t *kid)
+{
+        xlator_list_t *trav;
+        int retval = -1;
+
+        for (trav = this->children; trav; trav = trav->next) {
+                ++retval;
+                if (trav->xlator == kid) {
+                        return retval;
+                }
+        }
+
+        return -1;
+}
+
+uint8_t
+nsrc_count_up_kids (nsrc_private_t *priv)
+{
+        uint8_t retval = 0;
+        uint8_t i;
+
+        for (i = 0; i < priv->n_children; ++i) {
+                if (priv->kid_state & (1U << i)) { /* unsigned shift */
+                        ++retval;
+                }
+        }
+
+        return retval;
+}
+
 int32_t
 nsrc_notify (xlator_t *this, int32_t event, void *data, ...)
 {
-        int32_t ret = 0;
+        int32_t          ret   = 0;
+        int32_t          index = 0;
+        nsrc_private_t  *priv  = NULL;
+
+        GF_VALIDATE_OR_GOTO (THIS->name, this, out);
+        priv = this->private;
+        GF_VALIDATE_OR_GOTO (this->name, priv, out);
 
         switch (event) {
+        case GF_EVENT_CHILD_UP:
+                index = nsrc_get_child_index(this, data);
+                if (index >= 0) {
+                        priv->kid_state |= (1U << index);
+                        priv->up_children = nsrc_count_up_kids(priv);
+                        gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC,
+                                "got CHILD_UP for %s, now %u kids",
+                                ((xlator_t *)data)->name,
+                                priv->up_children);
+                }
+                ret = default_notify (this, event, data);
+                break;
         case GF_EVENT_CHILD_DOWN:
-                /*
-                 * TBD: handle this properly
-                 *
-                 * What we really should do is propagate this only if it caused
-                 * us to lose quorum, and likewise for GF_EVENT_CHILD_UP only
-                 * if it caused us to gain quorum. However, that requires
-                 * tracking child states and for now it's easier to swallow
-                 * these unconditionally. The consequence of failing to do
-                 * this is that DHT sees the first GF_EVENT_CHILD_DOWN and gets
-                 * confused, so it doesn't call us and doesn't get up-to-date
-                 * directory listings etc.
-                 */
+                index = nsrc_get_child_index(this, data);
+                if (index >= 0) {
+                        priv->kid_state &= ~(1U << index);
+                        priv->up_children = nsrc_count_up_kids(priv);
+                        gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC,
+                                "got CHILD_DOWN for %s, now %u kids",
+                                ((xlator_t *)data)->name,
+                                priv->up_children);
+                } /* CHILD_DOWN is not passed to default_notify here */
                 break;
         default:
                 ret = default_notify (this, event, data);
         }
 
+out:
         return ret;
 }
 
+int
+nsrc_priv_dump (xlator_t *this)
+{
+        nsrc_private_t *priv = NULL;
+        char key_prefix[GF_DUMP_MAX_BUF_LEN];
+        xlator_list_t *trav = NULL;
+        int32_t i = -1;
+
+        GF_VALIDATE_OR_GOTO (THIS->name, this, out);
+        priv = this->private;
+        GF_VALIDATE_OR_GOTO (this->name, priv, out);
+
+        snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s",
+                 this->type, this->name);
+        gf_proc_dump_add_section(key_prefix);
+
+        gf_proc_dump_write("up_children", "%u", priv->up_children);
+
+        for (trav = this->children, i = 0; trav; trav = trav->next, i++) {
+                snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "child_%d", i);
+                gf_proc_dump_write(key_prefix, "%s", trav->xlator->name);
+        }
+
+out:
+        return 0;
+}
+
+struct xlator_dumpops dumpops = {
+        .priv = nsrc_priv_dump,
+};
+
 class_methods_t class_methods = {
         .init = nsrc_init,
         .fini = nsrc_fini,
diff --git a/xlators/experimental/nsr-client/src/nsrc.h b/xlators/experimental/nsr-client/src/nsrc.h
index 0c61d7a9fa8..15f0d7c85a0 100644
--- a/xlators/experimental/nsr-client/src/nsrc.h
+++ b/xlators/experimental/nsr-client/src/nsrc.h
@@ -13,6 +13,9 @@
 
 typedef struct {
         xlator_t *active;
+        uint8_t up_children;  /* number of bits set in kid_state */
+        uint8_t n_children;   /* child count, computed in nsrc_init */
+        uint32_t kid_state;   /* bit i is set while child i is up */
 } nsrc_private_t;
 
 typedef struct {
diff --git a/xlators/experimental/nsr-server/src/all-templates.c b/xlators/experimental/nsr-server/src/all-templates.c
index 300abea959d..c3819d2af54 100644
--- a/xlators/experimental/nsr-server/src/all-templates.c
+++ b/xlators/experimental/nsr-server/src/all-templates.c
@@ -83,6 +83,9 @@ nsr_@NAME@ (call_frame_t *frame, xlator_t *this,
 
                 if (result == _gf_false) {
                         /* Emulate the AFR client-side-quorum behavior. */
+                        gf_msg (this->name, GF_LOG_ERROR, EROFS,
+                                N_MSG_QUORUM_NOT_MET, "Sufficient number of "
+                                "subvolumes are not up to meet quorum.");
                         op_errno = EROFS;
                         goto err;
                 }
@@ -309,6 +312,10 @@ nsr_@NAME@_continue (call_frame_t *frame, xlator_t *this,
         result = fop_quorum_check (this, (double)priv->n_children,
                                    (double)local->successful_acks + 1);
         if (result == _gf_false) {
+                gf_msg (this->name, GF_LOG_ERROR, EROFS,
+                        N_MSG_QUORUM_NOT_MET, "Didn't receive enough acks "
+                        "to meet quorum. Failing the operation without trying "
+                        "it on the leader.");
                 STACK_UNWIND_STRICT (@NAME@, frame, -1, EROFS,
                                      @ERROR_ARGS@);
         } else {
@@ -406,8 +413,9 @@ nsr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this,
                 if (result == _gf_false) {
                         op_ret = -1;
                         op_errno = EROFS;
-                        gf_msg_debug (this->name, 0,
-                                      "Quorum is not met. The operation has failed.");
+                        gf_msg (this->name, GF_LOG_ERROR, EROFS,
+                                N_MSG_QUORUM_NOT_MET, "Quorum is not met. "
+                                "The operation has failed.");
                 } else {
 #if defined(NSR_CG_NEED_FD)
                         op_ret = local->successful_op_ret;
diff --git a/xlators/experimental/nsr-server/src/nsr-internal.h b/xlators/experimental/nsr-server/src/nsr-internal.h
index b8c7fc314b7..d43fbac9a53 100644
--- a/xlators/experimental/nsr-server/src/nsr-internal.h
+++ b/xlators/experimental/nsr-server/src/nsr-internal.h
@@ -74,6 +74,8 @@ typedef struct {
          * TBD: re-evaluate how to manage this
          */
         char term_buf[CHANGELOG_ENTRY_SIZE];
+        gf_boolean_t child_up; /* To maintain the state of *
+                                * the translator */
 } nsr_private_t;
 
 typedef struct {
diff --git a/xlators/experimental/nsr-server/src/nsr.c b/xlators/experimental/nsr-server/src/nsr.c
index 48966ab15a1..0fb618f236e 100644
--- a/xlators/experimental/nsr-server/src/nsr.c
+++ b/xlators/experimental/nsr-server/src/nsr.c
@@ -860,13 +860,23 @@ nsr_get_child_index (xlator_t *this, xlator_t *kid)
 int
 nsr_notify (xlator_t *this, int event, void *data, ...)
 {
-        nsr_private_t *priv = this->private;
-        int index;
+        nsr_private_t   *priv     = this->private;
+        int              index    = -1;
+        int              ret      = -1;
+        gf_boolean_t     result   = _gf_false;
+        gf_boolean_t     relevant = _gf_false;
 
         switch (event) {
         case GF_EVENT_CHILD_UP:
                 index = nsr_get_child_index(this, data);
                 if (index >= 0) {
+                        /* Check if the child was previously down
+                         * and it's not a false CHILD_UP
+                         */
+                        if (!(priv->kid_state & (1 << index))) {
+                                relevant = _gf_true;
+                        }
+
                         priv->kid_state |= (1 << index);
                         priv->up_children = nsr_count_up_kids(priv);
                         gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC,
@@ -876,27 +886,96 @@ nsr_notify (xlator_t *this, int event, void *data, ...)
                         if (!priv->config_leader && (priv->up_children > 1)) {
                                 priv->leader = _gf_false;
                         }
+
+                        /* If it's not relevant, or we have already *
+                         * sent CHILD_UP just break */
+                        if (!relevant || priv->child_up)
+                                break;
+
+                        /* If it's not a leader, just send the notify up */
+                        if (!priv->leader) {
+                                ret = default_notify(this, event, data);
+                                if (!ret)
+                                        priv->child_up = _gf_true;
+                                break;
+                        }
+
+                        result = fop_quorum_check (this,
+                                                (double)(priv->n_children - 1),
+                                                (double)(priv->up_children - 1));
+                        if (result == _gf_false) {
+                                gf_msg (this->name, GF_LOG_INFO, 0,
+                                        N_MSG_GENERIC, "Not enough children "
+                                        "are up to meet quorum. Waiting to "
+                                        "send CHILD_UP from leader");
+                        } else {
+                                gf_msg (this->name, GF_LOG_INFO, 0,
+                                        N_MSG_GENERIC, "Enough children are up "
+                                        "to meet quorum. Sending CHILD_UP "
+                                        "from leader");
+                                ret = default_notify(this, event, data);
+                                if (!ret)
+                                        priv->child_up = _gf_true;
+                        }
                 }
                 break;
         case GF_EVENT_CHILD_DOWN:
                 index = nsr_get_child_index(this, data);
                 if (index >= 0) {
+                        /* Check if the child was previously up
+                         * and it's not a false CHILD_DOWN
+                         */
+                        if (priv->kid_state & (1 << index)) {
+                                relevant = _gf_true;
+                        }
                         priv->kid_state &= ~(1 << index);
                         priv->up_children = nsr_count_up_kids(priv);
                         gf_msg (this->name, GF_LOG_INFO, 0, N_MSG_GENERIC,
                                 "got CHILD_DOWN for %s, now %u kids",
                                 ((xlator_t *)data)->name,
                                 priv->up_children);
-                        if (!priv->config_leader && (priv->up_children < 2)) {
+                        if (!priv->config_leader && (priv->up_children < 2)
+                            && relevant) {
                                 priv->leader = _gf_true;
                         }
+
+                        /* If it's not relevant, or we have already *
+                         * sent CHILD_DOWN just break */
+                        if (!relevant || !priv->child_up)
+                                break;
+
+                        /* If it's not a leader, just break coz we shouldn't *
+                         * propagate the failure from the failure till it *
+                         * itself goes down *
+                         */
+                        if (!priv->leader) {
+                                break;
+                        }
+
+                        result = fop_quorum_check (this,
+                                                (double)(priv->n_children - 1),
+                                                (double)(priv->up_children - 1));
+                        if (result == _gf_false) {
+                                gf_msg (this->name, GF_LOG_INFO, 0,
+                                        N_MSG_GENERIC, "Enough children are "
+                                        "down to fail quorum. "
+                                        "Sending CHILD_DOWN from leader");
+                                ret = default_notify(this, event, data);
+                                if (!ret)
+                                        priv->child_up = _gf_false;
+                        } else {
+                                gf_msg (this->name, GF_LOG_INFO, 0,
+                                        N_MSG_GENERIC, "Not enough children "
+                                        "are down to fail quorum. Waiting to "
+                                        "send CHILD_DOWN from leader");
+                        }
                 }
                 break;
         default:
-                ;
+                ret = default_notify(this, event, data);
         }
 
-        return default_notify(this, event, data);
+        return ret;
 }
 
 
@@ -995,6 +1074,7 @@ nsr_init (xlator_t *this)
         GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err);
 
         priv->leader = priv->config_leader;
+        priv->child_up = _gf_false;
 
         if (pthread_create(&kid, NULL, nsr_flush_thread, this) != 0) {