diff options
author | Kaushal M <kaushal@redhat.com> | 2015-03-26 15:18:54 +0530 |
---|---|---|
committer | Krishnan Parthasarathi <kparthas@redhat.com> | 2015-04-13 06:30:02 +0000 |
commit | 1efa50861b2cee68de9c9b751d9fc5eed08f5e5b (patch) | |
tree | 218ca5aba1eb404ff0d7ac25f20716915768f4ec /xlators/mgmt/glusterd/src/glusterd-op-sm.c | |
parent | 7c7bbc027feb4c5b233e3078951e5bb1d9fc4618 (diff) |
glusterd: Replace transaction peers lists
Transaction peer lists were used in GlusterD to peers belonging to a
transaction. This was needed to prevent newly added peers performing
partial transactions, which could be incorrect.
This was accomplished by creating a seperate transaction peers list at
the beginning of every transaction. A transaction peers list referenced
the peerinfo data structures of the peers which were present at the
beginning of the transaction. RCU protection of peerinfos referenced by
the transaction peers list is a hard problem and difficult to do
correctly.
To have proper RCU protection of peerinfos, the transaction peers lists
have been replaced by an alternative method to identify peers that
belong to a transaction. The alternative method is to the global peers
list along with generation numbers to identify peers that should belong
to a transaction.
This change introduces a global peer list generation number, and a
generation number for each peerinfo object. Whenever a peerinfo object
is created, the global generation number is bumped, and the peerinfos
generation number is set to the bumped global generation.
With the above changes, the algorithm to identify peers belonging to a
transaction with RCU protection is as follows,
- At the beginning of a transaction, the current global generation
number is saved
- To identify if a peers belonging to the transaction,
- Start a RCU read critical section
- For each peer in the global peers list,
- If the peers generation number is not greater than the saved
generation number, continue with the action on the peer
- End the RCU read critical section
The above algorithm guarantees that,
- The peer list is not modified when a transaction is iterating through
it
- The transaction actions are only done on peers that were present when
the transaction started
But, as a transaction could iterate over the peers list multiple times,
the algorithm cannot guarantee that same set of peers will be selected
every time. A peer could get deleted between two iterations of the list
within a transaction. This problem existed with transaction peers list
as well, but unlike before now it will not lead to invalid memory access
and potential crashes. This problem will be addressed seprately.
This change was developed on the git branch at [1]. This commit is a
combination of the following commits on the development branch.
52ded5b Add timespec_cmp
44aedd8 Add create timestamp to peerinfo
7bcbea5 Fix some silly mistakes
13e3241 Add start time to opinfo
17a6727 Use timestamp comparisions to identify xaction peers instead
of a xaction peer list
3be05b6 Correct check for peerinfo age
70d5b58 Use read-critical sections for peer list iteration
ba4dbca Use peerinfo timestamp checks in op-sm instead of xaction peer
list
d63f811 Add more peer status checks when iterating peers list in
glusterd-syncop
1998a2a Timestamp based peer list traversal of mgmtv3 xactions
f3c1a42 Remove transaction peer lists
b8b08ee Remove unused labels
32e5f5b Remove 'npeers' usage
a075fb7 Remove 'npeers' from mgmt-v3 framework
12c9df2 Use generation number instead of timestamps.
9723021 Remove timespec_cmp
80ae2c6 Remove timespec.h include
a9479b0 Address review comments on 10147/4
[1]: https://github.com/kshlm/glusterfs/tree/urcu
Change-Id: I9be1033525c0a89276f5b5d83dc2eb061918b97f
BUG: 1205186
Signed-off-by: Kaushal M <kaushal@redhat.com>
Reviewed-on: http://review.gluster.org/10147
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
Reviewed-by: Anand Nekkunti <anekkunt@redhat.com>
Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com>
Tested-by: Krishnan Parthasarathi <kparthas@redhat.com>
Diffstat (limited to 'xlators/mgmt/glusterd/src/glusterd-op-sm.c')
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-op-sm.c | 67 |
1 files changed, 47 insertions, 20 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index d7694258301..5bfdb0bb43e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -119,13 +119,16 @@ glusterd_txn_opinfo_dict_fini () void glusterd_txn_opinfo_init (glusterd_op_info_t *opinfo, - glusterd_op_sm_state_info_t *state, - glusterd_op_t *op, - dict_t *op_ctx, - rpcsvc_request_t *req) + glusterd_op_sm_state_info_t *state, glusterd_op_t *op, + dict_t *op_ctx, rpcsvc_request_t *req) { + glusterd_conf_t *conf = NULL; + GF_ASSERT (opinfo); + conf = THIS->private; + GF_ASSERT (conf); + if (state) opinfo->state = *state; @@ -140,6 +143,9 @@ glusterd_txn_opinfo_init (glusterd_op_info_t *opinfo, if (req) opinfo->req = req; + opinfo->txn_generation = conf->generation; + cmm_smp_rmb (); + return; } @@ -314,9 +320,6 @@ glusterd_clear_txn_opinfo (uuid_t *txn_id) dict_del(priv->glusterd_txn_opinfo, uuid_utoa (*txn_id)); - if (txn_op_info.local_xaction_peers) - GF_FREE (txn_op_info.local_xaction_peers); - gf_log ("", GF_LOG_DEBUG, "Successfully cleared opinfo for transaction ID : %s", uuid_utoa (*txn_id)); @@ -2919,9 +2922,13 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx) priv = this->private; GF_ASSERT (priv); - list_for_each_local_xaction_peers (peerinfo, - opinfo.local_xaction_peers) { - GF_ASSERT (peerinfo); + rcu_read_lock (); + cds_list_for_each_entry_rcu (peerinfo, &priv->peers, uuid_list) { + /* Only send requests to peers who were available before the + * transaction started + */ + if (peerinfo->generation > opinfo.txn_generation) + continue; if (!peerinfo->connected || !peerinfo->mgmt) continue; @@ -2936,6 +2943,7 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx) if (proc->fn) { ret = proc->fn (NULL, this, peerinfo); if (ret) { + rcu_read_unlock (); gf_log (this->name, GF_LOG_WARNING, "Failed to send lock request " "for operation 'Volume %s' to " @@ -2958,6 +2966,7 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx) ret = dict_set_static_ptr (dict, "peerinfo", peerinfo); if (ret) { + rcu_read_unlock (); gf_log (this->name, GF_LOG_ERROR, "failed to set peerinfo"); dict_unref (dict); @@ -2966,6 +2975,7 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx) ret = proc->fn (NULL, this, dict); if (ret) { + rcu_read_unlock (); gf_log (this->name, GF_LOG_WARNING, "Failed to send mgmt_v3 lock " "request for operation " @@ -2981,6 +2991,7 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx) } } } + rcu_read_unlock (); opinfo.pending_count = pending_count; if (!opinfo.pending_count) @@ -3009,9 +3020,13 @@ glusterd_op_ac_send_unlock (glusterd_op_sm_event_t *event, void *ctx) priv = this->private; GF_ASSERT (priv); - list_for_each_local_xaction_peers (peerinfo, - opinfo.local_xaction_peers) { - GF_ASSERT (peerinfo); + rcu_read_lock (); + cds_list_for_each_entry_rcu (peerinfo, &priv->peers, uuid_list) { + /* Only send requests to peers who were available before the + * transaction started + */ + if (peerinfo->generation > opinfo.txn_generation) + continue; if (!peerinfo->connected || !peerinfo->mgmt || !peerinfo->locked) @@ -3083,6 +3098,7 @@ glusterd_op_ac_send_unlock (glusterd_op_sm_event_t *event, void *ctx) } } } + rcu_read_unlock (); opinfo.pending_count = pending_count; if (!opinfo.pending_count) @@ -3562,9 +3578,13 @@ glusterd_op_ac_send_stage_op (glusterd_op_sm_event_t *event, void *ctx) if (op == GD_OP_REPLACE_BRICK) glusterd_rb_use_rsp_dict (NULL, rsp_dict); - list_for_each_local_xaction_peers (peerinfo, - opinfo.local_xaction_peers) { - GF_ASSERT (peerinfo); + rcu_read_lock (); + cds_list_for_each_entry_rcu (peerinfo, &priv->peers, uuid_list) { + /* Only send requests to peers who were available before the + * transaction started + */ + if (peerinfo->generation > opinfo.txn_generation) + continue; if (!peerinfo->connected || !peerinfo->mgmt) continue; @@ -3577,6 +3597,7 @@ glusterd_op_ac_send_stage_op (glusterd_op_sm_event_t *event, void *ctx) if (proc->fn) { ret = dict_set_static_ptr (dict, "peerinfo", peerinfo); if (ret) { + rcu_read_unlock (); gf_log (this->name, GF_LOG_ERROR, "failed to " "set peerinfo"); goto out; @@ -3593,6 +3614,7 @@ glusterd_op_ac_send_stage_op (glusterd_op_sm_event_t *event, void *ctx) pending_count++; } } + rcu_read_unlock (); opinfo.pending_count = pending_count; out: @@ -4212,9 +4234,13 @@ glusterd_op_ac_send_commit_op (glusterd_op_sm_event_t *event, void *ctx) goto out; } - list_for_each_local_xaction_peers (peerinfo, - opinfo.local_xaction_peers) { - GF_ASSERT (peerinfo); + rcu_read_lock (); + cds_list_for_each_entry_rcu (peerinfo, &priv->peers, uuid_list) { + /* Only send requests to peers who were available before the + * transaction started + */ + if (peerinfo->generation > opinfo.txn_generation) + continue; if (!peerinfo->connected || !peerinfo->mgmt) continue; @@ -4227,6 +4253,7 @@ glusterd_op_ac_send_commit_op (glusterd_op_sm_event_t *event, void *ctx) if (proc->fn) { ret = dict_set_static_ptr (dict, "peerinfo", peerinfo); if (ret) { + rcu_read_unlock (); gf_log (this->name, GF_LOG_ERROR, "failed to set peerinfo"); goto out; @@ -4242,6 +4269,7 @@ glusterd_op_ac_send_commit_op (glusterd_op_sm_event_t *event, void *ctx) pending_count++; } } + rcu_read_unlock (); opinfo.pending_count = pending_count; gf_log (this->name, GF_LOG_DEBUG, "Sent commit op req for 'Volume %s' " @@ -4528,7 +4556,6 @@ glusterd_op_txn_complete (uuid_t *txn_id) glusterd_op_clear_op (); glusterd_op_reset_ctx (); glusterd_op_clear_errstr (); - gd_cleanup_local_xaction_peers_list (opinfo.local_xaction_peers); /* Based on the op-version, we release the cluster or mgmt_v3 lock */ if (priv->op_version < GD_OP_VERSION_3_6_0) { |