author    | karthik-us <ksubrahm@redhat.com> | 2018-05-30 15:27:52 +0530
committer | Ravishankar N <ravishankar@redhat.com> | 2018-09-20 09:18:20 +0000
commit    | 5784a00f997212d34bd52b2303e20c097240d91c
tree      | 2dc25588df397d0e029066a7510977dfa48c3481
parent    | 4f6ae853ffa9d06446407f389aaef61ac0b3b424
cluster/afr: Use 2 domain locking in SHD for thin-arbiter
With this change, when SHD starts the index crawl it requests all
the clients to release the AFR_TA_DOM_NOTIFY lock, so that clients
know their in-memory state is no longer valid and any new operation
needs to query the thin-arbiter if required.
When SHD completes healing all the files without any failure, it
takes the AFR_TA_DOM_NOTIFY lock again and reads the xattrs on the
TA to check whether any new failures happened in the meantime. If
new failures are marked on the TA, SHD immediately starts another
crawl to heal them as well. If there are no new failures, SHD takes
the AFR_TA_DOM_MODIFY lock and unsets the xattrs on the TA, so that
both data bricks are considered good thereafter.
Change-Id: I037b89a0823648f314580ba0716d877bd5ddb1f1
fixes: bz#1579788
Signed-off-by: karthik-us <ksubrahm@redhat.com>
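In outline, the SHD-side sequence works as below. This is a minimal,
self-contained C sketch of the protocol only: ta_lock(), ta_unlock(),
ta_read_xattrs(), ta_unset_xattrs() and crawl_and_heal() are hypothetical
stubs standing in for the patch's real syncop-based helpers
(afr_ta_post_op_lock()/afr_ta_post_op_unlock(), _afr_shd_ta_get_xattrs()
and afr_shd_ta_check_and_unset_xattrs()).

/* Illustrative sketch only -- NOT code from this patch. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define DOM_NOTIFY "AFR_TA_DOM_NOTIFY" /* contention tells clients to re-query TA */
#define DOM_MODIFY "AFR_TA_DOM_MODIFY" /* serializes xattr updates on the TA file */

/* Hypothetical stubs; the real code issues syncop_inodelk()/syncop_xattrop()
 * on the thin-arbiter brick. */
static int ta_lock(const char *dom)    { printf("lock   %s\n", dom); return 0; }
static void ta_unlock(const char *dom) { printf("unlock %s\n", dom); }
static int ta_read_xattrs(char *out)   { strcpy(out, "pending-xattrs"); return 0; }
static int ta_unset_xattrs(void)       { printf("unset xattrs on TA\n"); return 0; }
static bool crawl_and_heal(void)       { return true; /* pretend nothing failed */ }

static int shd_index_crawl(void)
{
    char pre[64], post[64];
    int rerun = 0;

    /* Taking DOM_NOTIFY forces clients to drop their cached bad-brick view. */
    if (ta_lock(DOM_NOTIFY))
        return -1;
    ta_read_xattrs(pre);               /* failure state before the crawl */
    ta_unlock(DOM_NOTIFY);

    if (!crawl_and_heal())
        return -1;                     /* heal failures: rerun the crawl later */

    /* Re-take DOM_NOTIFY and re-read: did new failures race with the crawl? */
    if (ta_lock(DOM_NOTIFY))
        return -1;
    ta_read_xattrs(post);
    if (strcmp(pre, post) == 0) {
        if (ta_lock(DOM_MODIFY) == 0) {
            ta_unset_xattrs();         /* both data bricks good again */
            ta_unlock(DOM_MODIFY);
        }
    } else {
        rerun = 1;                     /* new failures raced in: crawl again */
    }
    ta_unlock(DOM_NOTIFY);
    return rerun ? -1 : 0;
}

int main(void) { return shd_index_crawl(); }

Re-reading the xattrs under the same AFR_TA_DOM_NOTIFY lock that clients
contend on is what makes the compare-before-unset safe: a client marking a
new failure either shows up in the post-crawl read or is blocked until the
unset decision has been made.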
-rw-r--r-- | tests/basic/afr/ta-shd.t                 |  49
-rw-r--r-- | tests/thin-arbiter.rc                    | 181
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c     |   7
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heald.c | 245
-rw-r--r-- | xlators/cluster/afr/src/afr.c            |   1
5 files changed, 392 insertions(+), 91 deletions(-)
diff --git a/tests/basic/afr/ta-shd.t b/tests/basic/afr/ta-shd.t
new file mode 100644
index 00000000000..bb2e58b3f77
--- /dev/null
+++ b/tests/basic/afr/ta-shd.t
@@ -0,0 +1,49 @@
+#!/bin/bash
+#Self-heal tests
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../thin-arbiter.rc
+cleanup;
+TEST ta_create_brick_and_volfile brick0
+TEST ta_create_brick_and_volfile brick1
+TEST ta_create_ta_and_volfile ta
+TEST ta_start_brick_process brick0
+TEST ta_start_brick_process brick1
+TEST ta_start_ta_process ta
+
+TEST ta_create_mount_volfile brick0 brick1 ta
+TEST ta_start_mount_process $M0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta
+
+TEST ta_create_shd_volfile brick0 brick1 ta
+TEST ta_start_shd_process glustershd
+
+TEST touch $M0/a.txt
+TEST ta_kill_brick brick0
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0
+echo "Hello" >> $M0/a.txt
+EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/a.txt
+EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.$V0-ta-2
+
+#TODO: After the write txn changes are merged, take statedump of TA process and
+#check whether AFR_TA_DOM_NOTIFY lock is held by the client here. Take the
+#statedump again after line #38 to check AFR_TA_DOM_NOTIFY lock is released by
+#the SHD process.
+
+TEST ta_start_brick_process brick0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
+EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/a.txt
+EXPECT_WITHIN $HEAL_TIMEOUT "000000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.$V0-ta-2
+
+#Kill the previously up brick and try reading from other brick. Since the heal
+#has happened file content should be same.
+TEST ta_kill_brick brick1
+EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1
+#Umount and mount to remove cached data.
+TEST umount $M0
+TEST ta_start_mount_process $M0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
+EXPECT "Hello" cat $M0/a.txt
+cleanup;
diff --git a/tests/thin-arbiter.rc b/tests/thin-arbiter.rc
index 36d11cea61d..c5ac00baaaf 100644
--- a/tests/thin-arbiter.rc
+++ b/tests/thin-arbiter.rc
@@ -431,3 +431,184 @@ function ta_up_status()
         local replica_id=$3
         grep -E "^up = " $m/.meta/graphs/active/${v}-replicate-${replica_id}/private | cut -f2 -d'='
 }
+
+function ta_create_shd_volfile()
+{
+        local b0=$B0/$1
+        local b1=$B0/$2
+        local ta=$B0/$3
+        local b0_port=${PORTMAP[$1]}
+        local b1_port=${PORTMAP[$2]}
+        local ta_port=${PORTMAP[$3]}
+cat > $B0/glustershd.vol <<EOF
+volume ${V0}-replicate-0-client-0
+    type protocol/client
+    option send-gids on
+    option transport.socket.lowlat off
+    option transport.socket.keepalive-interval 2
+    option remote-host $H0
+    option remote-subvolume $b0
+    option ping-timeout 42
+    option client-bind-insecure off
+    option transport.socket.own-thread off
+    option frame-timeout 1800
+    option non-blocking-io off
+    option transport.socket.keepalive 1
+    option transport.socket.keepalive-count 9
+    option transport.tcp-user-timeout 0
+    option transport.socket.nodelay 1
+    option transport.socket.keepalive-time 20
+    option transport.socket.read-fail-log off
+    option transport-type tcp
+    option filter-O_DIRECT disable
+    option event-threads 2
+    option transport.listen-backlog 1024
+    option transport.socket.ssl-enabled off
+    option password a0ad63dd-8314-4f97-9160-1b93e3cb1f0b
+    option username 459d48e8-2a92-4f11-89f2-077b29f6f86d
+    option remote-port $b0_port
+end-volume
+
+volume ${V0}-replicate-0-client-1
+    type protocol/client
+    option remote-host $H0
+    option transport.socket.keepalive-time 20
+    option transport.socket.keepalive-count 9
+    option transport.socket.own-thread off
+    option transport.socket.ssl-enabled off
+    option transport-type tcp
+    option remote-subvolume $b1
+    option event-threads 2
+    option transport.tcp-user-timeout 0
+    option transport.socket.keepalive 1
+    option transport.socket.nodelay 1
+    option transport.socket.read-fail-log off
+    option frame-timeout 1800
+    option ping-timeout 42
+    option client-bind-insecure off
+    option filter-O_DIRECT disable
+    option send-gids on
+    option non-blocking-io off
+    option transport.listen-backlog 1024
+    option transport.socket.lowlat off
+    option transport.socket.keepalive-interval 2
+    option password a0ad63dd-8314-4f97-9160-1b93e3cb1f0b
+    option username 459d48e8-2a92-4f11-89f2-077b29f6f86d
+    option remote-port $b1_port
+end-volume
+
+volume ${V0}-replicate-0-thin-arbiter-client
+    type protocol/client
+    option frame-timeout 1800
+    option event-threads 2
+    option transport.listen-backlog 1024
+    option transport.socket.nodelay 1
+    option transport.socket.keepalive-count 9
+    option transport.socket.ssl-enabled off
+    option transport-type tcp
+    option remote-subvolume $ta
+    option filter-O_DIRECT disable
+    option non-blocking-io off
+    option transport.socket.lowlat off
+    option transport.socket.keepalive-interval 2
+    option transport.socket.read-fail-log off
+    option remote-host $H0
+    option send-gids on
+    option transport.tcp-user-timeout 0
+    option transport.socket.keepalive-time 20
+    option ping-timeout 42
+    option client-bind-insecure off
+    option transport.socket.keepalive 1
+    option transport.socket.own-thread off
+    option remote-port $ta_port
+end-volume
+
+volume ${V0}-replicate-0
+    type cluster/replicate
+    option background-self-heal-count 8
+    option metadata-self-heal on
+    option data-change-log on
+    option entrylk-trace off
+    option iam-self-heal-daemon yes
+    option afr-dirty-xattr trusted.afr.dirty
+    option heal-timeout 10
+    option read-hash-mode 1
+    option metadata-splitbrain-forced-heal off
+    option thin-arbiter $H0:$ta
+    option shd-max-threads 1
+    option afr-pending-xattr ${V0}-client-0,${V0}-client-1,${V0}-ta-2
+    option halo-max-latency 5
+    option halo-max-replicas 99999
+    option entry-change-log on
+    option halo-nfsd-max-latency 5
+    option inodelk-trace off
+    option pre-op-compat on
+    option eager-lock on
+    option self-heal-readdir-size 1KB
+    option ensure-durability on
+    option locking-scheme full
+    option halo-enabled False
+    option heal-wait-queue-length 128
+    option entry-self-heal on
+    option self-heal-daemon on
+    option quorum-reads no
+    option shd-wait-qlength 1024
+    option choose-local true
+    option halo-min-replicas 2
+    option data-self-heal on
+    option metadata-change-log on
+    option consistent-metadata no
+    option full-lock yes
+    option use-compound-fops no
+    option halo-shd-max-latency 99999
+    option quorum-type none
+    option favorite-child-policy none
+    option read-subvolume-index -1
+    option optimistic-change-log on
+    option iam-nfs-daemon off
+    option post-op-delay-secs 1
+    option granular-entry-heal no
+    option consistent-io no
+    option data-self-heal-window-size 1
+    subvolumes ${V0}-replicate-0-client-0 ${V0}-replicate-0-client-1 ${V0}-replicate-0-thin-arbiter-client
+end-volume
+
+volume glustershd
+    type debug/io-stats
+    option log-buf-size 5
+    option ios-dump-format json
+    option latency-measurement off
+    option sys-log-level CRITICAL
+    option brick-log-level INFO
+    option client-logger gluster-log
+    option client-log-format with-msg-id
+    option brick-log-format with-msg-id
+    option client-log-buf-size 5
+    option log-flush-timeout 120
+    option ios-dump-interval 0
+    option ios-sample-interval 0
+    option ios-dnscache-ttl-sec 86400
+    option count-fop-hits off
+    option client-log-level INFO
+    option brick-logger gluster-log
+    option brick-log-buf-size 5
+    option ios-sample-buf-size 65535
+    option client-log-flush-timeout 120
+    option brick-log-flush-timeout 120
+    option unique-id /no/such/path
+    option dump-fd-stats off
+    subvolumes ${V0}-replicate-0
+end-volume
+EOF
+}
+
+function ta_start_shd_process()
+{
+        if glusterfs -p $B0/${1}.pid --volfile=$B0/${1}.vol -l $(gluster --print-logdir)/${1}.log --process-name=glustershd
+        then
+                cat $B0/${1}.pid
+        else
+                echo ""
+                return 1
+        fi
+}
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index eb0e7330a91..73f1d728809 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -6717,7 +6717,8 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
     };
     int32_t cmd = 0;
 
-    GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+    if (!priv->shd.iamshd)
+        GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
 
     flock1.l_type = F_WRLCK;
     while (!locked) {
@@ -6725,7 +6726,6 @@
             cmd = F_SETLKW;
             flock1.l_start = 0;
             flock1.l_len = 0;
-
         } else {
             cmd = F_SETLK;
             if (priv->ta_notify_dom_lock_offset) {
@@ -6780,7 +6780,8 @@ afr_ta_post_op_unlock(xlator_t *this, loc_t *loc)
     };
     int ret = 0;
 
-    GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+    if (!priv->shd.iamshd)
+        GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
     flock.l_type = F_UNLCK;
     flock.l_start = 0;
     flock.l_len = 0;
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 0cf01a041b4..53d7ef8bb8e 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -546,14 +546,128 @@ afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
                        GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal);
 }
 
-void
-afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+int
+afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc)
 {
     afr_private_t *priv = NULL;
-    dict_t *xattr = NULL;
-    struct gf_flock flock = {
+    struct iatt stbuf = {
         0,
     };
+    int ret = -1;
+
+    priv = this->private;
+    loc->parent = inode_ref(this->itable->root);
+    gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+    loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+    loc->inode = inode_new(loc->parent->table);
+    GF_CHECK_ALLOC(loc->inode, ret, out);
+
+    if (!gf_uuid_is_null(priv->ta_gfid))
+        goto assign_gfid;
+
+    ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
+                        0, 0, 0);
+    if (ret) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed lookup on file %s.", loc->name);
+        goto out;
+    }
+
+    gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+
+assign_gfid:
+    gf_uuid_copy(loc->gfid, priv->ta_gfid);
+    ret = 0;
+
+out:
+    if (ret)
+        loc_wipe(loc);
+
+    return ret;
+}
+
+int
+_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+{
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
+    int *raw = NULL;
+    int ret = -1;
+    int i = 0;
+
+    priv = this->private;
+
+    xattr = dict_new();
+    if (!xattr) {
+        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED,
+               "Failed to create dict.");
+        goto out;
+    }
+
+    for (i = 0; i < priv->child_count; i++) {
+        raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
+        if (!raw)
+            goto out;
+
+        ret = dict_set_bin(xattr, priv->pending_key[i], raw,
+                           AFR_NUM_CHANGE_LOGS * sizeof(int));
+        if (ret) {
+            GF_FREE(raw);
+            goto out;
+        }
+    }
+
+    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+                         GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
+    if (ret || !(*xdata)) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Xattrop failed on %s.", loc->name);
+    }
+
+out:
+    if (xattr)
+        dict_unref(xattr);
+
+    return ret;
+}
+
+void
+afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
+                      dict_t **xdata)
+{
+    int ret = 0;
+
+    loc_wipe(loc);
+    if (afr_shd_fill_ta_loc(this, loc)) {
+        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+               "Failed to populate thin-arbiter loc for: %s.", loc->name);
+        goto out;
+    }
+
+    ret = afr_ta_post_op_lock(this, loc);
+    if (ret)
+        goto out;
+
+    ret = _afr_shd_ta_get_xattrs(this, loc, xdata);
+    if (ret) {
+        if (*xdata) {
+            dict_unref(*xdata);
+            *xdata = NULL;
+        }
+    }
+
+    afr_ta_post_op_unlock(this, loc);
+
+out:
+    if (ret)
+        healer->rerun = 1;
+}
+
+int
+afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+{
+    afr_private_t *priv = NULL;
+    dict_t *xattr = NULL;
     gf_boolean_t need_xattrop = _gf_false;
     void *pending_raw = NULL;
     int *raw = NULL;
@@ -563,7 +677,7 @@ afr_shd_ta_set_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
     int i = 0;
     int j = 0;
     int val = 0;
-    int ret = 0;
+    int ret = -1;
 
     priv = this->private;
 
@@ -598,6 +712,7 @@
                        "not the good shd. Skipping. "
                        "SHD = %d.",
                        healer);
+                ret = 0;
                 GF_FREE(raw);
                 goto out;
             }
@@ -607,7 +722,69 @@
         }
 
         ret = dict_set_bin(xattr, priv->pending_key[i], raw,
-                           AFR_NUM_CHANGE_LOGS * sizeof(int));
+                           AFR_NUM_CHANGE_LOGS * sizeof (int));
         if (ret) {
-            GF_FREE(raw);
+            GF_FREE (raw);
             goto out;
         }
-        memset(pending, 0, sizeof(pending));
+        if (need_xattrop)
+            break;
     }
 
     if (!need_xattrop) {
+        ret = 0;
         goto out;
     }
 
-    flock.l_type = F_WRLCK;
-    flock.l_start = 0;
-    flock.l_len = 0;
-
-    ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
-                         AFR_TA_DOM_NOTIFY, loc, F_SETLKW, &flock, NULL, NULL);
-    if (ret)
-        goto out;
-
     ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
                          GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
     if (ret)
         gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
                "Xattrop failed.");
 
-    flock.l_type = F_UNLCK;
-    syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY,
-                   loc, F_SETLKW, &flock, NULL, NULL);
-
 out:
     if (xattr)
         dict_unref(xattr);
-    return;
+
+    return ret;
 }
 
 void
-afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
+afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc,
+                                  struct subvol_healer *healer,
+                                  dict_t *pre_crawl_xdata)
 {
-    afr_private_t *priv = NULL;
-    dict_t *xattr = NULL;
-    struct iatt stbuf = {
-        0,
-    };
-    int *raw = NULL;
+    int ret_lock = 0;
     int ret = 0;
-    int i = 0;
+    dict_t *post_crawl_xdata = NULL;
 
-    priv = this->private;
-
-    loc->parent = inode_ref(this->itable->root);
-    gf_uuid_copy(loc->pargfid, loc->parent->gfid);
-    loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
-    loc->inode = inode_new(loc->parent->table);
-    if (!loc->inode) {
-        goto out;
-    }
+    ret_lock = afr_ta_post_op_lock(this, loc);
+    if (ret_lock)
+        goto unref;
 
-    ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
-                        0, 0, 0);
-    if (ret) {
-        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
-               "Failed lookup on file %s.", loc->name);
-        goto out;
-    }
-
-    gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
-    gf_uuid_copy(loc->gfid, priv->ta_gfid);
+    ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata);
+    if (ret)
+        goto unref;
 
-    xattr = dict_new();
-    if (!xattr) {
-        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_GET_FAILED,
-               "Failed to create dict.");
-        goto out;
+    if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) {
+        ret = -1;
+        goto unref;
     }
 
-    for (i = 0; i < priv->child_count; i++) {
-        raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
-        if (!raw) {
-            goto out;
-        }
+    ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata,
+                                  healer->subvol);
 
-        ret = dict_set_bin(xattr, priv->pending_key[i], raw,
-                           AFR_NUM_CHANGE_LOGS * sizeof(int));
-        if (ret) {
-            GF_FREE(raw);
-            goto out;
-        }
+unref:
+    if (post_crawl_xdata) {
+        dict_unref(post_crawl_xdata);
+        post_crawl_xdata = NULL;
     }
 
-    ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
-                         GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
-    if (ret) {
-        gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
-               "Xattrop failed.");
-        goto out;
-    }
-    if (!(*xdata))
-        gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_GET_FAILED,
-               "Xdata response is empty.");
+    if (ret || ret_lock)
+        healer->rerun = 1;
 
-out:
-    if (xattr)
-        dict_unref(xattr);
-    return;
+    if (!ret_lock)
+        afr_ta_post_op_unlock(this, loc);
 }
 
 void *
@@ -723,7 +794,7 @@ afr_shd_index_healer(void *data)
     xlator_t *this = NULL;
     int ret = 0;
     afr_private_t *priv = NULL;
-    dict_t *xdata = NULL;
+    dict_t *pre_crawl_xdata = NULL;
     loc_t loc = {
         0,
     };
@@ -739,8 +810,7 @@ afr_shd_index_healer(void *data)
         priv->local[healer->subvol] = healer->local;
 
         if (priv->thin_arbiter_count) {
-            loc_wipe(&loc);
-            afr_shd_ta_get_xattrs(this, &loc, &xdata);
+            afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
         }
 
         do {
@@ -770,15 +840,14 @@ afr_shd_index_healer(void *data)
             sleep(1);
         } while (ret > 0);
 
-        if (xdata && !healer->crawl_event.heal_failed_count) {
-            afr_shd_ta_set_xattrs(this, &loc, &xdata, healer->subvol);
-            dict_unref(xdata);
-            xdata = NULL;
+        if (pre_crawl_xdata && !healer->crawl_event.heal_failed_count) {
+            afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
+                                              pre_crawl_xdata);
+            dict_unref(pre_crawl_xdata);
+            pre_crawl_xdata = NULL;
         }
     }
 
-    loc_wipe(&loc);
-
     return NULL;
 }
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 568293cdf2c..26950fd7927 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -384,6 +384,7 @@ init(xlator_t *this)
         priv->child_count--;
         priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
         priv->ta_notify_dom_lock_offset = 0;
+        *priv->ta_gfid = 0;
     }
     INIT_LIST_HEAD(&priv->healing);
     INIT_LIST_HEAD(&priv->heal_waiting);