summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2016-01-10 09:19:34 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2016-03-01 03:23:20 -0800
commit8210ca1a5c0e78e91c6fab7df7e002e39660b706 (patch)
tree432a6836cc685760ee441f4b8e46221947247211
parentea00992d3d52a51b7c8311ad9565bbbb6e395f9d (diff)
afr: Add throttled background client-side heals
If a heal is needed after inode refresh (lookup, read_txn), launch it in the background instead of blocking the fop (that triggered the refresh) until the heal happens. afr_replies_interpret() is modified such that the heal is launched only if at least one sink brick is up. The maximum number of heals that can happen in parallel is configurable via the 'background-self-heal-count' volume option. Any heals beyond that number are put in a wait queue whose length is configurable via the 'heal-wait-queue-length' volume option. If the wait queue is also full, further heals will be ignored. Default values: background-self-heal-count=8, heal-wait-queue-length=128 Change-Id: I1d4a52814cdfd43d90591b6d2ad7b6219937ce70 BUG: 1297172 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Reviewed-on: http://review.gluster.org/13207 Smoke: Gluster Build System <jenkins@build.gluster.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
-rw-r--r--libglusterfs/src/globals.h4
-rw-r--r--tests/basic/afr/client-side-heal.t1
-rwxr-xr-xtests/bugs/glusterd/859927/repl.t4
-rw-r--r--tests/bugs/quota/bug-1035576.t1
-rwxr-xr-xtests/bugs/replicate/bug-802417.t5
-rwxr-xr-xtests/bugs/replicate/bug-977797.t52
-rw-r--r--xlators/cluster/afr/src/afr-common.c89
-rw-r--r--xlators/cluster/afr/src/afr-dir-write.c2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c110
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h3
-rw-r--r--xlators/cluster/afr/src/afr.c26
-rw-r--r--xlators/cluster/afr/src/afr.h28
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c8
13 files changed, 241 insertions, 92 deletions
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
index ae05a3abd15..cd4582a12f8 100644
--- a/libglusterfs/src/globals.h
+++ b/libglusterfs/src/globals.h
@@ -62,6 +62,10 @@
#define GD_OP_VERSION_3_7_7 30707 /* Op-version for GlusterFS 3.7.7 */
+/* Op-version was not bumped up for 3.7.8 */
+
+#define GD_OP_VERSION_3_7_9 30709 /* Op-version for GlusterFS 3.7.9 */
+
#define GD_OP_VERSION_4_0_0 40000 /* Op-version for GlusterFS 4.0.0 */
#define GD_OP_VER_PERSISTENT_AFR_XATTRS GD_OP_VERSION_3_6_0
diff --git a/tests/basic/afr/client-side-heal.t b/tests/basic/afr/client-side-heal.t
index 18f76265b29..d87f4b14063 100644
--- a/tests/basic/afr/client-side-heal.t
+++ b/tests/basic/afr/client-side-heal.t
@@ -70,6 +70,7 @@ EXPECT 7 get_pending_heal_count $V0
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
TEST cat $M0/datafile
+EXPECT_WITHIN $HEAL_TIMEOUT 6 get_pending_heal_count $V0
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
diff --git a/tests/bugs/glusterd/859927/repl.t b/tests/bugs/glusterd/859927/repl.t
index a500961165c..40e86029685 100755
--- a/tests/bugs/glusterd/859927/repl.t
+++ b/tests/bugs/glusterd/859927/repl.t
@@ -23,7 +23,6 @@ TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2};
TEST $CLI volume set $V0 cluster.self-heal-daemon off
TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume set $V0 client-log-level DEBUG
-TEST $CLI volume set $V0 cluster.background-self-heal-count 0
TEST $CLI volume start $V0
TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0;
@@ -34,6 +33,7 @@ EXPECT full volume_option $V0 cluster.data-self-heal-algorithm
create_setup_for_self_heal $M0/a
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
cat $file 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0
TEST cmp $B0/${V0}1/a $B0/${V0}2/a
TEST $CLI volume set $V0 cluster.data-self-heal-algorithm diff
@@ -41,12 +41,14 @@ EXPECT diff volume_option $V0 cluster.data-self-heal-algorithm
create_setup_for_self_heal $M0/a
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
cat $file 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0
TEST cmp $B0/${V0}1/a $B0/${V0}2/a
TEST $CLI volume reset $V0 cluster.data-self-heal-algorithm
create_setup_for_self_heal $M0/a
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
cat $file 2>&1 > /dev/null
+EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0
TEST cmp $B0/${V0}1/a $B0/${V0}2/a
TEST ! $CLI volume set $V0 cluster.data-self-heal-algorithm ""
diff --git a/tests/bugs/quota/bug-1035576.t b/tests/bugs/quota/bug-1035576.t
index e3d32d107d0..12f960c46c6 100644
--- a/tests/bugs/quota/bug-1035576.t
+++ b/tests/bugs/quota/bug-1035576.t
@@ -18,7 +18,6 @@ TEST $CLI volume set $V0 performance.io-cache off
TEST $CLI volume set $V0 performance.write-behind off
TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume set $V0 performance.read-ahead off
-TEST $CLI volume set $V0 background-self-heal-count 0
TEST $CLI volume set $V0 self-heal-daemon off
TEST $CLI volume quota $V0 enable
diff --git a/tests/bugs/replicate/bug-802417.t b/tests/bugs/replicate/bug-802417.t
index df989b1470b..c5ba98b65fd 100755
--- a/tests/bugs/replicate/bug-802417.t
+++ b/tests/bugs/replicate/bug-802417.t
@@ -32,7 +32,6 @@ TEST $CLI volume set $V0 performance.stat-prefetch off
## Make sure automatic self-heal doesn't perturb our results.
TEST $CLI volume set $V0 cluster.self-heal-daemon off
TEST $CLI volume set $V0 cluster.data-self-heal on
-TEST $CLI volume set $V0 cluster.background-self-heal-count 0
## Start volume and verify
TEST $CLI volume start $V0;
@@ -70,8 +69,8 @@ tgt_xattr_2="trusted.afr.${V0}-client-2"
actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_0)
EXPECT "0x000000000000000000000000|^\$" echo $actual
-actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_1)
-EXPECT "0x000000000000000000000000|^\$" echo $actual
+EXPECT_WITHIN $HEAL_TIMEOUT "0x000000000000000000000000" \
+afr_get_changelog_xattr $obs_path_0 $tgt_xattr_1
actual=$(afr_get_changelog_xattr $obs_path_0 $tgt_xattr_2)
EXPECT "0x000000030000000000000000" echo $actual
diff --git a/tests/bugs/replicate/bug-977797.t b/tests/bugs/replicate/bug-977797.t
index 3ff14ecf3d5..72c616ba68e 100755
--- a/tests/bugs/replicate/bug-977797.t
+++ b/tests/bugs/replicate/bug-977797.t
@@ -26,7 +26,6 @@ TEST $CLI volume set $V0 quick-read off
TEST $CLI volume set $V0 read-ahead off
TEST $CLI volume set $V0 write-behind off
TEST $CLI volume set $V0 io-cache off
-TEST $CLI volume set $V0 background-self-heal-count 0
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
@@ -56,34 +55,29 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1;
TEST dd if=$M0/a/file of=/dev/null bs=1024k
-b1c0dir=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a \
- trusted.afr.$V0-client-0 "entry")
-b1c1dir=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a \
- trusted.afr.$V0-client-1 "entry")
-b2c0dir=$(afr_get_specific_changelog_xattr \
- $B0/$V0"2"/a trusted.afr.$V0-client-0 "entry")
-b2c1dir=$(afr_get_specific_changelog_xattr \
- $B0/$V0"2"/a trusted.afr.$V0-client-1 "entry")
-
-
-b1c0f=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a/file \
- trusted.afr.$V0-client-0 "data")
-b1c1f=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a/file \
- trusted.afr.$V0-client-1 "data")
-b2c0f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \
- trusted.afr.$V0-client-0 "data")
-b2c1f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \
- trusted.afr.$V0-client-1 "data")
-
-EXPECT "00000000|^$" echo $b1c0f
-EXPECT "00000000|^$" echo $b1c1f
-EXPECT "00000000|^$" echo $b2c0f
-EXPECT "00000000|^$" echo $b2c1f
-
-EXPECT "00000000|^$" echo $b1c0dir
-EXPECT "00000000|^$" echo $b1c1dir
-EXPECT "00000000|^$" echo $b2c0dir
-EXPECT "00000000|^$" echo $b2c1dir
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
+afr_get_specific_changelog_xattr $B0/$V0"1"/a/file trusted.afr.$V0-client-0 "data"
+
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
+afr_get_specific_changelog_xattr $B0/$V0"1"/a/file trusted.afr.$V0-client-1 "data"
+
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
+afr_get_specific_changelog_xattr $B0/$V0"2"/a/file trusted.afr.$V0-client-0 "data"
+
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
+afr_get_specific_changelog_xattr $B0/$V0"2"/a/file trusted.afr.$V0-client-1 "data"
+
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
+afr_get_specific_changelog_xattr $B0/$V0"1"/a trusted.afr.$V0-client-0 "entry"
+
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
+afr_get_specific_changelog_xattr $B0/$V0"1"/a trusted.afr.$V0-client-1 "entry"
+
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
+afr_get_specific_changelog_xattr $B0/$V0"2"/a trusted.afr.$V0-client-0 "entry"
+
+EXPECT_WITHIN $HEAL_TIMEOUT "00000000" \
+afr_get_specific_changelog_xattr $B0/$V0"2"/a trusted.afr.$V0-client-1 "entry"
## Finish up
TEST $CLI volume stop $V0;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 457f7865cec..1f68b87ef94 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -663,7 +663,8 @@ afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
int
-afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode)
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t *start_heal)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -734,6 +735,13 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode)
}
}
+ for (i = 0; i < priv->child_count; i++) {
+ if (start_heal && priv->child_up[i] &&
+ (!data_readable[i] || !metadata_readable[i])) {
+ *start_heal = _gf_true;
+ break;
+ }
+ }
afr_inode_read_subvol_set (inode, this, data_readable,
metadata_readable, event_generation);
return ret;
@@ -772,36 +780,6 @@ ret:
return -err;
}
-
-int
-afr_refresh_selfheal_wrap (void *opaque)
-{
- call_frame_t *frame = opaque;
- afr_local_t *local = NULL;
- xlator_t *this = NULL;
- int err = 0;
-
- local = frame->local;
- this = frame->this;
-
- afr_selfheal (frame->this, local->refreshinode->gfid);
-
- afr_selfheal_unlocked_discover (frame, local->refreshinode,
- local->refreshinode->gfid,
- local->replies);
-
- afr_replies_interpret (frame, this, local->refreshinode);
-
- err = afr_inode_refresh_err (frame, this);
-
- afr_local_replies_wipe (local, this->private);
-
- local->refreshfn (frame, this, err);
-
- return 0;
-}
-
-
gf_boolean_t
afr_selfheal_enabled (xlator_t *this)
{
@@ -817,35 +795,43 @@ afr_selfheal_enabled (xlator_t *this)
return data || priv->metadata_self_heal || priv->entry_self_heal;
}
-
int
afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
{
- call_frame_t *heal = NULL;
+ call_frame_t *heal_frame = NULL;
afr_local_t *local = NULL;
+ gf_boolean_t start_heal = _gf_false;
+ afr_local_t *heal_local = NULL;
+ int op_errno = ENOMEM;
int ret = 0;
int err = 0;
local = frame->local;
- ret = afr_replies_interpret (frame, this, local->refreshinode);
+ ret = afr_replies_interpret (frame, this, local->refreshinode,
+ &start_heal);
err = afr_inode_refresh_err (frame, this);
afr_local_replies_wipe (local, this->private);
- if (ret && afr_selfheal_enabled (this)) {
- heal = copy_frame (frame);
- if (heal)
- heal->root->pid = GF_CLIENT_PID_SELF_HEALD;
- ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap,
- afr_refresh_selfheal_done, heal, frame);
- if (ret)
- goto refresh_done;
- } else {
- refresh_done:
- local->refreshfn (frame, this, err);
- }
+ if (ret && afr_selfheal_enabled (this) && start_heal) {
+ heal_frame = copy_frame (frame);
+ if (!heal_frame)
+ goto refresh_done;
+ heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD;
+ heal_local = AFR_FRAME_INIT (heal_frame, op_errno);
+ if (!heal_local) {
+ AFR_STACK_DESTROY (heal_frame);
+ goto refresh_done;
+ }
+ heal_local->refreshinode = inode_ref (local->refreshinode);
+ heal_local->heal_frame = heal_frame;
+ afr_throttled_selfheal (heal_frame, this);
+ }
+
+refresh_done:
+ local->refreshfn (frame, this, err);
return 0;
}
@@ -1758,7 +1744,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
*/
gf_uuid_copy (args.gfid, read_gfid);
args.ia_type = ia_type;
- if (afr_replies_interpret (frame, this, local->inode)) {
+ if (afr_replies_interpret (frame, this, local->inode, NULL)) {
read_subvol = afr_read_subvol_decide (local->inode,
this, &args);
afr_inode_read_subvol_reset (local->inode, this);
@@ -2214,7 +2200,7 @@ afr_discover_done (call_frame_t *frame, xlator_t *this)
goto unwind;
}
- afr_replies_interpret (frame, this, local->inode);
+ afr_replies_interpret (frame, this, local->inode, NULL);
read_subvol = afr_read_subvol_decide (local->inode, this, NULL);
if (read_subvol == -1) {
@@ -3863,6 +3849,12 @@ afr_priv_dump (xlator_t *this)
gf_proc_dump_write("favorite_child", "%d", priv->favorite_child);
gf_proc_dump_write("wait_count", "%u", priv->wait_count);
gf_proc_dump_write("quorum-reads", "%d", priv->quorum_reads);
+ gf_proc_dump_write("heal-wait-queue-length", "%d",
+ priv->heal_wait_qlen);
+ gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters);
+ gf_proc_dump_write("background-self-heal-count", "%d",
+ priv->background_self_heal_count);
+ gf_proc_dump_write("healers", "%d", priv->healers);
return 0;
}
@@ -4169,6 +4161,7 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
+ INIT_LIST_HEAD (&local->healer);
return 0;
out:
return -1;
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 3f4ad246992..741a07ad9c0 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -88,7 +88,7 @@ __afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
priv = this->private;
if (local->inode) {
- afr_replies_interpret (frame, this, local->inode);
+ afr_replies_interpret (frame, this, local->inode, NULL);
inode_read_subvol = afr_data_subvol_get (local->inode, this,
NULL, NULL, NULL);
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 49c6bd0cc98..74e1a444069 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -15,6 +15,9 @@
#include "protocol-common.h"
#include "afr-messages.h"
+void
+afr_heal_synctask (xlator_t *this, afr_local_t *local);
+
int
afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
@@ -1422,3 +1425,110 @@ afr_selfheal (xlator_t *this, uuid_t gfid)
return ret;
}
+
+/*
+ * Promote the first entry of the heal wait queue to the healing list.
+ *
+ * Nothing is promoted when the wait queue is empty, or when
+ * background_self_heal_count is non-zero and at least that many heals are
+ * already counted in priv->healers; a debug message is logged in that case.
+ * Both callers visible in this file invoke this with priv->lock held.
+ *
+ * Returns the promoted afr_local_t, or NULL when nothing was dequeued.
+ */
+afr_local_t*
+__afr_dequeue_heals (afr_private_t *priv)
+{
+        afr_local_t *next = NULL;
+
+        if (list_empty (&priv->heal_waiting) ||
+            ((priv->background_self_heal_count > 0) &&
+             (priv->healers >= priv->background_self_heal_count))) {
+                gf_msg_debug (THIS->name, 0, "Nothing dequeued. "
+                              "Num healers: %d, Num Waiters: %d",
+                              priv->healers, priv->heal_waiters);
+                return NULL;
+        }
+
+        next = list_entry (priv->heal_waiting.next, afr_local_t, healer);
+
+        /* One waiter fewer ... */
+        priv->heal_waiters--;
+        GF_ASSERT (priv->heal_waiters >= 0);
+
+        /* ... and one healer more. */
+        list_del_init (&next->healer);
+        list_add (&next->healer, &priv->healing);
+        priv->healers++;
+
+        return next;
+}
+
+/*
+ * Synctask body for a background heal: runs afr_selfheal() on the gfid of
+ * the inode stored as refreshinode in the heal frame's local, and returns
+ * whatever afr_selfheal() returns.
+ *
+ * @opaque: the heal call_frame_t that was handed to synctask_new().
+ */
+int
+afr_refresh_selfheal_wrap (void *opaque)
+{
+        call_frame_t *task_frame = opaque;
+        afr_local_t  *heal_local = task_frame->local;
+
+        return afr_selfheal (task_frame->this,
+                             heal_local->refreshinode->gfid);
+}
+
+/*
+ * Completion callback for a background heal synctask; afr_heal_synctask()
+ * also calls it directly when the task could not be created.
+ *
+ * Under priv->lock it unlinks the finished heal from the list it is on,
+ * decrements priv->healers and tries to promote the next waiting heal.
+ * The finished heal frame is then destroyed and, if a waiter was promoted,
+ * that waiter's synctask is started.
+ *
+ * @ret:    result of the heal task (not used here).
+ * @frame:  frame argument supplied with the callback (not used here).
+ * @opaque: the heal call_frame_t; must not be NULL, it is dereferenced
+ *          unconditionally.
+ *
+ * Always returns 0.
+ */
+int
+afr_refresh_heal_done (int ret, call_frame_t *frame, void *opaque)
+{
+        call_frame_t  *heal_frame = opaque;
+        xlator_t      *this = heal_frame->this;
+        afr_private_t *priv = this->private;
+        afr_local_t   *finished = heal_frame->local;
+        afr_local_t   *next = NULL;
+
+        LOCK (&priv->lock);
+        {
+                list_del_init (&finished->healer);
+                priv->healers--;
+                GF_ASSERT (priv->healers >= 0);
+                next = __afr_dequeue_heals (priv);
+        }
+        UNLOCK (&priv->lock);
+
+        /* heal_frame has already been dereferenced above, so a NULL check
+         * at this point could never fire; destroy it unconditionally. */
+        AFR_STACK_DESTROY (heal_frame);
+
+        if (next)
+                afr_heal_synctask (this, next);
+
+        return 0;
+}
+
+/*
+ * Start the synctask that performs the heal described by @local.
+ *
+ * The task runs afr_refresh_selfheal_wrap() and completes through
+ * afr_refresh_heal_done(); local->heal_frame is passed as both the frame
+ * and the opaque argument.  If the task cannot be created,
+ * afr_refresh_heal_done() is called directly with the failure code.
+ */
+void
+afr_heal_synctask (xlator_t *this, afr_local_t *local)
+{
+        call_frame_t *task_frame = local->heal_frame;
+        int           status = 0;
+
+        status = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap,
+                               afr_refresh_heal_done, task_frame, task_frame);
+        if (status >= 0)
+                return;
+
+        /* Heal not launched. Will be queued when the next inode
+         * refresh happens and shd hasn't healed it yet. */
+        afr_refresh_heal_done (status, task_frame, task_frame);
+}
+
+/*
+ * Submit a background heal for the heal frame @frame, subject to the
+ * configured limits.
+ *
+ * The request is accepted only when background_self_heal_count is non-zero
+ * and the number of waiting plus active heals is below
+ * heal_wait_qlen + background_self_heal_count.  An accepted request is
+ * appended to the wait queue and one dequeue attempt is made; if that
+ * yields a heal, its synctask is launched.  An accepted request that is not
+ * dequeued simply stays on the wait queue (__afr_dequeue_heals() logs that
+ * nothing was dequeued) until afr_refresh_heal_done() promotes it.
+ *
+ * A rejected request is dropped.  The caller visible in this change does
+ * not touch the heal frame after this call, so the frame is destroyed here;
+ * otherwise it would be leaked on every rejection (including the case
+ * where background_self_heal_count is 0).
+ *
+ * @frame: heal frame; its local is the afr_local_t that gets queued.
+ * @this:  xlator whose private data holds the heal queues and counters.
+ */
+void
+afr_throttled_selfheal (call_frame_t *frame, xlator_t *this)
+{
+        gf_boolean_t   can_heal = _gf_true;
+        afr_private_t *priv = this->private;
+        afr_local_t   *local = frame->local;
+
+        LOCK (&priv->lock);
+        {
+                if ((priv->background_self_heal_count > 0) &&
+                    (priv->heal_wait_qlen + priv->background_self_heal_count) >
+                    (priv->heal_waiters + priv->healers)) {
+                        list_add_tail (&local->healer, &priv->heal_waiting);
+                        priv->heal_waiters++;
+                        local = __afr_dequeue_heals (priv);
+                } else {
+                        can_heal = _gf_false;
+                }
+        }
+        UNLOCK (&priv->lock);
+
+        if (!can_heal) {
+                /* Report the rejection on the path that actually rejects,
+                 * and release the frame outside the lock. */
+                gf_msg_debug (this->name, 0, "Max number of heals are "
+                              "pending, background self-heal rejected.");
+                AFR_STACK_DESTROY (frame);
+                return;
+        }
+
+        /* local is NULL when the request was queued but every healer slot
+         * is busy; it will be started when a running heal finishes. */
+        if (local)
+                afr_heal_synctask (this, local);
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 74e852aa038..b298fa130c3 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -85,6 +85,9 @@
int
afr_selfheal (xlator_t *this, uuid_t gfid);
+void
+afr_throttled_selfheal (call_frame_t *frame, xlator_t *this);
+
int
afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name,
void *gfid_req);
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 98e4d3e2699..535f3e5b946 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -124,6 +124,10 @@ reconfigure (xlator_t *this, dict_t *options)
priv->background_self_heal_count, options, uint32,
out);
+ GF_OPTION_RECONF ("heal-wait-queue-length",
+ priv->heal_wait_qlen, options, uint32, out);
+
+
GF_OPTION_RECONF ("metadata-self-heal",
priv->metadata_self_heal, options, bool, out);
@@ -275,6 +279,8 @@ init (xlator_t *this)
priv->read_child = -1;
GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out);
+ INIT_LIST_HEAD (&priv->healing);
+ INIT_LIST_HEAD (&priv->heal_waiting);
priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
@@ -327,6 +333,9 @@ init (xlator_t *this)
GF_OPTION_INIT ("background-self-heal-count",
priv->background_self_heal_count, uint32, out);
+ GF_OPTION_INIT ("heal-wait-queue-length",
+ priv->heal_wait_qlen, uint32, out);
+
GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out);
GF_OPTION_INIT ("data-self-heal-algorithm",
@@ -599,10 +608,21 @@ struct volume_options options[] = {
{ .key = {"background-self-heal-count"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
- .default_value = "16",
+ .max = 256,
+ .default_value = "8",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "This specifies the number of per client self-heal "
+ "jobs that can perform parallel heals in the "
+ "background."
+ },
+ { .key = {"heal-wait-queue-length"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
+ .default_value = "128",
.validate = GF_OPT_VALIDATE_MIN,
- .description = "This specifies the number of self-heals that can be "
- " performed in background without blocking the fop"
+ .description = "This specifies the number of heals that can be queued"
+ " for the parallel background self heal jobs."
},
{ .key = {"data-self-heal"},
.type = GF_OPTION_TYPE_STR,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 6f75e07f95c..7bb5aa8ded2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -69,8 +69,17 @@ typedef struct _afr_private {
unsigned int data_self_heal_window_size; /* max number of pipelined
read/writes */
- unsigned int background_self_heal_count;
- unsigned int background_self_heals_started;
+ struct list_head heal_waiting; /*queue for files that need heal*/
+ uint32_t heal_wait_qlen; /*configurable queue length for heal_waiting*/
+ int32_t heal_waiters; /* No. of elements currently in wait queue.*/
+
+ struct list_head healing;/* queue for files that are undergoing
+ background heal*/
+ uint32_t background_self_heal_count;/*configurable queue length for
+ healing queue*/
+ int32_t healers;/* No. of elements currently undergoing background
+ heal*/
+
gf_boolean_t metadata_self_heal; /* on/off */
gf_boolean_t entry_self_heal; /* on/off */
@@ -122,12 +131,14 @@ typedef struct _afr_private {
afr_self_heald_t shd;
- /* pump dependencies */
- void *pump_private;
- gf_boolean_t use_afr_in_pump;
gf_boolean_t consistent_metadata;
uint64_t spb_choice_timeout;
gf_boolean_t need_heal;
+
+ /* pump dependencies */
+ void *pump_private;
+ gf_boolean_t use_afr_in_pump;
+
} afr_private_t;
@@ -740,6 +751,10 @@ typedef struct _afr_local {
int xflag;
gf_boolean_t do_discovery;
struct afr_reply *replies;
+
+ /* For client side background heals. */
+ struct list_head healer;
+ call_frame_t *heal_frame;
} afr_local_t;
@@ -891,7 +906,8 @@ int
afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
int
-afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t *start_heal);
void
afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv);
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index dd9d159f0ca..96026a98d95 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1119,6 +1119,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_7_0,
.flags = OPT_FLAG_CLIENT_OPT
},
+ { .key = "cluster.heal-wait-queue-length",
+ .voltype = "cluster/replicate",
+ .type = DOC,
+ .op_version = GD_OP_VERSION_3_7_9,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* stripe xlator options */
{ .key = "cluster.stripe-block-size",
.voltype = "cluster/stripe",
.option = "block-size",