cli/ afr: op_ret for index heal launch

Problem: If index heal is launched when some of the bricks are down, the glustershd of that node sends an op_ret of -1 to glusterd, which eventually propagates it to the CLI. Also, glusterd sometimes sends an err_str and sometimes does not (depending on whether the failure happens in the brick-op phase or the commit-op phase). So the message that gets displayed varies in each case: "Launching heal operation to perform index self heal on volume testvol has been unsuccessful" (OR) "Commit failed on <host>. Please check log file for details." Fix: 1. Modify afr_xl_op() to return -1 if index healing fails for even a single brick. 2. Ignore glusterd's error string in gf_cli_heal_volume_cbk and print a more meaningful message. The patch also fixes a bug in glusterfs_handle_translator_op() where, if we encounter an error in the notify of one xlator, we break out of the loop instead of sending the notify to the other xlators. Change-Id: I957f6c4b4d0a45453ffd5488e425cab5a3e0acca BUG: 1302291 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Reviewed-on: http://review.gluster.org/13303 Reviewed-by: Anuradha Talur <atalur@redhat.com> Smoke: Gluster Build System <jenkins@build.gluster.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
author: Ravishankar N <ravishankar@redhat.com> 2016-01-18 12:16:31 +0000
committer: Pranith Kumar Karampuri <pkarampu@redhat.com> 2016-02-11 23:31:00 -0800
commit: da33097c3d6492e3b468b4347e47c70828fb4320 (patch)
tree: d33d4362d311b4a16ebbde2baca8f3e0e1bc2828
parent: e29bf0b1f102308f114e04421d80696eebfbf6e3 (diff)
4 files changed, 13 insertions, 11 deletions
diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c
index 4684679e557..6a296174b30 100644
--- a/cli/src/cli-rpc-ops.c
+++ b/cli/src/cli-rpc-ops.c
@@ -8469,13 +8469,10 @@ gf_cli_heal_volume_cbk (struct rpc_req *req, struct iovec *iov,
         }
 
         if (rsp.op_ret) {
-                if (strcmp (rsp.op_errstr, "")) {
-                        cli_err ("%s", rsp.op_errstr);
-                } else {
-                        cli_err ("%s%s on volume %s has been unsuccessful",
-                                 operation, heal_op_str, volname);
-                }
-
+                cli_err ("%s%s on volume %s has been unsuccessful on "
+                         "bricks that are down. Please check if all brick "
+                         "processes are running.",
+                         operation, heal_op_str, volname);
                 ret = rsp.op_ret;
                 goto out;
         } else {
diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
index 4a3e08a31b0..609567a3c4c 100644
--- a/glusterfsd/src/glusterfsd-mgmt.c
+++ b/glusterfsd/src/glusterfsd-mgmt.c
@@ -559,6 +559,7 @@ int
 glusterfs_handle_translator_op (rpcsvc_request_t *req)
 {
         int32_t                  ret     = -1;
+        int32_t                  op_ret  = 0;
         gd1_mgmt_brick_op_req    xlator_req = {0,};
         dict_t                   *input    = NULL;
         xlator_t                 *xlator = NULL;
@@ -628,9 +629,12 @@ glusterfs_handle_translator_op (rpcsvc_request_t *req)
                 ret = dict_get_str (input, key, &xname);
                 xlator = xlator_search_by_name (any, xname);
                 XLATOR_NOTIFY (xlator, GF_EVENT_TRANSLATOR_OP, input, output);
+                /* If notify fails for an xlator we need to capture it but
+                 * continue with the loop. */
                 if (ret)
-                        break;
+                        op_ret = -1;
         }
+        ret = op_ret;
 out:
         glusterfs_xlator_op_response_send (req, ret, "", output);
         if (input)
diff --git a/tests/basic/afr/arbiter.t b/tests/basic/afr/arbiter.t
index cecbc605541..df392cc6e23 100644
--- a/tests/basic/afr/arbiter.t
+++ b/tests/basic/afr/arbiter.t
@@ -57,7 +57,7 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon on
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-TEST $CLI volume heal $V0
+$CLI volume heal $V0
 EXPECT_WITHIN $HEAL_TIMEOUT '1' echo $(count_sh_entries $B0/$V0"1")
 EXPECT_WITHIN $HEAL_TIMEOUT '1' echo $(count_sh_entries $B0/$V0"2")
 
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 5de512903c4..c539e117607 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -1053,7 +1053,7 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
                 goto out;
         switch (op) {
         case GF_SHD_OP_HEAL_INDEX:
-		op_ret = -1;
+		op_ret = 0;
 
 		for (i = 0; i < priv->child_count; i++) {
 			healer = &shd->index_healers[i];
@@ -1062,10 +1062,12 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
 			if (!priv->child_up[i]) {
 				ret = dict_set_str (output, key,
 						    "Brick is not connected");
+                                op_ret = -1;
 			} else if (AFR_COUNT (priv->child_up,
 					      priv->child_count) < 2) {
 				ret = dict_set_str (output, key,
 						    "< 2 bricks in replica are up");
+                                op_ret = -1;
 			} else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
 				ret = dict_set_str (output, key,
 						    "Brick is remote");
@@ -1073,7 +1075,6 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
 				ret = dict_set_str (output, key,
 						    "Started self-heal");
 				afr_shd_index_healer_spawn (this, i);
-				op_ret = 0;
 			}
 		}
                 break;
author	Ravishankar N <ravishankar@redhat.com>	2016-01-18 12:16:31 +0000
committer	Pranith Kumar Karampuri <pkarampu@redhat.com>	2016-02-11 23:31:00 -0800
commit	da33097c3d6492e3b468b4347e47c70828fb4320 (patch)
tree	d33d4362d311b4a16ebbde2baca8f3e0e1bc2828
parent	e29bf0b1f102308f114e04421d80696eebfbf6e3 (diff)