diff options
author | Amar Tumballi <amar@gluster.com> | 2011-09-09 09:42:51 +0530 |
---|---|---|
committer | Vijay Bellur <vijay@gluster.com> | 2011-09-13 02:10:12 -0700 |
commit | 25daa42911d2ff697880ee29c591cac5f2abebed (patch) | |
tree | 9555284c052e1e205909e91f578a8b46b522ec56 | |
parent | 17e57f27c714c94dd5d9fa91650f83d069f2f4e4 (diff) |
support for de-commissioning a node using 'remove-brick'
To achieve this, we now create the volume file with the
'decommissioned-bricks' option in the distribute volume, then just
perform the rebalance set of operations (with the 'force' flag set).
From now on, the 'remove-brick' operation (with the 'start' option) tries
to migrate data from the removed bricks to the remaining bricks.
'remove-brick' also supports options similar to those of replace-brick.
* (no options) -> works as 'force'; keeps the current behavior
of remove-brick, i.e., no data migration, only volume changes.
* start (starts remove-brick with data-migration/draining process,
which takes care of migrating data and once complete, will
commit the changes to volume file)
* pause (stops data migration, but keeps the volume file intact with
whatever extra options are set)
* abort (stop data-migration, and fall back to old configuration)
* commit (if the volume is stopped, commits the changes to the volume file)
* force (stops the data-migration and commits the changes to
volume file)
Change-Id: I3952bcfbe604a0952e68b6accace7014d5e401d3
BUG: 1952
Reviewed-on: http://review.gluster.com/118
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vijay@gluster.com>
22 files changed, 855 insertions, 177 deletions
diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c index 3e68b2cce..8ea6581af 100644 --- a/cli/src/cli-cmd-parser.c +++ b/cli/src/cli-cmd-parser.c @@ -753,7 +753,7 @@ out: int32_t cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, - dict_t **options) + dict_t **options, int *question) { dict_t *dict = NULL; char *volname = NULL; @@ -765,6 +765,10 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, int32_t j = 0; char *tmp_brick = NULL; char *tmp_brick1 = NULL; + char *opwords[] = { "start", "commit", "pause", "abort", "status", + "force", NULL }; + char *w = NULL; + int32_t command = GF_OP_CMD_NONE; GF_ASSERT (words); GF_ASSERT (options); @@ -782,19 +786,53 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, GF_ASSERT (volname); ret = dict_set_str (dict, "volname", volname); - if (ret) goto out; + w = str_getunamb (words[wordcount - 1], opwords); + if (!w) { + /* Should be default 'force' */ + command = GF_OP_CMD_COMMIT_FORCE; + if (question) + *question = 1; + } else { + /* handled this option */ + wordcount--; + if (!strcmp ("start", w)) { + command = GF_OP_CMD_START; + } else if (!strcmp ("commit", w)) { + command = GF_OP_CMD_COMMIT; + if (question) + *question = 1; + } else if (!strcmp ("pause", w)) { + command = GF_OP_CMD_PAUSE; + } else if (!strcmp ("abort", w)) { + command = GF_OP_CMD_ABORT; + } else if (!strcmp ("status", w)) { + command = GF_OP_CMD_STATUS; + } else if (!strcmp ("force", w)) { + command = GF_OP_CMD_COMMIT_FORCE; + if (question) + *question = 1; + } else { + GF_ASSERT (!"opword mismatch"); + ret = -1; + goto out; + } + } + if (wordcount < 4) { ret = -1; goto out; } - brick_index = 3; - + ret = dict_set_int32 (dict, "command", command); if (ret) - goto out; + gf_log ("cli", GF_LOG_INFO, "failed to set 'command' %d", + command); + + + brick_index = 3; tmp_index = brick_index; tmp_brick = GF_MALLOC(2048 * sizeof(*tmp_brick), gf_common_mt_char); @@ -805,7 +843,7 @@ 
cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, ret = -1; goto out; } - + tmp_brick1 = GF_MALLOC(2048 * sizeof(*tmp_brick1), gf_common_mt_char); if (!tmp_brick1) { @@ -850,7 +888,6 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, } ret = dict_set_int32 (dict, "count", brick_count); - if (ret) goto out; diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index 68c5ef578..16dc32328 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -800,6 +800,7 @@ cli_cmd_volume_remove_brick_cbk (struct cli_state *state, gf_answer_t answer = GF_ANSWER_NO; int sent = 0; int parse_error = 0; + int need_question = 0; const char *question = "Removing brick(s) can result in data loss. " "Do you want to Continue?"; @@ -808,7 +809,8 @@ cli_cmd_volume_remove_brick_cbk (struct cli_state *state, if (!frame) goto out; - ret = cli_cmd_volume_remove_brick_parse (words, wordcount, &options); + ret = cli_cmd_volume_remove_brick_parse (words, wordcount, &options, + &need_question); if (ret) { cli_usage_out (word->pattern); @@ -816,11 +818,13 @@ cli_cmd_volume_remove_brick_cbk (struct cli_state *state, goto out; } - answer = cli_cmd_get_confirmation (state, question); - - if (GF_ANSWER_NO == answer) { - ret = 0; - goto out; + if (!(state->mode & GLUSTER_MODE_SCRIPT) && need_question) { + /* we need to ask question only in case of 'commit or force' */ + answer = cli_cmd_get_confirmation (state, question); + if (GF_ANSWER_NO == answer) { + ret = 0; + goto out; + } } proc = &cli_rpc_prog->proctable[GLUSTER_CLI_REMOVE_BRICK]; @@ -1304,7 +1308,7 @@ struct cli_cmd volume_cmds[] = { cli_cmd_volume_add_brick_cbk, "add brick to volume <VOLNAME>"}, - { "volume remove-brick <VOLNAME> <BRICK> ...", + { "volume remove-brick <VOLNAME> <BRICK> ... 
{start|pause|abort|status|commit|force}", cli_cmd_volume_remove_brick_cbk, "remove brick from volume <VOLNAME>"}, diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 3d6ce25ef..d7a5988f2 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -856,23 +856,36 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, "rebalance process"); goto done; } - if (rsp.op_errno == 0) + + switch (rsp.op_errno) { + case GF_DEFRAG_STATUS_NOT_STARTED: status = "not started"; - if (rsp.op_errno == 1) + break; + case GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED: status = "step 1: layout fix in progress"; - if (rsp.op_errno == 2) + break; + case GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED: status = "step 2: data migration in progress"; - if (rsp.op_errno == 3) + break; + case GF_DEFRAG_STATUS_STOPPED: status = "stopped"; - if (rsp.op_errno == 4) + break; + case GF_DEFRAG_STATUS_COMPLETE: status = "completed"; - if (rsp.op_errno == 5) + break; + case GF_DEFRAG_STATUS_FAILED: status = "failed"; - if (rsp.op_errno == 6) + break; + case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE: status = "step 1: layout fix complete"; - if (rsp.op_errno == 7) + break; + case GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE: status = "step 2: data migration complete"; - + break; + case GF_DEFRAG_STATUS_PAUSED: + status = "paused"; + break; + } if (rsp.files && (rsp.op_errno == 1)) { cli_out ("rebalance %s: fixed layout %"PRId64, status, rsp.files); @@ -1064,6 +1077,87 @@ out: return ret; } +int +gf_cli3_remove_brick_status_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + gf2_cli_defrag_vol_rsp rsp = {0,}; + char *status = "unknown"; + int ret = 0; + + if (-1 == req->rpc_status) { + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, + (xdrproc_t)xdr_gf2_cli_defrag_vol_rsp); + if (ret < 0) { + gf_log ("", GF_LOG_ERROR, "error"); + goto out; + } + + ret = rsp.op_ret; + if (rsp.op_ret == -1) { + if (strcmp (rsp.op_errstr, "")) + cli_out ("%s", rsp.op_errstr); + else + 
cli_out ("failed to get the status of " + "remove-brick process"); + goto out; + } + + switch (rsp.op_errno) { + case GF_DEFRAG_STATUS_NOT_STARTED: + status = "not started"; + break; + case GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED: + case GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED: + case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE: + status = "in progress"; + break; + case GF_DEFRAG_STATUS_STOPPED: + status = "stopped"; + break; + case GF_DEFRAG_STATUS_COMPLETE: + case GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE: + status = "completed"; + break; + case GF_DEFRAG_STATUS_FAILED: + status = "failed"; + break; + case GF_DEFRAG_STATUS_PAUSED: + status = "paused"; + break; + } + + if (rsp.files && (rsp.op_errno == 1)) { + cli_out ("remove-brick %s: fixed layout %"PRId64, + status, rsp.files); + goto out; + } + if (rsp.files && (rsp.op_errno == 6)) { + cli_out ("remove-brick %s: fixed layout %"PRId64, + status, rsp.files); + goto out; + } + if (rsp.files) { + cli_out ("remove-brick %s: decommissioned %"PRId64 + " files of size %"PRId64, status, + rsp.files, rsp.size); + goto out; + } + + cli_out ("remove-brick %s", status); + +out: + if (rsp.op_errstr) + free (rsp.op_errstr); //malloced by xdr + if (rsp.volname) + free (rsp.volname); //malloced by xdr + cli_cmd_broadcast_response (ret); + return ret; +} + int gf_cli3_1_remove_brick_cbk (struct rpc_req *req, struct iovec *iov, @@ -2160,8 +2254,11 @@ gf_cli3_1_remove_brick (call_frame_t *frame, xlator_t *this, void *data) { gf1_cli_remove_brick_req req = {0,}; + gf1_cli_defrag_vol_req status_req = {0,}; int ret = 0; - dict_t *dict = NULL; + dict_t *dict = NULL; + int32_t command = 0; + char *volname = NULL; if (!frame || !this || !data) { ret = -1; @@ -2170,30 +2267,45 @@ gf_cli3_1_remove_brick (call_frame_t *frame, xlator_t *this, dict = data; - ret = dict_get_str (dict, "volname", &req.volname); - + ret = dict_get_str (dict, "volname", &volname); if (ret) goto out; ret = dict_get_int32 (dict, "count", &req.count); - if (ret) goto out; - ret = 
dict_allocate_and_serialize (dict, - &req.bricks.bricks_val, - (size_t *)&req.bricks.bricks_len); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "failed to get serialized length of dict"); + ret = dict_get_int32 (dict, "command", &command); + if (ret) goto out; - } - ret = cli_cmd_submit (&req, frame, cli_rpc_prog, - GLUSTER_CLI_REMOVE_BRICK, NULL, - this, gf_cli3_1_remove_brick_cbk, - (xdrproc_t) xdr_gf1_cli_remove_brick_req); + if (command != GF_OP_CMD_STATUS) { + req.volname = volname; + + ret = dict_allocate_and_serialize (dict, + &req.bricks.bricks_val, + (size_t *)&req.bricks.bricks_len); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get serialized length of dict"); + goto out; + } + + ret = cli_cmd_submit (&req, frame, cli_rpc_prog, + GLUSTER_CLI_REMOVE_BRICK, NULL, + this, gf_cli3_1_remove_brick_cbk, + (xdrproc_t) xdr_gf1_cli_remove_brick_req); + } else { + /* Need rebalance status to e sent :-) */ + status_req.volname = volname; + status_req.cmd = GF_DEFRAG_CMD_STATUS; + ret = cli_cmd_submit (&status_req, frame, cli_rpc_prog, + GLUSTER_CLI_DEFRAG_VOLUME, NULL, + this, gf_cli3_remove_brick_status_cbk, + (xdrproc_t) xdr_gf1_cli_defrag_vol_req); + + } out: gf_log ("cli", GF_LOG_DEBUG, "Returning %d", ret); diff --git a/cli/src/cli.h b/cli/src/cli.h index d3e1fc21b..1e0d69cd8 100644 --- a/cli/src/cli.h +++ b/cli/src/cli.h @@ -212,7 +212,7 @@ cli_cmd_volume_add_brick_parse (const char **words, int wordcount, int32_t cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, - dict_t **options); + dict_t **options, int *question); int32_t cli_cmd_volume_replace_brick_parse (const char **words, int wordcount, diff --git a/rpc/xdr/src/cli1-xdr.c b/rpc/xdr/src/cli1-xdr.c index 250efc935..9030d3067 100644 --- a/rpc/xdr/src/cli1-xdr.c +++ b/rpc/xdr/src/cli1-xdr.c @@ -38,6 +38,17 @@ xdr_gf_cli_defrag_type (XDR *xdrs, gf_cli_defrag_type *objp) } bool_t +xdr_gf_defrag_status_t (XDR *xdrs, gf_defrag_status_t *objp) +{ + register 
int32_t *buf; + buf = NULL; + + if (!xdr_enum (xdrs, (enum_t *) objp)) + return FALSE; + return TRUE; +} + +bool_t xdr_gf1_cluster_type (XDR *xdrs, gf1_cluster_type *objp) { register int32_t *buf; @@ -60,6 +71,17 @@ xdr_gf1_cli_replace_op (XDR *xdrs, gf1_cli_replace_op *objp) } bool_t +xdr_gf1_op_commands (XDR *xdrs, gf1_op_commands *objp) +{ + register int32_t *buf; + buf = NULL; + + if (!xdr_enum (xdrs, (enum_t *) objp)) + return FALSE; + return TRUE; +} + +bool_t xdr_gf_quota_type (XDR *xdrs, gf_quota_type *objp) { register int32_t *buf; diff --git a/rpc/xdr/src/cli1-xdr.h b/rpc/xdr/src/cli1-xdr.h index 0d606e79f..d502c30b4 100644 --- a/rpc/xdr/src/cli1-xdr.h +++ b/rpc/xdr/src/cli1-xdr.h @@ -42,9 +42,23 @@ enum gf_cli_defrag_type { GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, GF_DEFRAG_CMD_START_MIGRATE_DATA = 1 + 4, GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE = 1 + 5, + GF_DEFRAG_CMD_START_FORCE = 1 + 6, }; typedef enum gf_cli_defrag_type gf_cli_defrag_type; +enum gf_defrag_status_t { + GF_DEFRAG_STATUS_NOT_STARTED = 0, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED = 1, + GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED = 2, + GF_DEFRAG_STATUS_STOPPED = 3, + GF_DEFRAG_STATUS_COMPLETE = 4, + GF_DEFRAG_STATUS_FAILED = 5, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE = 6, + GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE = 7, + GF_DEFRAG_STATUS_PAUSED = 8, +}; +typedef enum gf_defrag_status_t gf_defrag_status_t; + enum gf1_cluster_type { GF_CLUSTER_TYPE_NONE = 0, GF_CLUSTER_TYPE_STRIPE = 0 + 1, @@ -64,6 +78,17 @@ enum gf1_cli_replace_op { }; typedef enum gf1_cli_replace_op gf1_cli_replace_op; +enum gf1_op_commands { + GF_OP_CMD_NONE = 0, + GF_OP_CMD_START = 0 + 1, + GF_OP_CMD_COMMIT = 0 + 2, + GF_OP_CMD_PAUSE = 0 + 3, + GF_OP_CMD_ABORT = 0 + 4, + GF_OP_CMD_STATUS = 0 + 5, + GF_OP_CMD_COMMIT_FORCE = 0 + 6, +}; +typedef enum gf1_op_commands gf1_op_commands; + enum gf_quota_type { GF_QUOTA_OPTION_TYPE_NONE = 0, GF_QUOTA_OPTION_TYPE_ENABLE = 0 + 1, @@ -593,8 +618,10 @@ typedef struct gf1_cli_umount_rsp 
gf1_cli_umount_rsp; #if defined(__STDC__) || defined(__cplusplus) extern bool_t xdr_gf_cli_defrag_type (XDR *, gf_cli_defrag_type*); +extern bool_t xdr_gf_defrag_status_t (XDR *, gf_defrag_status_t*); extern bool_t xdr_gf1_cluster_type (XDR *, gf1_cluster_type*); extern bool_t xdr_gf1_cli_replace_op (XDR *, gf1_cli_replace_op*); +extern bool_t xdr_gf1_op_commands (XDR *, gf1_op_commands*); extern bool_t xdr_gf_quota_type (XDR *, gf_quota_type*); extern bool_t xdr_gf1_cli_friends_list (XDR *, gf1_cli_friends_list*); extern bool_t xdr_gf1_cli_get_volume (XDR *, gf1_cli_get_volume*); @@ -663,8 +690,10 @@ extern bool_t xdr_gf1_cli_umount_rsp (XDR *, gf1_cli_umount_rsp*); #else /* K&R C */ extern bool_t xdr_gf_cli_defrag_type (); +extern bool_t xdr_gf_defrag_status_t (); extern bool_t xdr_gf1_cluster_type (); extern bool_t xdr_gf1_cli_replace_op (); +extern bool_t xdr_gf1_op_commands (); extern bool_t xdr_gf_quota_type (); extern bool_t xdr_gf1_cli_friends_list (); extern bool_t xdr_gf1_cli_get_volume (); diff --git a/rpc/xdr/src/cli1-xdr.x b/rpc/xdr/src/cli1-xdr.x index 9fc9f02d2..ff2f09af3 100644 --- a/rpc/xdr/src/cli1-xdr.x +++ b/rpc/xdr/src/cli1-xdr.x @@ -4,7 +4,20 @@ GF_DEFRAG_CMD_STATUS, GF_DEFRAG_CMD_START_LAYOUT_FIX, GF_DEFRAG_CMD_START_MIGRATE_DATA, - GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE + GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE, + GF_DEFRAG_CMD_START_FORCE /* used by remove-brick data migration */ +} ; + + enum gf_defrag_status_t { + GF_DEFRAG_STATUS_NOT_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED, + GF_DEFRAG_STATUS_STOPPED, + GF_DEFRAG_STATUS_COMPLETE, + GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE, + GF_DEFRAG_STATUS_PAUSED } ; enum gf1_cluster_type { @@ -24,6 +37,16 @@ GF_REPLACE_OP_COMMIT_FORCE } ; + enum gf1_op_commands { + GF_OP_CMD_NONE = 0, + GF_OP_CMD_START, + GF_OP_CMD_COMMIT, + GF_OP_CMD_PAUSE, + GF_OP_CMD_ABORT, + GF_OP_CMD_STATUS, + 
GF_OP_CMD_COMMIT_FORCE +} ; + enum gf_quota_type { GF_QUOTA_OPTION_TYPE_NONE = 0, GF_QUOTA_OPTION_TYPE_ENABLE, diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 6f8594e30..e221e10ab 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -1690,6 +1690,46 @@ dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, } int +dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr) +{ + int i = -1; + int ret = -1; + char *value = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret == -1) + goto out; + + + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value); + if (ret) + goto out; + + if (!strcmp (value, local->key)) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev->this) + conf->decommissioned_bricks[i] = prev->this; + } + } + +out: + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP); + } + return 0; + +} + +int dht_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, int flags) { @@ -1771,6 +1811,28 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, } + tmp = dict_get (xattr, "decommission-brick"); + if (tmp) { + /* This operation should happen only on '/' */ + if (__is_root_gfid (loc->inode->gfid) != 0) { + op_errno = ENOTSUP; + goto err; + } + + memcpy (value, tmp->data, ((tmp->len < 4095) ? 
tmp->len : 4095)); + local->key = gf_strdup (value); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0 ; i < conf->subvolume_cnt; i++) { + /* Get the pathinfo, and then compare */ + STACK_WIND (frame, dht_checking_pathinfo_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->getxattr, + loc, GF_XATTR_PATHINFO_KEY); + } + return 0; + } + tmp = dict_get (xattr, GF_XATTR_FIX_LAYOUT_KEY); if (tmp) { gf_log (this->name, GF_LOG_INFO, diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index ab1b82af2..3545c0f99 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -202,6 +202,9 @@ struct dht_conf { uint32_t dir_spread_cnt; struct syncenv *env; /* The env pointer to the rebalance synctask */ + + /* to keep track of nodes which are decomissioned */ + xlator_t **decommissioned_bricks; }; typedef struct dht_conf dht_conf_t; diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 99abe023b..d8138067e 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -579,6 +579,12 @@ dht_init_subvolumes (xlator_t *this, dht_conf_t *conf) return -1; } + conf->decommissioned_bricks = GF_CALLOC (cnt, sizeof (xlator_t *), + gf_dht_mt_xlator_t); + if (!conf->decommissioned_bricks) { + return -1; + } + return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index 882e0209e..1c881be39 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -460,8 +460,22 @@ static inline int dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) { int i = 0; + int j = 0; int err = 0; int count = 0; + dht_conf_t *conf = NULL; + + /* Gets in use only for replace-brick, remove-brick */ + conf = this->private; + for (i = 0; i < layout->cnt; i++) { + for (j = 0; j < conf->subvolume_cnt; j++) { + if (conf->decommissioned_bricks[j] && + 
conf->decommissioned_bricks[j] == layout->list[i].xlator) { + layout->list[i].err = -EINVAL; + break; + } + } + } for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index 87a575654..d9499a407 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -255,6 +255,47 @@ out: int +dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup (bricks); + node = strtok_r (dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp (conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = + conf->subvolumes[i]; + gf_log (this->name, GF_LOG_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. 
*/ + goto out; + } + node = strtok_r (NULL, ",", &tmpstr); + } + + ret = 0; +out: + if (dup_brick) + GF_FREE (dup_brick); + + return ret; +} + +int reconfigure (xlator_t *this, dict_t *options) { dht_conf_t *conf = NULL; @@ -299,6 +340,12 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, options, uint32, out); + if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto out; + } + ret = 0; out: return ret; @@ -360,14 +407,14 @@ init (xlator_t *this) goto err; } - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; + if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto err; } - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { + ret = dht_layouts_init (this, conf); + if (ret == -1) { goto err; } @@ -501,5 +548,8 @@ struct volume_options options[] = { { .key = {"directory-layout-spread"}, .type = GF_OPTION_TYPE_INT, }, + { .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + }, { .key = {NULL} }, }; diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 8b3a03b6f..8832c69ed 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -312,7 +312,8 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req) strcpy (vol_type, "distribute"); /* Do not allow remove-brick if the volume is plain stripe */ - if ((volinfo->type == GF_CLUSTER_TYPE_STRIPE) && (volinfo->brick_count == volinfo->sub_count)) { + if ((volinfo->type == GF_CLUSTER_TYPE_STRIPE) && + (volinfo->brick_count == volinfo->sub_count)) { snprintf (err_str, 2048, "Removing brick from a plain stripe is not allowed"); gf_log ("glusterd", 
GF_LOG_ERROR, "%s", err_str); ret = -1; @@ -321,8 +322,8 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req) /* Do not allow remove-brick if the bricks given is less than the replica count or stripe count */ - if (((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) || (volinfo->type == GF_CLUSTER_TYPE_STRIPE)) - && !(volinfo->brick_count <= volinfo->sub_count)) { + if ((volinfo->type != GF_CLUSTER_TYPE_NONE) && + !(volinfo->brick_count <= volinfo->sub_count)) { if (volinfo->sub_count && (count % volinfo->sub_count != 0)) { snprintf (err_str, 2048, "Remove brick incorrect" " brick count of %d for %s %d", @@ -512,16 +513,20 @@ out: int -glusterd_op_perform_remove_brick (glusterd_volinfo_t *volinfo, char *brick) +glusterd_op_perform_remove_brick (glusterd_volinfo_t *volinfo, char *brick, + int force, int *need_migrate) { - glusterd_brickinfo_t *brickinfo = NULL; char *dup_brick = NULL; - int32_t ret = -1; + int32_t ret = -1; + glusterd_conf_t *priv = NULL; GF_ASSERT (volinfo); GF_ASSERT (brick); + priv = THIS->private; + GF_ASSERT (priv); + dup_brick = gf_strdup (brick); if (!dup_brick) goto out; @@ -534,15 +539,26 @@ glusterd_op_perform_remove_brick (glusterd_volinfo_t *volinfo, char *brick) if (ret) goto out; - if (GLUSTERD_STATUS_STARTED == volinfo->status) { - ret = glusterd_brick_stop (volinfo, brickinfo); - if (ret) { - gf_log ("", GF_LOG_ERROR, "Unable to stop " - "glusterfs, ret: %d", ret); - goto out; + if (!uuid_compare (brickinfo->uuid, priv->uuid)) { + /* Only if the brick is in this glusterd, do the rebalance */ + if (need_migrate) + *need_migrate = 1; + } + + if (force) { + if (GLUSTERD_STATUS_STARTED == volinfo->status) { + ret = glusterd_brick_stop (volinfo, brickinfo); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Unable to stop " + "glusterfs, ret: %d", ret); + goto out; + } } + glusterd_delete_brick (volinfo, brickinfo); + goto out; } - glusterd_delete_brick (volinfo, brickinfo); + + brickinfo->decommissioned = 1; out: if (dup_brick) GF_FREE 
(dup_brick); @@ -700,17 +716,18 @@ out: } int -glusterd_op_stage_remove_brick (dict_t *dict) +glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr) { - int ret = -1; - char *volname = NULL; - glusterd_volinfo_t *volinfo = NULL; - dict_t *ctx = NULL; - char *errstr = NULL; - int32_t brick_count = 0; + int ret = -1; + char *volname = NULL; + glusterd_volinfo_t *volinfo = NULL; + char *errstr = NULL; + int32_t brick_count = 0; + char msg[2048] = {0,}; + int32_t flag = 0; + gf1_op_commands cmd = GF_OP_CMD_NONE; ret = dict_get_str (dict, "volname", &volname); - if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to get volume name"); goto out; @@ -723,25 +740,64 @@ glusterd_op_stage_remove_brick (dict_t *dict) goto out; } - if (glusterd_is_defrag_on(volinfo)) { - ctx = glusterd_op_get_ctx (); - errstr = gf_strdup("Rebalance is in progress. Please retry" - " after completion"); - if (!errstr) { - ret = -1; + ret = dict_get_int32 (dict, "command", &flag); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get brick count"); + goto out; + } + cmd = flag; + + ret = -1; + switch (cmd) { + case GF_OP_CMD_NONE: + errstr = gf_strdup ("no remove-brick command issued"); + goto out; + + case GF_OP_CMD_STATUS: + ret = 0; + goto out; + + case GF_OP_CMD_START: + { + if (GLUSTERD_STATUS_STARTED != volinfo->status) { + snprintf (msg, sizeof (msg), "Volume %s needs to be started " + "before remove-brick (you can use 'force' or " + "'commit' to override this behavior)", + volinfo->volname); + errstr = gf_strdup (msg); + gf_log (THIS->name, GF_LOG_ERROR, "%s", errstr); goto out; } - gf_log ("glusterd", GF_LOG_ERROR, "%s", errstr); - ret = dict_set_dynstr (ctx, "errstr", errstr); - if (ret) { - GF_FREE (errstr); - gf_log ("", GF_LOG_DEBUG, - "failed to set errstr ctx"); + if (glusterd_is_defrag_on(volinfo)) { + errstr = gf_strdup("Rebalance is in progress. 
Please retry" + " after completion"); + gf_log ("glusterd", GF_LOG_ERROR, "%s", errstr); goto out; } + break; + } - ret = -1; - goto out; + case GF_OP_CMD_PAUSE: + case GF_OP_CMD_ABORT: + { + if (!volinfo->decommission_in_progress) { + errstr = gf_strdup("remove-brick is not in progress"); + gf_log ("glusterd", GF_LOG_ERROR, "%s", errstr); + goto out; + } + break; + } + + case GF_OP_CMD_COMMIT: + if (volinfo->decommission_in_progress) { + errstr = gf_strdup ("use 'force' option as migration " + "is in progress"); + goto out; + } + break; + + case GF_OP_CMD_COMMIT_FORCE: + break; } ret = dict_get_int32 (dict, "count", &brick_count); @@ -750,41 +806,96 @@ glusterd_op_stage_remove_brick (dict_t *dict) goto out; } + ret = 0; if (volinfo->brick_count == brick_count) { - ctx = glusterd_op_get_ctx (); - if (!ctx) { - gf_log ("", GF_LOG_ERROR, - "Operation Context is not present"); - ret = -1; - goto out; - } errstr = gf_strdup ("Deleting all the bricks of the " "volume is not allowed"); - if (!errstr) { - gf_log ("", GF_LOG_ERROR, "Out of memory"); - ret = -1; - goto out; - } - - ret = dict_set_dynstr (ctx, "errstr", errstr); - if (ret) { - GF_FREE (errstr); - gf_log ("", GF_LOG_DEBUG, - "failed to set pump status in ctx"); - goto out; - } - ret = -1; goto out; } out: gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); + if (ret && errstr) { + if (op_errstr) + *op_errstr = errstr; + } return ret; } int +glusterd_remove_brick_migrate_cbk (glusterd_volinfo_t *volinfo, + gf_defrag_status_t status) +{ + int ret = 0; + glusterd_brickinfo_t *brickinfo = NULL; + glusterd_brickinfo_t *tmp = NULL; + + switch (status) { + case GF_DEFRAG_STATUS_PAUSED: + case GF_DEFRAG_STATUS_FAILED: + /* No changes required in the volume file. 
+ everything should remain as is */ + break; + case GF_DEFRAG_STATUS_STOPPED: + /* Fall back to the old volume file */ + list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks, brick_list) { + if (!brickinfo->decommissioned) + continue; + brickinfo->decommissioned = 0; + } + break; + + case GF_DEFRAG_STATUS_COMPLETE: + /* Done with the task, you can remove the brick from the + volume file */ + list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks, brick_list) { + if (!brickinfo->decommissioned) + continue; + gf_log (THIS->name, GF_LOG_INFO, "removing the brick %s", + brickinfo->path); + brickinfo->decommissioned = 0; + if (GLUSTERD_STATUS_STARTED == volinfo->status) { + ret = glusterd_brick_stop (volinfo, brickinfo); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to stop glusterfs (%d)", ret); + } + } + glusterd_delete_brick (volinfo, brickinfo); + } + break; + + default: + GF_ASSERT (!"cbk function called with wrong status"); + break; + } + + ret = glusterd_create_volfiles_and_notify_services (volinfo); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to write volume files (%d)", ret); + + ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to store volume info (%d)", ret); + + + if (GLUSTERD_STATUS_STARTED == volinfo->status) { + ret = glusterd_check_generate_start_nfs (); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to start nfs process (%d)", ret); + } + + volinfo->decommission_in_progress = 0; + return 0; +} + + +int glusterd_op_add_brick (dict_t *dict, char **op_errstr) { int ret = 0; @@ -848,15 +959,20 @@ out: } int -glusterd_op_remove_brick (dict_t *dict) +glusterd_op_remove_brick (dict_t *dict, char **op_errstr) { - int ret = -1; - char *volname = NULL; - glusterd_volinfo_t *volinfo = NULL; - char *brick = NULL; - int32_t count = 0; - int32_t i = 1; - char key[256] = {0,}; + int ret = -1; + char *volname = NULL; + glusterd_volinfo_t 
*volinfo = NULL; + char *brick = NULL; + int32_t count = 0; + int32_t i = 1; + char key[256] = {0,}; + int32_t flag = 0; + char err_str[4096] = {0,}; + int need_rebalance = 0; + int force = 0; + gf1_op_commands cmd = 0; ret = dict_get_str (dict, "volname", &volname); @@ -866,12 +982,99 @@ glusterd_op_remove_brick (dict_t *dict) } ret = glusterd_volinfo_find (volname, &volinfo); - if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to allocate memory"); goto out; } + ret = dict_get_int32 (dict, "command", &flag); + if (ret) { + gf_log ("", GF_LOG_ERROR, "Unable to get brick count"); + goto out; + } + cmd = flag; + + ret = -1; + switch (cmd) { + case GF_OP_CMD_NONE: + goto out; + + case GF_OP_CMD_STATUS: + ret = 0; + goto out; + + case GF_OP_CMD_PAUSE: + { + if (volinfo->decommission_in_progress) { + if (volinfo->defrag == (void *)1) + volinfo->defrag = NULL; + + if (volinfo->defrag) { + LOCK (&volinfo->defrag->lock); + + volinfo->defrag_status = GF_DEFRAG_STATUS_PAUSED; + + UNLOCK (&volinfo->defrag->lock); + } + } + + /* rebalance '_cbk()' will take care of volume file updates */ + ret = 0; + goto out; + } + + case GF_OP_CMD_ABORT: + { + if (volinfo->decommission_in_progress) { + if (volinfo->defrag == (void *)1) + volinfo->defrag = NULL; + + if (volinfo->defrag) { + LOCK (&volinfo->defrag->lock); + + volinfo->defrag_status = GF_DEFRAG_STATUS_STOPPED; + + UNLOCK (&volinfo->defrag->lock); + } + } + + /* rebalance '_cbk()' will take care of volume file updates */ + ret = 0; + goto out; + } + + case GF_OP_CMD_START: + force = 0; + break; + + case GF_OP_CMD_COMMIT: + force = 1; + break; + + case GF_OP_CMD_COMMIT_FORCE: + + if (volinfo->decommission_in_progress) { + if (volinfo->defrag == (void *)1) + volinfo->defrag = NULL; + + if (volinfo->defrag) { + LOCK (&volinfo->defrag->lock); + /* Fake 'rebalance-complete' so the graph change + happens right away */ + volinfo->defrag_status = GF_DEFRAG_STATUS_COMPLETE; + + UNLOCK (&volinfo->defrag->lock); + } + ret = 0; + /* Graph 
change happens in rebalance _cbk function, + no need to do anything here */ + goto out; + } + + force = 1; + break; + } + ret = dict_get_int32 (dict, "count", &count); if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to get count"); @@ -887,26 +1090,46 @@ glusterd_op_remove_brick (dict_t *dict) goto out; } - ret = glusterd_op_perform_remove_brick (volinfo, brick); + ret = glusterd_op_perform_remove_brick (volinfo, brick, force, + (i == 1) ? &need_rebalance : NULL); if (ret) goto out; i++; } ret = glusterd_create_volfiles_and_notify_services (volinfo); - if (ret) + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, "failed to create volfiles"); goto out; - - volinfo->defrag_status = 0; + } ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); - - if (ret) + if (ret) { + gf_log (THIS->name, GF_LOG_WARNING, "failed to store volinfo"); goto out; + } - if (GLUSTERD_STATUS_STARTED == volinfo->status) - ret = glusterd_check_generate_start_nfs (); + volinfo->defrag_status = 0; + if (!force && need_rebalance) { + /* perform the rebalance operations */ + ret = glusterd_handle_defrag_start (volinfo, err_str, 4096, + GF_DEFRAG_CMD_START_FORCE, + glusterd_remove_brick_migrate_cbk); + if (!ret) + volinfo->decommission_in_progress = 1; + + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, + "failed to start the rebalance"); + } + } else { + if (GLUSTERD_STATUS_STARTED == volinfo->status) + ret = glusterd_check_generate_start_nfs (); + } out: + if (ret && err_str[0] && op_errstr) + *op_errstr = gf_strdup (err_str); + return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index 3b30fb080..53556984a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -2297,7 +2297,7 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr, break; case GD_OP_REMOVE_BRICK: - ret = glusterd_op_stage_remove_brick (dict); + ret = 
glusterd_op_stage_remove_brick (dict, op_errstr); break; case GD_OP_LOG_FILENAME: @@ -2387,7 +2387,7 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr, break; case GD_OP_REMOVE_BRICK: - ret = glusterd_op_remove_brick (dict); + ret = glusterd_op_remove_brick (dict, op_errstr); break; case GD_OP_LOG_FILENAME: @@ -2565,6 +2565,7 @@ glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr) int32_t i = 1; char key[256] = {0,}; glusterd_pending_node_t *pending_node = NULL; + int32_t force = 0; ret = dict_get_str (dict, "volname", &volname); @@ -2586,6 +2587,12 @@ glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr) goto out; } + ret = dict_get_int32 (dict, "force", &force); + if (ret) { + gf_log (THIS->name, GF_LOG_INFO, "force flag is not set"); + ret = 0; + goto out; + } while ( i <= count) { snprintf (key, 256, "brick%d", i); diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index a013d0adf..defaf947f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -46,6 +46,7 @@ #include "cli1-xdr.h" #include "xdr-generic.h" +/* return values - 0: success, +ve: stopped, -ve: failure */ int gf_glusterd_rebalance_move_data (glusterd_volinfo_t *volinfo, const char *dir) { @@ -66,7 +67,8 @@ gf_glusterd_rebalance_move_data (glusterd_volinfo_t *volinfo, const char *dir) if (!fd) goto out; - if (defrag->cmd == GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE) { + if ((defrag->cmd == GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE) || + (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)) { strcpy (force_string, "force"); } else { strcpy (force_string, "not-force"); @@ -105,9 +107,11 @@ gf_glusterd_rebalance_move_data (glusterd_volinfo_t *volinfo, const char *dir) } UNLOCK (&defrag->lock); - if (volinfo->defrag_status == GF_DEFRAG_STATUS_STOPED) { + if (volinfo->defrag_status != + GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED) { + /* It can be 
one of 'stopped|paused|commit' etc */ closedir (fd); - ret = -1; + ret = 1; goto out; } } @@ -144,6 +148,7 @@ out: return ret; } +/* return values - 0: success, +ve: stopped, -ve: failure */ int gf_glusterd_rebalance_fix_layout (glusterd_volinfo_t *volinfo, const char *dir) { @@ -187,9 +192,11 @@ gf_glusterd_rebalance_fix_layout (glusterd_volinfo_t *volinfo, const char *dir) break; } - if (volinfo->defrag_status == GF_DEFRAG_STATUS_STOPED) { + if (volinfo->defrag_status != + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED) { + /* It can be one of 'stopped|paused|commit' etc */ closedir (fd); - ret = -1; + ret = 1; goto out; } } @@ -210,6 +217,7 @@ glusterd_defrag_start (void *data) int ret = -1; struct stat stbuf = {0,}; + THIS = volinfo->xl; defrag = volinfo->defrag; if (!defrag) goto out; @@ -240,8 +248,10 @@ glusterd_defrag_start (void *data) /* Step 1: Fix layout of all the directories */ ret = gf_glusterd_rebalance_fix_layout (volinfo, defrag->mount); + if (ret < 0) + volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; + /* in both 'stopped' or 'failure' cases goto out */ if (ret) { - volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; goto out; } @@ -257,8 +267,10 @@ glusterd_defrag_start (void *data) /* Step 2: Iterate over directories to move data */ ret = gf_glusterd_rebalance_move_data (volinfo, defrag->mount); + if (ret < 0) + volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; + /* in both 'stopped' or 'failure' cases goto out */ if (ret) { - volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; goto out; } @@ -267,7 +279,8 @@ glusterd_defrag_start (void *data) } /* Completed whole process */ - if (defrag->cmd == GF_DEFRAG_CMD_START) + if ((defrag->cmd == GF_DEFRAG_CMD_START) || + (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)) volinfo->defrag_status = GF_DEFRAG_STATUS_COMPLETE; volinfo->rebalance_files = defrag->total_files; @@ -281,9 +294,13 @@ out: ret = runcmd ("umount", "-l", defrag->mount, NULL); LOCK_DESTROY (&defrag->lock); + + if (defrag->cbk_fn) { + defrag->cbk_fn 
(volinfo, volinfo->defrag_status); + } + GF_FREE (defrag); } - return NULL; } @@ -332,7 +349,7 @@ glusterd_defrag_stop (glusterd_volinfo_t *volinfo, u_quad_t *files, LOCK (&volinfo->defrag->lock); { - volinfo->defrag_status = GF_DEFRAG_STATUS_STOPED; + volinfo->defrag_status = GF_DEFRAG_STATUS_STOPPED; *files = volinfo->defrag->total_files; *size = volinfo->defrag->total_data; } @@ -497,7 +514,7 @@ out: int glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, - size_t len, int cmd) + size_t len, int cmd, defrag_cbk_fn_t cbk) { int ret = -1; glusterd_defrag_info_t *defrag = NULL; @@ -552,6 +569,9 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, volinfo->defrag_status = GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED; + if (cbk) + defrag->cbk_fn = cbk; + ret = pthread_create (&defrag->th, NULL, glusterd_defrag_start, volinfo); if (ret) { @@ -635,7 +655,7 @@ glusterd_handle_defrag_volume (rpcsvc_request_t *req) case GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE: { ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg), - cli_req.cmd); + cli_req.cmd, NULL); rsp.op_ret = ret; break; } @@ -845,7 +865,7 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) case GF_DEFRAG_CMD_START_MIGRATE_DATA: case GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE: ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg), - cmd); + cmd, NULL); break; case GF_DEFRAG_CMD_STOP: ret = glusterd_defrag_stop (volinfo, &files, &size, diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c index 060d40bed..4a4289910 100644 --- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c +++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c @@ -1486,7 +1486,7 @@ glusterd_op_perform_replace_brick (glusterd_volinfo_t *volinfo, volinfo->brick_count++; - ret = glusterd_op_perform_remove_brick (volinfo, old_brick); + ret = glusterd_op_perform_remove_brick (volinfo, old_brick, 1, NULL); if 
(ret) goto out; diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index 53fdcf484..dab075db0 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -321,6 +321,10 @@ glusterd_store_brickinfo_write (int fd, glusterd_brickinfo_t *brickinfo) ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT, value); + snprintf (value, sizeof(value), "%d", brickinfo->decommissioned); + ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED, + value); + out: gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; @@ -1637,6 +1641,9 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) pmap = pmap_registry_get (THIS); if (pmap->last_alloc <= brickinfo->rdma_port) pmap->last_alloc = brickinfo->rdma_port + 1; + } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED, + strlen (GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) { + gf_string2int (value, &brickinfo->decommissioned); } else { gf_log ("", GF_LOG_ERROR, "Unknown key: %s", key); diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index 3ca232a9a..61bda195d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -63,6 +63,7 @@ typedef enum glusterd_store_ver_ac_{ #define GLUSTERD_STORE_KEY_BRICK_PATH "path" #define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port" #define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port" +#define GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED "decommissioned" #define GLUSTERD_STORE_KEY_PEER_UUID "uuid" #define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname" diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 8694f7536..5b247b6a9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -548,6 +548,8 @@ glusterd_volinfo_new (glusterd_volinfo_t 
**volinfo) goto out; } + new_volinfo->xl = THIS; + *volinfo = new_volinfo; ret = 0; diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index cb198dfb9..d0533b1fc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -1811,10 +1811,15 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, char **cluster_args = NULL; int i = 0; int j = 0; - int ret = 0; + int ret = -1; xlator_t *xl = NULL; xlator_t *txl = NULL; xlator_t *trav = NULL; + int removed_bricks = 0; + int index_of_removed_brick = 0; + char *removed_bricklist = NULL; + char volume_name[1024] = {0,}; + int idx = 0; volname = volinfo->volname; dict = volinfo->dict; @@ -1824,7 +1829,7 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, gf_log ("", GF_LOG_ERROR, "volume inconsistency: brick count is 0"); - return -1; + goto out; } if (volinfo->sub_count && volinfo->sub_count < volinfo->brick_count && volinfo->brick_count % volinfo->sub_count != 0) { @@ -1834,7 +1839,7 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, "number of bricks per cluster (%d) in a multi-cluster " "setup", volinfo->brick_count, volinfo->sub_count); - return -1; + goto out; } get_transport_type (volinfo, set_dict, transt, _gf_false); @@ -1844,19 +1849,32 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, i = 0; list_for_each_entry (brick, &volinfo->bricks, brick_list) { + ret = -1; xl = volgen_graph_add_nolink (graph, "protocol/client", "%s-client-%d", volname, i); if (!xl) - return -1; + goto out; ret = xlator_set_option (xl, "remote-host", brick->hostname); if (ret) - return -1; + goto out; ret = xlator_set_option (xl, "remote-subvolume", brick->path); if (ret) - return -1; + goto out; ret = xlator_set_option (xl, "transport-type", transt); if (ret) - return -1; + goto out; + if (brick->decommissioned) { + if (!removed_bricklist) { 
+ removed_bricklist = GF_CALLOC (16 * GF_UNIT_KB, + 1, gf_common_mt_char); + index_of_removed_brick = i; + } + if (removed_bricks) + strcat (removed_bricklist, ","); + snprintf (volume_name, 1024, "%s-client-%d", volname, i); + strcat (removed_bricklist, volume_name); + removed_bricks++; + } i++; } @@ -1866,7 +1884,7 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, "differs from brick count (%d)", i, volinfo->brick_count); - return -1; + goto out; } sub_count = volinfo->sub_count; @@ -1880,15 +1898,18 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, break; case GF_CLUSTER_TYPE_STRIPE_REPLICATE: /* Replicate after the clients, then stripe */ - if (volinfo->replica_count == 0) - return -1; + if (volinfo->replica_count == 0) { + ret = -1; + goto out; + } sub_count = volinfo->replica_count; cluster_args = replicate_args; break; default: gf_log ("", GF_LOG_ERROR, "volume inconsistency: " "unrecognized clustering type"); - return -1; + ret = -1; + goto out; } i = 0; @@ -1901,14 +1922,16 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, cluster_args[0], cluster_args[1], volname, j); - if (!xl) - return -1; + if (!xl) { + ret = -1; + goto out; + } j++; } ret = volgen_xlator_link (xl, trav); if (ret) - return -1; + goto out; if (trav == txl) break; @@ -1928,14 +1951,16 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, cluster_args[0], cluster_args[1], volname, j); - if (!xl) - return -1; + if (!xl) { + ret = -1; + goto out; + } j++; } ret = volgen_xlator_link (xl, trav); if (ret) - return -1; + goto out; if (trav == txl) break; @@ -1953,8 +1978,10 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, if (dist_count > 1) { xl = volgen_graph_add_nolink (graph, "cluster/distribute", "%s-dht", volname); - if (!xl) - return -1; + if (!xl) { + ret = -1; + goto out; + } trav = xl; for (i = 0; i < dist_count; i++) @@ -1962,28 +1989,50 @@ 
client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, for (; trav != xl; trav = trav->prev) { ret = volgen_xlator_link (xl, trav); if (ret) - return -1; + goto out; + } + + if (removed_bricks) { + if (volinfo->sub_count) { + idx = index_of_removed_brick / volinfo->sub_count; + if (GF_CLUSTER_TYPE_REPLICATE == volinfo->type) { + snprintf (volume_name, 1024, "%s-replicate-%d", + volname, idx); + strcpy (removed_bricklist, volume_name); + } else if (volinfo->type != GF_CLUSTER_TYPE_NONE) { + snprintf (volume_name, 1024, "%s-stripe-%d ", + volname, idx); + strcpy (removed_bricklist, volume_name); + } + } + ret = xlator_set_option (xl, "decommissioned-bricks", + removed_bricklist); + if (ret) + goto out; } } ret = glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA); if (ret == -1) - return -1; + goto out; + if (ret) { xl = volgen_graph_add (graph, "features/quota", volname); - if (!xl) - return -1; + if (!xl) { + ret = -1; + goto out; + } } ret = volgen_graph_set_options_generic (graph, set_dict, volname, &perfxl_option_handler); if (ret) - return -1; + goto out; xl = volgen_graph_add_as (graph, "debug/io-stats", volname); if (!xl) - return -1; + goto out; ret = volgen_graph_set_options_generic (graph, set_dict, "client", &loglevel_option_handler); @@ -1991,6 +2040,11 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, if (!ret) ret = volgen_graph_set_options_generic (graph, set_dict, "client", &sys_loglevel_option_handler); + +out: + if (removed_bricklist) + GF_FREE (removed_bricklist); + return ret; } diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 86eeaeb1c..c8fa82819 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -45,7 +45,7 @@ #include "glusterd1-xdr.h" #include "protocol-common.h" #include "glusterd-pmap.h" - +#include "cli1-xdr.h" #define GLUSTERD_MAX_VOLUME_NAME 1000 #define DEFAULT_LOG_FILE_DIRECTORY DATADIR 
"/log/glusterfs" @@ -132,6 +132,7 @@ struct glusterd_brickinfo { gf_brick_status_t status; struct rpc_clnt *rpc; gf_timer_t *timer; + int decommissioned; }; typedef struct glusterd_brickinfo glusterd_brickinfo_t; @@ -142,16 +143,11 @@ struct gf_defrag_brickinfo_ { int size; }; -typedef enum gf_defrag_status_ { - GF_DEFRAG_STATUS_NOT_STARTED, - GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, - GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED, - GF_DEFRAG_STATUS_STOPED, - GF_DEFRAG_STATUS_COMPLETE, - GF_DEFRAG_STATUS_FAILED, - GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, - GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE, -} gf_defrag_status_t; +struct glusterd_volinfo_; +typedef struct glusterd_volinfo_ glusterd_volinfo_t; + +typedef int (*defrag_cbk_fn_t) (glusterd_volinfo_t *volinfo, + gf_defrag_status_t status); struct glusterd_defrag_info_ { uint64_t total_files; @@ -163,6 +159,8 @@ struct glusterd_defrag_info_ { char mount[1024]; char databuf[131072]; struct gf_defrag_brickinfo_ *bricks; /* volinfo->brick_count */ + + defrag_cbk_fn_t cbk_fn; }; @@ -219,9 +217,10 @@ struct glusterd_volinfo_ { char *logdir; dict_t *gsync_slaves; -}; -typedef struct glusterd_volinfo_ glusterd_volinfo_t; + int decommission_in_progress; + xlator_t *xl; +}; typedef struct glusterd_pending_node_ { void *node; @@ -540,6 +539,8 @@ int glusterd_handle_cli_start_volume (rpcsvc_request_t *req); int glusterd_handle_cli_stop_volume (rpcsvc_request_t *req); int glusterd_handle_cli_delete_volume (rpcsvc_request_t *req); +int glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, + size_t len, int cmd, defrag_cbk_fn_t cbk); /* op-sm functions */ int glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr); @@ -565,9 +566,9 @@ int glusterd_op_stop_volume (dict_t *dict); int glusterd_op_delete_volume (dict_t *dict); int glusterd_op_add_brick (dict_t *dict, char **op_errstr); -int glusterd_op_remove_brick (dict_t *dict); +int glusterd_op_remove_brick (dict_t *dict, char **op_errstr); int 
glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr); -int glusterd_op_stage_remove_brick (dict_t *dict); +int glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr); int glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr); int glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict); @@ -575,7 +576,8 @@ int glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict); /* misc */ void glusterd_do_replace_brick (void *data); -int glusterd_op_perform_remove_brick (glusterd_volinfo_t *volinfo, char *brick); +int glusterd_op_perform_remove_brick (glusterd_volinfo_t *volinfo, char *brick, + int force, int *need_migrate); int glusterd_op_stop_volume_args_get (dict_t *dict, char** volname, int *flags); diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 09818ce89..a8b7b67a4 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -2545,7 +2545,7 @@ posix_getxattr (call_frame_t *frame, xlator_t *this, } goto done; } - if (loc->inode && IA_ISREG (loc->inode->ia_type) && name && + if (loc->inode && name && (strcmp (name, GF_XATTR_PATHINFO_KEY) == 0)) { snprintf (host_buf, 1024, "<POSIX:%s:%s>", priv->hostname, real_path); |