From 7ba1e1ed45cee56ef51b9c04df99c976546d5d04 Mon Sep 17 00:00:00 2001 From: shishirng Date: Wed, 18 Jan 2012 15:29:15 +0530 Subject: cluster/dht: Rebalance will be a new glusterfs process rebalance will not use any maintainance clients. It is replaced by syncops, with the volfile. Brickop (communication between glusterd<->glusterfs process) is used for status and stop commands. Dept-first traversal of dir is maintained, but data is migrated as and when encounterd. fix-layout (dir) do Complete migrate-data of dir fix-layout (subdir) done Rebalance state is saved in the vol file, for restart-ability. A disconnect event and pidfile state determine the defrag-status Signed-off-by: shishirng Change-Id: Iec6c80c84bbb2142d840242c28db3d5f5be94d01 BUG: 763844 Reviewed-on: http://review.gluster.com/2540 Tested-by: Gluster Build System Reviewed-by: Amar Tumballi --- cli/src/cli-cmd-volume.c | 30 +- cli/src/cli-rpc-ops.c | 71 +-- glusterfsd/src/glusterfsd-mgmt.c | 79 +++- libglusterfs/src/glusterfs.h | 1 + rpc/rpc-lib/src/protocol-common.h | 1 + rpc/xdr/src/cli1-xdr.h | 16 +- rpc/xdr/src/cli1-xdr.x | 10 +- xlators/cluster/dht/src/dht-common.c | 67 ++- xlators/cluster/dht/src/dht-common.h | 47 ++ xlators/cluster/dht/src/dht-mem-types.h | 1 + xlators/cluster/dht/src/dht-rebalance.c | 606 ++++++++++++++++++++++++- xlators/cluster/dht/src/dht.c | 46 +- xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 21 +- xlators/mgmt/glusterd/src/glusterd-handler.c | 6 + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 80 ++++ xlators/mgmt/glusterd/src/glusterd-rebalance.c | 601 ++++++++---------------- xlators/mgmt/glusterd/src/glusterd-rpc-ops.c | 9 +- xlators/mgmt/glusterd/src/glusterd-store.c | 12 + xlators/mgmt/glusterd/src/glusterd-store.h | 1 + xlators/mgmt/glusterd/src/glusterd-utils.c | 54 +++ xlators/mgmt/glusterd/src/glusterd-utils.h | 2 + xlators/mgmt/glusterd/src/glusterd.c | 2 + xlators/mgmt/glusterd/src/glusterd.h | 29 +- 23 files changed, 1260 insertions(+), 532 deletions(-) diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index 6bea948e..6ab1515e 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -731,7 +731,7 @@ cli_cmd_volume_defrag_cbk (struct cli_state *state, struct cli_cmd_word *word, if (!dict) goto out; - if (!((wordcount == 4) || (wordcount == 5) || (wordcount == 6))) { + if (!((wordcount == 4) || (wordcount == 5))) { cli_usage_out (word->pattern); parse_error = 1; goto out; @@ -741,7 +741,7 @@ cli_cmd_volume_defrag_cbk (struct cli_state *state, struct cli_cmd_word *word, index = 3; } else { if (strcmp (words[3], "fix-layout") && - strcmp (words[3], "migrate-data")) { + strcmp (words[3], "start")) { cli_usage_out (word->pattern); parse_error = 1; goto out; @@ -750,7 +750,7 @@ cli_cmd_volume_defrag_cbk (struct cli_state *state, struct cli_cmd_word *word, } if (strcmp (words[index], "start") && strcmp (words[index], "stop") && - strcmp (words[index], "status")) { + strcmp (words[index], "status") && strcmp (words[index], "force")) { cli_usage_out (word->pattern); parse_error = 1; goto out; @@ -766,27 +766,19 @@ cli_cmd_volume_defrag_cbk (struct cli_state *state, struct cli_cmd_word *word, goto out; } if (wordcount == 5) { - ret = dict_set_str (dict, "start-type", (char *)words[3]); - if (ret) - goto out; - ret = dict_set_str (dict, "command", (char *)words[4]); - if (ret) - goto out; - } - - /* 'force' option is valid only for the 'migrate-data' key */ - if (wordcount == 6) { - if (strcmp (words[3], "migrate-data") || - strcmp (words[4], "start") || - strcmp (words[5], "force")) { + if ((strcmp (words[3], "fix-layout") || + strcmp (words[4], "start")) && + (strcmp (words[3], "start") || + strcmp (words[4], "force"))) { cli_usage_out (word->pattern); parse_error = 1; goto out; } - ret = dict_set_str (dict, "start-type", "migrate-data-force"); + + ret = dict_set_str (dict, "option", (char *)words[4]); if (ret) goto out; - ret = dict_set_str (dict, "command", (char *)words[4]); + ret = dict_set_str (dict, "command", (char *)words[3]); if (ret) goto out; } @@ -1776,7 +1768,7 @@ struct cli_cmd volume_cmds[] = { cli_cmd_volume_remove_brick_cbk, "remove brick from volume "}, - { "volume rebalance [fix-layout|migrate-data] {start|stop|status} [force]", + { "volume rebalance [fix-layout] {start|stop|status} [force]", cli_cmd_volume_defrag_cbk, "rebalance operations"}, diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 6ed380bf..d1888415 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -1052,9 +1052,9 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, volname); } else { snprintf (msg, sizeof (msg), - "stopped rebalance process of volume %s \n" - "(after rebalancing %"PRId64" files totaling" - " %"PRId64" bytes)", volname, files, size); + "Stopped rebalance process on volume %s \n" + "(after rebalancing %"PRId64" bytes - " + "%"PRId64" files)", volname, size, files); } goto done; } @@ -1065,7 +1065,7 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, "%s", rsp.op_errstr); else snprintf (msg, sizeof (msg), - "failed to get the status of " + "Failed to get the status of " "rebalance process"); goto done; } @@ -1074,11 +1074,8 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, case GF_DEFRAG_STATUS_NOT_STARTED: status = "not started"; break; - case GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED: - status = "step 1: layout fix in progress"; - break; - case GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED: - status = "step 2: data migration in progress"; + case GF_DEFRAG_STATUS_STARTED: + status = "in progress"; break; case GF_DEFRAG_STATUS_STOPPED: status = "stopped"; @@ -1089,38 +1086,17 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, case GF_DEFRAG_STATUS_FAILED: status = "failed"; break; - case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE: - status = "step 1: layout fix complete"; - break; - case GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE: - status = "step 2: data migration complete"; - break; - case GF_DEFRAG_STATUS_PAUSED: - status = "paused"; - break; } - if (files && (rsp.op_errno == 1)) { - snprintf (msg, sizeof (msg), - "rebalance %s: fixed layout %"PRId64, - status, files); - goto done; - } - if (files && (rsp.op_errno == 6)) { - snprintf (msg, sizeof (msg), - "rebalance %s: fixed layout %"PRId64, - status, files); - goto done; - } - if (files) { - snprintf (msg, sizeof (msg), - "rebalance %s: rebalanced %"PRId64 + if (files || size || lookup) { + snprintf (msg, sizeof(msg), + "Rebalance %s: rebalanced %"PRId64 " files of size %"PRId64" (total files" " scanned %"PRId64")", status, files, size, lookup); goto done; } - snprintf (msg, sizeof (msg), "rebalance %s", status); + snprintf (msg, sizeof (msg), "Rebalance %s", status); goto done; } @@ -1129,7 +1105,7 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov, snprintf (msg, sizeof (msg), "%s", rsp.op_errstr); else snprintf (msg, sizeof (msg), - "starting rebalance on volume %s has been %s", + "Starting rebalance on volume %s has been %s", volname, (rsp.op_ret) ? "unsuccessful": "successful"); @@ -1398,24 +1374,18 @@ gf_cli3_remove_brick_status_cbk (struct rpc_req *req, struct iovec *iov, case GF_DEFRAG_STATUS_NOT_STARTED: status = "not started"; break; - case GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED: - case GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED: - case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE: + case GF_DEFRAG_STATUS_STARTED: status = "in progress"; break; case GF_DEFRAG_STATUS_STOPPED: status = "stopped"; break; case GF_DEFRAG_STATUS_COMPLETE: - case GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE: status = "completed"; break; case GF_DEFRAG_STATUS_FAILED: status = "failed"; break; - case GF_DEFRAG_STATUS_PAUSED: - status = "paused"; - break; } if (rsp.dict.dict_len) { @@ -2479,20 +2449,19 @@ gf_cli3_1_defrag_volume (call_frame_t *frame, xlator_t *this, if (strcmp (cmd_str, "start") == 0) { cmd = GF_DEFRAG_CMD_START; - ret = dict_get_str (dict, "start-type", &cmd_str); + ret = dict_get_str (dict, "option", &cmd_str); if (!ret) { - if (strcmp (cmd_str, "fix-layout") == 0) { - cmd = GF_DEFRAG_CMD_START_LAYOUT_FIX; - } - if (strcmp (cmd_str, "migrate-data") == 0) { - cmd = GF_DEFRAG_CMD_START_MIGRATE_DATA; - } - if (strcmp (cmd_str, "migrate-data-force") == 0) { - cmd = GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE; + if (strcmp (cmd_str, "force") == 0) { + cmd = GF_DEFRAG_CMD_START_FORCE; } } goto done; } + + if (strcmp (cmd_str, "fix-layout") == 0) { + cmd = GF_DEFRAG_CMD_START_LAYOUT_FIX; + goto done; + } if (strcmp (cmd_str, "stop") == 0) { cmd = GF_DEFRAG_CMD_STOP; goto done; diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c index e2b658a9..08f8a05f 100644 --- a/glusterfsd/src/glusterfsd-mgmt.c +++ b/glusterfsd/src/glusterfsd-mgmt.c @@ -736,6 +736,79 @@ out: return ret; } + +int +glusterfs_handle_defrag (rpcsvc_request_t *req) +{ + int32_t ret = -1; + gd1_mgmt_brick_op_req xlator_req = {0,}; + dict_t *dict = NULL; + xlator_t *xlator = NULL; + xlator_t *any = NULL; + dict_t *output = NULL; + char msg[2048] = {0}; + glusterfs_ctx_t *ctx = NULL; + glusterfs_graph_t *active = NULL; + xlator_t *this = NULL; + + GF_ASSERT (req); + this = THIS; + GF_ASSERT (this); + + ctx = glusterfs_ctx_get (); + GF_ASSERT (ctx); + + active = ctx->active; + any = active->first; + if (!xdr_to_generic (req->msg[0], &xlator_req, + (xdrproc_t)xdr_gd1_mgmt_brick_op_req)) { + //failed to decode msg; + req->rpc_err = GARBAGE_ARGS; + goto out; + } + dict = dict_new (); + if (!dict) + goto out; + + ret = dict_unserialize (xlator_req.input.input_val, + xlator_req.input.input_len, + &dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to " + "unserialize req-buffer to dictionary"); + goto out; + } + xlator = xlator_search_by_name (any, xlator_req.name); + if (!xlator) { + snprintf (msg, sizeof (msg), "xlator %s is not loaded", + xlator_req.name); + goto out; + } + + output = dict_new (); + if (!output) { + ret = -1; + goto out; + } + + ret = xlator->notify (xlator, GF_EVENT_VOLUME_DEFRAG, dict, output); + + ret = glusterfs_translator_info_response_send (req, ret, + msg, output); +out: + if (dict) + dict_unref (dict); + if (xlator_req.input.input_val) + free (xlator_req.input.input_val); // malloced by xdr + if (output) + dict_unref (output); + if (xlator_req.name) + free (xlator_req.name); //malloced by xdr + + return ret; + +} int glusterfs_handle_brick_status (rpcsvc_request_t *req) { @@ -887,6 +960,9 @@ glusterfs_handle_rpc_msg (rpcsvc_request_t *req) case GLUSTERD_BRICK_STATUS: ret = glusterfs_handle_brick_status (req); break; + case GLUSTERD_BRICK_XLATOR_DEFRAG: + ret = glusterfs_handle_defrag (req); + break; default: break; } @@ -943,7 +1019,8 @@ rpcsvc_actor_t glusterfs_actors[] = { [GLUSTERD_BRICK_TERMINATE] = { "TERMINATE", GLUSTERD_BRICK_TERMINATE, glusterfs_handle_rpc_msg, NULL, NULL, 0}, [GLUSTERD_BRICK_XLATOR_INFO] = { "TRANSLATOR INFO", GLUSTERD_BRICK_XLATOR_INFO, glusterfs_handle_rpc_msg, NULL, NULL, 0}, [GLUSTERD_BRICK_XLATOR_HEAL] = { "TRANSLATOR HEAL", GLUSTERD_BRICK_XLATOR_HEAL, glusterfs_handle_rpc_msg, NULL, NULL, 0}, - [GLUSTERD_BRICK_STATUS] = {"STATUS", GLUSTERD_BRICK_STATUS, glusterfs_handle_rpc_msg, NULL, NULL, 0} + [GLUSTERD_BRICK_STATUS] = {"STATUS", GLUSTERD_BRICK_STATUS, glusterfs_handle_rpc_msg, NULL, NULL, 0}, + [GLUSTERD_BRICK_XLATOR_DEFRAG] = { "TRANSLATOR DEFRAG", GLUSTERD_BRICK_XLATOR_DEFRAG, glusterfs_handle_rpc_msg, NULL, NULL, 0} }; struct rpcsvc_program glusterfs_mop_prog = { diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index ccfdc11d..ef9cdfe8 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -383,6 +383,7 @@ typedef enum { GF_EVENT_TRANSLATOR_INFO, GF_EVENT_TRIGGER_HEAL, GF_EVENT_AUTH_FAILED, + GF_EVENT_VOLUME_DEFRAG, GF_EVENT_MAXVAL, } glusterfs_event_t; diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h index 6ef4cb70..874f46e0 100644 --- a/rpc/rpc-lib/src/protocol-common.h +++ b/rpc/rpc-lib/src/protocol-common.h @@ -184,6 +184,7 @@ enum glusterd_brick_procnum { GLUSTERD_BRICK_XLATOR_HEAL, GLUSTERD_BRICK_STATUS, GLUSTERD_BRICK_OP, + GLUSTERD_BRICK_XLATOR_DEFRAG, GLUSTERD_BRICK_MAXVALUE, }; diff --git a/rpc/xdr/src/cli1-xdr.h b/rpc/xdr/src/cli1-xdr.h index d136ec25..903b6ff7 100644 --- a/rpc/xdr/src/cli1-xdr.h +++ b/rpc/xdr/src/cli1-xdr.h @@ -47,22 +47,16 @@ enum gf_cli_defrag_type { GF_DEFRAG_CMD_STOP = 1 + 1, GF_DEFRAG_CMD_STATUS = 1 + 2, GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, - GF_DEFRAG_CMD_START_MIGRATE_DATA = 1 + 4, - GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE = 1 + 5, - GF_DEFRAG_CMD_START_FORCE = 1 + 6, + GF_DEFRAG_CMD_START_FORCE = 1 + 4, }; typedef enum gf_cli_defrag_type gf_cli_defrag_type; enum gf_defrag_status_t { GF_DEFRAG_STATUS_NOT_STARTED = 0, - GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED = 1, - GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED = 2, - GF_DEFRAG_STATUS_STOPPED = 3, - GF_DEFRAG_STATUS_COMPLETE = 4, - GF_DEFRAG_STATUS_FAILED = 5, - GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE = 6, - GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE = 7, - GF_DEFRAG_STATUS_PAUSED = 8, + GF_DEFRAG_STATUS_STARTED = 1, + GF_DEFRAG_STATUS_STOPPED = 2, + GF_DEFRAG_STATUS_COMPLETE = 3, + GF_DEFRAG_STATUS_FAILED = 4, }; typedef enum gf_defrag_status_t gf_defrag_status_t; diff --git a/rpc/xdr/src/cli1-xdr.x b/rpc/xdr/src/cli1-xdr.x index 5f491c7b..f45712ce 100644 --- a/rpc/xdr/src/cli1-xdr.x +++ b/rpc/xdr/src/cli1-xdr.x @@ -3,21 +3,15 @@ GF_DEFRAG_CMD_STOP, GF_DEFRAG_CMD_STATUS, GF_DEFRAG_CMD_START_LAYOUT_FIX, - GF_DEFRAG_CMD_START_MIGRATE_DATA, - GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE, GF_DEFRAG_CMD_START_FORCE /* used by remove-brick data migration */ } ; enum gf_defrag_status_t { GF_DEFRAG_STATUS_NOT_STARTED, - GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, - GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED, + GF_DEFRAG_STATUS_STARTED, GF_DEFRAG_STATUS_STOPPED, GF_DEFRAG_STATUS_COMPLETE, - GF_DEFRAG_STATUS_FAILED, - GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, - GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE, - GF_DEFRAG_STATUS_PAUSED + GF_DEFRAG_STATUS_FAILED } ; enum gf1_cluster_type { diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 29b3dca8..360a432c 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -4296,16 +4296,22 @@ dht_forget (xlator_t *this, inode_t *inode) int dht_notify (xlator_t *this, int event, void *data, ...) { - xlator_t *subvol = NULL; - int cnt = -1; - int i = -1; - dht_conf_t *conf = NULL; - int ret = -1; - int propagate = 0; + xlator_t *subvol = NULL; + int cnt = -1; + int i = -1; + dht_conf_t *conf = NULL; + int ret = -1; + int propagate = 0; + + int had_heard_from_all = 0; + int have_heard_from_all = 0; + struct timeval time = {0,}; + gf_defrag_info_t *defrag = NULL; + dict_t *dict = NULL; + gf_defrag_type cmd = 0; + dict_t *output = NULL; + va_list ap; - int had_heard_from_all = 0; - int have_heard_from_all = 0; - struct timeval time = {0,}; conf = this->private; if (!conf) @@ -4418,6 +4424,36 @@ dht_notify (xlator_t *this, int event, void *data, ...) UNLOCK (&conf->subvolume_lock); break; + case GF_EVENT_VOLUME_DEFRAG: + { + if (!conf->defrag) { + return ret; + } + defrag = conf->defrag; + + dict = data; + va_start (ap, data); + output = va_arg (ap, dict_t*); + + ret = dict_get_int32 (dict, "rebalance-command", + (int32_t*)&cmd); + if (ret) + return ret; + LOCK (&defrag->lock); + { + if (defrag->is_exiting) + goto unlock; + if (cmd == GF_DEFRAG_CMD_STATUS) + gf_defrag_status_get (defrag, output); + else if (cmd == GF_DEFRAG_CMD_STOP) + gf_defrag_stop (defrag, output); + } +unlock: + UNLOCK (&defrag->lock); + return 0; + break; + } + default: propagate = 1; break; @@ -4433,8 +4469,19 @@ dht_notify (xlator_t *this, int event, void *data, ...) /* if all subvols have reported status, no need to hide anything or wait for anything else. Just propagate blindly */ - if (have_heard_from_all) + if (have_heard_from_all) { propagate = 1; + if (conf->defrag) { + ret = pthread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); + if (ret) { + conf->defrag = NULL; + GF_FREE (conf->defrag); + kill (getpid(), SIGTERM); + } + } + } + if (!had_heard_from_all && have_heard_from_all) { /* This is the first event which completes aggregation diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 749abe53..d97ef9f5 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -175,6 +175,43 @@ struct dht_du { }; typedef struct dht_du dht_du_t; +enum gf_defrag_type { + GF_DEFRAG_CMD_START = 1, + GF_DEFRAG_CMD_STOP = 1 + 1, + GF_DEFRAG_CMD_STATUS = 1 + 2, + GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3, + GF_DEFRAG_CMD_START_FORCE = 1 + 4, +}; +typedef enum gf_defrag_type gf_defrag_type; + +enum gf_defrag_status_t { + GF_DEFRAG_STATUS_NOT_STARTED, + GF_DEFRAG_STATUS_STARTED, + GF_DEFRAG_STATUS_STOPPED, + GF_DEFRAG_STATUS_COMPLETE, + GF_DEFRAG_STATUS_FAILED, +}; +typedef enum gf_defrag_status_t gf_defrag_status_t; + + +struct gf_defrag_info_ { + uint64_t total_files; + uint64_t total_data; + uint64_t num_files_lookedup; + gf_lock_t lock; + int cmd; + pthread_t th; + gf_defrag_status_t defrag_status; + struct rpc_clnt *rpc; + uint32_t connected; + uint32_t is_exiting; + pid_t pid; + inode_t *root_inode; + +}; + +typedef struct gf_defrag_info_ gf_defrag_info_t; + struct dht_conf { gf_lock_t subvolume_lock; int subvolume_cnt; @@ -208,6 +245,9 @@ struct dht_conf { /* to keep track of nodes which are decomissioned */ xlator_t **decommissioned_bricks; + + /* defrag related */ + gf_defrag_info_t *defrag; }; typedef struct dht_conf dht_conf_t; @@ -608,6 +648,13 @@ int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent); +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict); + +int +gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output); +void* +gf_defrag_start (void *this); #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h index 21fb5a7c..a12ed153 100644 --- a/xlators/cluster/dht/src/dht-mem-types.h +++ b/xlators/cluster/dht/src/dht-mem-types.h @@ -37,6 +37,7 @@ enum gf_dht_mem_types_ { gf_switch_mt_switch_struct, gf_dht_mt_subvol_time, gf_dht_mt_loc_t, + gf_defrag_info_mt, gf_dht_mt_end }; #endif diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index dfd6f3b6..46fc8773 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -669,7 +669,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, loc->path, from->name, strerror (errno)); } - if (uuid_compare (empty_iatt.ia_gfid, loc->inode->gfid) == 0) { + if (uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0) { /* take out the source from namespace */ ret = syncop_unlink (from, loc); if (ret) { @@ -805,3 +805,607 @@ dht_start_rebalance_task (xlator_t *this, call_frame_t *frame) frame, frame); return ret; } + +int +gf_listener_stop (void) +{ + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + int ret = 0; + xlator_t *this = NULL; + + ctx = glusterfs_ctx_get (); + GF_ASSERT (ctx); + cmd_args = &ctx->cmd_args; + if (cmd_args->sock_file) { + ret = unlink (cmd_args->sock_file); + if (ret && (ENOENT == errno)) { + ret = 0; + } + } + + if (ret) { + this = THIS; + gf_log (this->name, GF_LOG_ERROR, "Failed to unlink listener " + "socket %s, error: %s", cmd_args->sock_file, + strerror (errno)); + } + return ret; +} + +void +dht_build_root_inode (xlator_t *this, inode_t **inode) +{ + inode_table_t *itable = NULL; + uuid_t root_gfid = {0, }; + + itable = inode_table_new (0, this); + if (!itable) + return; + + root_gfid[15] = 1; + *inode = inode_find (itable, root_gfid); +} + +void +dht_build_root_loc (inode_t *inode, loc_t *loc) +{ + loc->path = "/"; + loc->inode = inode; + loc->inode->ia_type = IA_IFDIR; + memset (loc->gfid, 0, 16); + loc->gfid[15] = 1; +} + + +/* return values: 1 -> error, bug ignore and continue + 0 -> proceed + -1 -> error, handle it */ +int32_t +gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag) +{ + /* if errno is not ENOSPC or ENOTCONN, we can still continue + with rebalance process */ + if ((errno != ENOSPC) || (errno != ENOTCONN)) + return 1; + + if (errno == ENOTCONN) { + /* Most probably mount point went missing (mostly due + to a brick down), say rebalance failure to user, + let him restart it if everything is fine */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + if (errno == ENOSPC) { + /* rebalance process itself failed, may be + remote brick went down, or write failed due to + disk full etc etc.. */ + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + return -1; + } + + return 0; +} + +/* We do a depth first traversal of directories. But before we move into + * subdirs, we complete the data migration of those directories whose layouts + * have been fixed + */ + +int +gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + off_t offset = 0; + dict_t *dict = NULL; + struct iatt iatt = {0,}; + int32_t op_errno = 0; + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + goto out; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, + &entries)) != 0) + { + if ((ret < 0) || (ret && (errno == ENOENT))) + break; + + free_entries = _gf_true; + + if (list_empty (&entries.list)) + break; + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (IA_ISDIR (entry->d_stat.ia_type)) + continue; + + defrag->num_files_lookedup++; + if (entry->d_stat.ia_nlink > 1) + continue; + + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + + if (uuid_is_null (loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + " gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + continue; + } + + /* if distribute is present, it will honor this key. + * -1 is returned if distribute is not present or file + * doesn't have a link-file. If file has link-file, the + * path of link-file will be the value, and also that + * guarantees that file has to be mostly migrated */ + + ret = syncop_getxattr (this, &entry_loc, &dict, + GF_XATTR_LINKINFO_KEY); + if (ret < 0) { + continue; + } + + ret = syncop_setxattr (this, &entry_loc, migrate_data, + 0); + if (ret) + gf_log (this->name, GF_LOG_ERROR, "setxattr " + "failed for %s", entry_loc.path); + + if (ret == -1) { + op_errno = errno; + ret = gf_defrag_handle_migrate_error (op_errno, + defrag); + + if (!ret) + gf_log (this->name, GF_LOG_DEBUG, + "setxattr on %s failed: %s", + entry_loc.path, + strerror (op_errno)); + else if (ret == 1) + continue; + else if (ret == -1) + goto out; + } + + LOCK (&defrag->lock); + { + defrag->total_files += 1; + defrag->total_data += iatt.ia_size; + } + UNLOCK (&defrag->lock); + } + + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + + } + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + return ret; + +} + + +int +gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, + dict_t *fix_layout, dict_t *migrate_data) +{ + int ret = -1; + loc_t entry_loc = {0,}; + fd_t *fd = NULL; + gf_dirent_t entries; + gf_dirent_t *tmp = NULL; + gf_dirent_t *entry = NULL; + gf_boolean_t free_entries = _gf_false; + dict_t *dict = NULL; + off_t offset = 0; + struct iatt iatt = {0,}; + + ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s", + loc->path); + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) + gf_defrag_migrate_data (this, defrag, loc, migrate_data); + + gf_log (this->name, GF_LOG_TRACE, "fix layout called on %s", loc->path); + + fd = fd_create (loc->inode, defrag->pid); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create fd"); + ret = -1; + goto out; + } + + ret = syncop_opendir (this, loc, fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s", + loc->path); + ret = -1; + goto out; + } + + INIT_LIST_HEAD (&entries.list); + while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, + &entries)) != 0) + { + if ((ret < 0) || (ret && (errno == ENOENT))) + break; + free_entries = _gf_true; + + if (list_empty (&entries.list)) + break; + list_for_each_entry_safe (entry, tmp, &entries.list, list) { + if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { + ret = 1; + goto out; + } + + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + if (!IA_ISDIR (entry->d_stat.ia_type)) + continue; + + loc_wipe (&entry_loc); + ret =dht_build_child_loc (this, &entry_loc, loc, + entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Child loc" + " build failed"); + goto out; + } + + if (uuid_is_null (entry->d_stat.ia_gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + "gfid not present", loc->path, + entry->d_name); + continue; + } + + entry_loc.inode->ia_type = entry->d_stat.ia_type; + + uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); + if (uuid_is_null (loc->gfid)) { + gf_log (this->name, GF_LOG_ERROR, "%s/%s" + "gfid not present", loc->path, + entry->d_name); + continue; + } + + uuid_copy (entry_loc.pargfid, loc->gfid); + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s" + " lookup failed", entry_loc.path); + continue; + } + + ret = syncop_setxattr (this, &entry_loc, fix_layout, + 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Setxattr " + "failed for %s", entry_loc.path); + defrag->defrag_status = + GF_DEFRAG_STATUS_FAILED; + goto out; + } + ret = gf_defrag_fix_layout (this, defrag, &entry_loc, + fix_layout, migrate_data); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Fix layout " + "failed for %s", entry_loc.path); + goto out; + } + + } + gf_dirent_free (&entries); + free_entries = _gf_false; + INIT_LIST_HEAD (&entries.list); + } + + ret = 0; +out: + if (free_entries) + gf_dirent_free (&entries); + + loc_wipe (&entry_loc); + + if (dict) + dict_unref(dict); + + if (fd) + fd_unref (fd); + + return ret; + +} + + +int +gf_defrag_start_crawl (void *data) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + int ret = -1; + loc_t loc = {0,}; + struct iatt iatt = {0,}; + struct iatt parent = {0,}; + dict_t *fix_layout = NULL; + dict_t *migrate_data = NULL; + + this = data; + if (!this) + goto out; + + conf = this->private; + if (!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + dht_build_root_inode (this, &defrag->root_inode); + if (!defrag->root_inode) + goto out; + + dht_build_root_loc (defrag->root_inode, &loc); + + /* fix-layout on '/' first */ + + ret = syncop_lookup (this, &loc, NULL, &iatt, NULL, &parent); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "look up on / failed"); + goto out; + } + + fix_layout = dict_new (); + if (!fix_layout) { + ret = -1; + goto out; + } + + ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set dict str"); + goto out; + } + + ret = syncop_setxattr (this, &loc, fix_layout, 0); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed", + loc.path); + goto out; + } + + if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { + migrate_data = dict_new (); + if (!migrate_data) { + ret = -1; + goto out; + } + if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) + ret = dict_set_str (migrate_data, + "distribute.migrate-data", "force"); + else + ret = dict_set_str (migrate_data, + "distribute.migrate-data", + "non-force"); + if (ret) + goto out; + } + ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout, + migrate_data); + +out: + LOCK (&defrag->lock); + { + gf_defrag_status_get (defrag, NULL); + defrag->is_exiting = 1; + } + UNLOCK (&defrag->lock); + + if (defrag) + GF_FREE (defrag); + + return ret; +} + + +static int +gf_defrag_done (int ret, call_frame_t *sync_frame, void *data) +{ + gf_listener_stop(); + + GF_FREE (data); + STACK_DESTROY (sync_frame->root); + kill (getpid(), SIGTERM); + return 0; +} + +void * +gf_defrag_start (void *data) +{ + int ret = -1; + call_frame_t *frame = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + xlator_t *this = NULL; + + this = data; + conf = this->private; + if (!conf) + goto out; + + defrag = conf->defrag; + if (!defrag) + goto out; + + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; + + defrag->pid = frame->root->pid; + + defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; + + ret = synctask_new (this->ctx->env, gf_defrag_start_crawl, + gf_defrag_done, frame, this); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Could not create" + " task for rebalance"); +out: + return NULL; +} + +int +gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) +{ + int ret = 0; + uint64_t files = 0; + uint64_t size = 0; + uint64_t lookup = 0; + + if (!defrag) + goto out; + + ret = 0; + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) + goto out; + + files = defrag->total_files; + size = defrag->total_data; + lookup = defrag->num_files_lookedup; + + if (!dict) + goto log; + + ret = dict_set_uint64 (dict, "files", files); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set file count"); + + ret = dict_set_uint64 (dict, "size", size); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set size of xfer"); + + ret = dict_set_uint64 (dict, "lookups", lookup); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set lookedup file count"); + + ret = dict_set_int32 (dict, "status", defrag->defrag_status); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set status"); +log: + gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %" + PRIu64", lookups: %"PRIu64, files, size, lookup); + + +out: + return 0; +} + +int +gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output) +{ + /* TODO: set a variable 'stop_defrag' here, it should be checked + in defrag loop */ + int ret = -1; + GF_ASSERT (defrag); + + if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) { + goto out; + } + + defrag->defrag_status = GF_DEFRAG_STATUS_STOPPED; + + gf_defrag_status_get (defrag, output); + ret = 0; +out: + gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret); + return ret; +} diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index 18fee7cd..816bf868 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -182,11 +182,20 @@ out: int notify (xlator_t *this, int event, void *data, ...) { - int ret = -1; + int ret = -1; + va_list ap; + dict_t *output = NULL; GF_VALIDATE_OR_GOTO ("dht", this, out); - ret = dht_notify (this, event, data); + + if (!data) + goto out; + + va_start (ap, data); + output = va_arg (ap, dict_t*); + + ret = dht_notify (this, event, data, output); out: return ret; @@ -343,10 +352,13 @@ out: int init (xlator_t *this) { - dht_conf_t *conf = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -366,6 +378,24 @@ init (xlator_t *this) goto err; } + ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); + + if (cmd) { + defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t), + gf_defrag_info_mt); + + GF_VALIDATE_OR_GOTO (this->name, defrag, err); + + LOCK_INIT (&defrag->lock); + + defrag->is_exiting = 0; + + defrag->cmd = cmd; + + conf->defrag = defrag; + } + + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { /* If option is not "auto", other options _should_ be boolean */ @@ -550,5 +580,9 @@ struct volume_options options[] = { { .key = {"decommissioned-bricks"}, .type = GF_OPTION_TYPE_ANY, }, + { .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {NULL} }, }; diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index c170972c..fc9c9cf0 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -31,6 +31,7 @@ #include "glusterd-utils.h" #include "glusterd-volgen.h" #include "run.h" +#include /* misc */ @@ -1384,8 +1385,6 @@ glusterd_op_add_brick (dict_t *dict, char **op_errstr) switch (volinfo->defrag_status) { case GF_DEFRAG_STATUS_FAILED: case GF_DEFRAG_STATUS_COMPLETE: - case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE: - case GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE: volinfo->defrag_status = 0; default: break; @@ -1420,6 +1419,9 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) int32_t replica_count = 0; glusterd_brickinfo_t *brickinfo = NULL; glusterd_brickinfo_t *tmp = NULL; + glusterd_conf_t *priv = NULL; + char pidfile[PATH_MAX]; + ret = dict_get_str (dict, "volname", &volname); @@ -1456,7 +1458,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) if (volinfo->defrag) { LOCK (&volinfo->defrag->lock); - volinfo->defrag_status = GF_DEFRAG_STATUS_PAUSED; + //volinfo->defrag_status = GF_DEFRAG_STATUS_PAUSED; UNLOCK (&volinfo->defrag->lock); } @@ -1470,13 +1472,14 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) case GF_OP_CMD_ABORT: { if (volinfo->decommission_in_progress) { - if (volinfo->defrag) { - LOCK (&volinfo->defrag->lock); + priv = THIS->private; + if (!priv) + return ret; - volinfo->defrag_status = GF_DEFRAG_STATUS_STOPPED; + GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv); + + glusterd_service_stop ("rebalance", pidfile, SIGTERM, 1); - UNLOCK (&volinfo->defrag->lock); - } } /* Fall back to the old volume file */ @@ -1577,8 +1580,6 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) switch (volinfo->defrag_status) { case GF_DEFRAG_STATUS_FAILED: case GF_DEFRAG_STATUS_COMPLETE: - case GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE: - case GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE: volinfo->defrag_status = 0; default: break; diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index b80164e8..b06dd28c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -346,6 +346,11 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + snprintf (key, 256, "volume%d.rebalance", count); + ret = dict_set_int32 (volumes, key, volinfo->defrag_cmd); + if (ret) + goto out; + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { char brick[1024] = {0,}; snprintf (key, 256, "volume%d.brick%d", count, i); @@ -2046,6 +2051,7 @@ glusterd_rpc_create (struct rpc_clnt **rpc, GF_ASSERT (this); GF_ASSERT (options); + new_rpc = rpc_clnt_new (options, this->ctx, this->name); if (!new_rpc) diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index 4a0561d1..2a4bf82e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -150,6 +150,8 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin { int ret = -1; gd1_mgmt_brick_op_req *brick_req = NULL; + char *volname = NULL; + char name[1024] = {0,}; GF_ASSERT (op < GD_OP_MAX); GF_ASSERT (op > GD_OP_NONE); @@ -203,6 +205,21 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin brick_req->op = GLUSTERD_BRICK_STATUS; brick_req->name = ""; } + break; + case GD_OP_REBALANCE: + case GD_OP_DEFRAG_BRICK_VOLUME: + brick_req = GF_CALLOC (1, sizeof (*brick_req), + gf_gld_mt_mop_brick_req_t); + if (!brick_req) + goto out; + + brick_req->op = GLUSTERD_BRICK_XLATOR_DEFRAG; + ret = dict_get_str (dict, "volname", &volname); + if (ret) + goto out; + snprintf (name, 1024, "%s-dht",volname); + brick_req->name = gf_strdup (name); + break; default: goto out; @@ -1617,6 +1634,7 @@ glusterd_op_build_payload (dict_t **req) case GD_OP_HEAL_VOLUME: case GD_OP_STATEDUMP_VOLUME: case GD_OP_CLEARLOCKS_VOLUME: + case GD_OP_DEFRAG_BRICK_VOLUME: { dict_t *dict = ctx; dict_copy (dict, req_dict); @@ -2173,6 +2191,7 @@ glusterd_need_brick_op (glusterd_op_t op) switch (op) { case GD_OP_PROFILE_VOLUME: case GD_OP_STATUS_VOLUME: + case GD_OP_DEFRAG_BRICK_VOLUME: ret = _gf_true; break; default: @@ -2368,6 +2387,7 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr, break; case GD_OP_REBALANCE: + case GD_OP_DEFRAG_BRICK_VOLUME: ret = glusterd_op_stage_rebalance (dict, op_errstr); break; @@ -2464,6 +2484,7 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr, break; case GD_OP_REBALANCE: + case GD_OP_DEFRAG_BRICK_VOLUME: ret = glusterd_op_rebalance (dict, op_errstr, rsp_dict); break; @@ -2613,6 +2634,10 @@ glusterd_handle_brick_rsp (glusterd_brickinfo_t *brickinfo, op_ctx, op_errstr); break; + case GD_OP_DEFRAG_BRICK_VOLUME: + dict_copy (rsp_dict, op_ctx); + break; + default: break; } @@ -2754,6 +2779,7 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr) priv = this->private; GF_ASSERT (priv); + ret = dict_get_str (dict, "volname", &volname); if (ret) { gf_log ("glusterd", GF_LOG_ERROR, "volume name get failed"); @@ -2963,6 +2989,56 @@ out: } + +static int +glusterd_bricks_select_rebalance_volume (dict_t *dict, char **op_errstr) +{ + int ret = -1; + char *volname = NULL; + glusterd_volinfo_t *volinfo = NULL; + xlator_t *this = NULL; + char msg[2048] = {0,}; + glusterd_pending_node_t *pending_node = NULL; + + this = THIS; + GF_ASSERT (this); + + + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log ("glusterd", GF_LOG_ERROR, "volume name get failed"); + goto out; + } + + ret = glusterd_volinfo_find (volname, &volinfo); + if (ret) { + snprintf (msg, sizeof (msg), "Volume %s does not exist", + volname); + + *op_errstr = gf_strdup (msg); + gf_log ("", GF_LOG_ERROR, "%s", msg); + goto out; + } + pending_node = GF_CALLOC (1, sizeof (*pending_node), + gf_gld_mt_pending_node_t); + if (!pending_node) { + ret = -1; + goto out; + } else { + pending_node->node = volinfo; + pending_node->type = GD_NODE_REBALANCE; + list_add_tail (&pending_node->list, + &opinfo.pending_bricks); + pending_node = NULL; + } + +out: + return ret; +} + + + + static int glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr) { @@ -3196,6 +3272,9 @@ glusterd_op_bricks_select (glusterd_op_t op, dict_t *dict, char **op_errstr) ret = glusterd_bricks_select_status_volume (dict, op_errstr); break; + case GD_OP_DEFRAG_BRICK_VOLUME: + ret = glusterd_bricks_select_rebalance_volume (dict, op_errstr); + break; default: break; } @@ -3758,6 +3837,7 @@ glusterd_op_free_ctx (glusterd_op_t op, void *ctx) case GD_OP_HEAL_VOLUME: case GD_OP_STATEDUMP_VOLUME: case GD_OP_CLEARLOCKS_VOLUME: + case GD_OP_DEFRAG_BRICK_VOLUME: dict_unref (ctx); break; default: diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index e5a907ea..0fe8d389 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -46,363 +46,35 @@ #include "cli1-xdr.h" #include "xdr-generic.h" -/* return values - 0: success, +ve: stopped, -ve: failure */ -int -gf_glusterd_rebalance_move_data (glusterd_volinfo_t *volinfo, const char *dir) -{ - int ret = -1; - DIR *fd = NULL; - glusterd_defrag_info_t *defrag = NULL; - struct dirent *entry = NULL; - struct stat stbuf = {0,}; - char full_path[PATH_MAX] = {0,}; - char linkinfo[PATH_MAX] = {0,}; - char force_string[64] = {0,}; - - if (!volinfo->defrag) - goto out; - - defrag = volinfo->defrag; - - fd = opendir (dir); - if (!fd) - goto out; - - if ((defrag->cmd == GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE) || - (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)) { - strcpy (force_string, "force"); - } else { - strcpy (force_string, "not-force"); - } - - while ((entry = readdir (fd))) { - if (!entry) - break; - - /* We have to honor 'stop' (or 'pause'|'commit') as early - as possible */ - if (volinfo->defrag_status != - GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED) { - /* It can be one of 'stopped|paused|commit' etc */ - closedir (fd); - ret = 1; - goto out; - } - - if (!strcmp (entry->d_name, ".") || !strcmp (entry->d_name, "..")) - continue; - - snprintf (full_path, PATH_MAX, "%s/%s", dir, entry->d_name); - - ret = lstat (full_path, &stbuf); - if (ret == -1) - continue; - - if (S_ISDIR (stbuf.st_mode)) - continue; - - defrag->num_files_lookedup += 1; - - /* TODO: bring in feature to support hardlink rebalance */ - if (stbuf.st_nlink > 1) - continue; - - /* if distribute is present, it will honor this key. - -1 is returned if distribute is not present or file doesn't - have a link-file. If file has link-file, the path of - link-file will be the value, and also that guarantees - that file has to be mostly migrated */ - ret = sys_lgetxattr (full_path, GF_XATTR_LINKINFO_KEY, - &linkinfo, PATH_MAX); - if (ret <= 0) - continue; - - ret = sys_lsetxattr (full_path, "distribute.migrate-data", - force_string, strlen (force_string), 0); - - /* if errno is not ENOSPC or ENOTCONN, we can still continue - with rebalance process */ - if ((ret == -1) && ((errno != ENOSPC) || - (errno != ENOTCONN))) - continue; - - if ((ret == -1) && (errno == ENOTCONN)) { - /* Most probably mount point went missing (mostly due - to a brick down), say rebalance failure to user, - let him restart it if everything is fine */ - volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; - break; - } - - if ((ret == -1) && (errno == ENOSPC)) { - /* rebalance process itself failed, may be - remote brick went down, or write failed due to - disk full etc etc.. */ - volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; - break; - } - - LOCK (&defrag->lock); - { - defrag->total_files += 1; - defrag->total_data += stbuf.st_size; - } - UNLOCK (&defrag->lock); - } - closedir (fd); - - fd = opendir (dir); - if (!fd) - goto out; - while ((entry = readdir (fd))) { - if (!entry) - break; - - /* We have to honor 'stop' (or 'pause'|'commit') as early - as possible */ - if (volinfo->defrag_status != - GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED) { - /* It can be one of 'stopped|paused|commit' etc */ - closedir (fd); - ret = 1; - goto out; - } - - if (!strcmp (entry->d_name, ".") || !strcmp (entry->d_name, "..")) - continue; - - snprintf (full_path, 1024, "%s/%s", dir, entry->d_name); - - ret = lstat (full_path, &stbuf); - if (ret == -1) - continue; - - if (!S_ISDIR (stbuf.st_mode)) - continue; - - ret = gf_glusterd_rebalance_move_data (volinfo, full_path); - if (ret) - break; - } - closedir (fd); - - if (!entry) - ret = 0; -out: - return ret; -} +int32_t +glusterd3_1_brick_op_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe); -/* return values - 0: success, +ve: stopped, -ve: failure */ int -gf_glusterd_rebalance_fix_layout (glusterd_volinfo_t *volinfo, const char *dir) +glusterd_defrag_update_state (glusterd_volinfo_t *volinfo, + glusterd_defrag_info_t *defrag) { - int ret = -1; - char full_path[1024] = {0,}; - struct stat stbuf = {0,}; - DIR *fd = NULL; - struct dirent *entry = NULL; + int ret = -1; + int cmd = 0; - if (!volinfo->defrag) - goto out; - - fd = opendir (dir); - if (!fd) - goto out; - - while ((entry = readdir (fd))) { - if (!entry) - break; - - /* We have to honor 'stop' (or 'pause'|'commit') as early - as possible */ - if (volinfo->defrag_status != - GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED) { - /* It can be one of 'stopped|paused|commit' etc */ - closedir (fd); - ret = 1; - goto out; - } - - if (!strcmp (entry->d_name, ".") || !strcmp (entry->d_name, "..")) - continue; - - snprintf (full_path, 1024, "%s/%s", dir, entry->d_name); - - ret = lstat (full_path, &stbuf); - if (ret == -1) - continue; - - if (S_ISDIR (stbuf.st_mode)) { - /* Fix the layout of the directory */ - /* TODO: isn't error code not important ? */ - sys_lsetxattr (full_path, "distribute.fix.layout", - "yes", 3, 0); - - volinfo->defrag->total_files += 1; - - /* Traverse into subdirectory */ - ret = gf_glusterd_rebalance_fix_layout (volinfo, - full_path); - if (ret) - break; - } - } - closedir (fd); - - if (!entry) - ret = 0; - -out: - return ret; -} - -void * -glusterd_defrag_start (void *data) -{ - glusterd_volinfo_t *volinfo = data; - glusterd_defrag_info_t *defrag = NULL; - int ret = -1; - struct stat stbuf = {0,}; - - THIS = volinfo->xl; - defrag = volinfo->defrag; - if (!defrag) - goto out; - - sleep (1); - ret = lstat (defrag->mount, &stbuf); - if ((ret == -1) && (errno == ENOTCONN)) { - /* Wait for some more time before starting rebalance */ - sleep (2); - ret = lstat (defrag->mount, &stbuf); - if (ret == -1) { - volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; - volinfo->rebalance_files = 0; - volinfo->rebalance_data = 0; - volinfo->lookedup_files = 0; - goto out; - } - } - - /* Fix the root ('/') first */ - sys_lsetxattr (defrag->mount, "distribute.fix.layout", - "yes", 3, 0); - - if ((defrag->cmd == GF_DEFRAG_CMD_START) || - (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX)) { - /* root's layout got fixed */ - defrag->total_files = 1; - - /* Step 1: Fix layout of all the directories */ - ret = gf_glusterd_rebalance_fix_layout (volinfo, defrag->mount); - if (ret < 0) - volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; - /* in both 'stopped' or 'failure' cases goto out */ - if (ret) { - goto out; - } - - /* Completed first step */ - volinfo->defrag_status = GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE; - } - - if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) { - /* It was used by number of layout fixes on directories */ - defrag->total_files = 0; - - volinfo->defrag_status = GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED; - - /* Step 2: Iterate over directories to move data */ - ret = gf_glusterd_rebalance_move_data (volinfo, defrag->mount); - if (ret < 0) - volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; - /* in both 'stopped' or 'failure' cases goto out */ - if (ret) { - goto out; - } - - /* Completed second step */ - volinfo->defrag_status = GF_DEFRAG_STATUS_MIGRATE_DATA_COMPLETE; - } - - /* Completed whole process */ - if ((defrag->cmd == GF_DEFRAG_CMD_START) || - (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)) - volinfo->defrag_status = GF_DEFRAG_STATUS_COMPLETE; - - volinfo->rebalance_files = defrag->total_files; - volinfo->rebalance_data = defrag->total_data; - volinfo->lookedup_files = defrag->num_files_lookedup; -out: - volinfo->defrag = NULL; - if (defrag) { - gf_log ("rebalance", GF_LOG_INFO, "rebalance on %s complete", - defrag->mount); - - ret = runcmd ("umount", "-l", defrag->mount, NULL); - LOCK_DESTROY (&defrag->lock); - - if (defrag->cbk_fn) { - defrag->cbk_fn (volinfo, volinfo->defrag_status); - } - - GF_FREE (defrag); - } - return NULL; -} - -int -glusterd_defrag_stop_validate (glusterd_volinfo_t *volinfo, - char *op_errstr, size_t len) -{ - int ret = -1; - if (glusterd_is_defrag_on (volinfo) == 0) { - snprintf (op_errstr, len, "Rebalance on %s is either Completed " - "or not yet started", volinfo->volname); - goto out; - } - ret = 0; -out: - gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret); - return ret; -} - -int -glusterd_defrag_stop (glusterd_volinfo_t *volinfo, u_quad_t *files, - u_quad_t *size, char *op_errstr, size_t len) -{ - /* TODO: set a variable 'stop_defrag' here, it should be checked - in defrag loop */ - int ret = -1; GF_ASSERT (volinfo); - GF_ASSERT (files); - GF_ASSERT (size); - GF_ASSERT (op_errstr); - - if (!volinfo) { - ret = -1; - goto out; - } + GF_ASSERT (defrag); - ret = glusterd_defrag_stop_validate (volinfo, op_errstr, len); - if (ret) { - /* rebalance may be happening on other nodes */ - ret = 0; - goto out; - } - - ret = 0; if (volinfo->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) { goto out; } - LOCK (&volinfo->defrag->lock); + LOCK (&defrag->lock); { - volinfo->defrag_status = GF_DEFRAG_STATUS_STOPPED; - *files = volinfo->defrag->total_files; - *size = volinfo->defrag->total_data; + cmd = defrag->cmd; + if ((cmd == GF_DEFRAG_CMD_START) || (cmd == + GF_DEFRAG_CMD_START_FORCE) || (cmd == + GF_DEFRAG_CMD_START_LAYOUT_FIX)) + volinfo->defrag_status = GF_DEFRAG_STATUS_COMPLETE; + else if (cmd == GF_DEFRAG_CMD_STOP) + volinfo->defrag_status = GF_DEFRAG_STATUS_STOPPED; } - UNLOCK (&volinfo->defrag->lock); + UNLOCK (&defrag->lock); ret = 0; out: @@ -475,10 +147,9 @@ glusterd_rebalance_cmd_attempted_log (int cmd, char *volname) gf_log ("glusterd", GF_LOG_INFO, "Received rebalance " "volume start layout fix on %s", volname); break; - case GF_DEFRAG_CMD_START_MIGRATE_DATA: - case GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE: + case GF_DEFRAG_CMD_START_FORCE: gf_cmd_log ("Volume rebalance"," on volname: %s " - "cmd: start data migrate attempted", + "cmd: start data force attempted", volname); gf_log ("glusterd", GF_LOG_INFO, "Received rebalance " "volume start migrate data on %s", volname); @@ -539,6 +210,97 @@ out: return ret; } +int32_t +glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata, + rpc_clnt_event_t event, void *data) +{ + glusterd_volinfo_t *volinfo = NULL; + glusterd_defrag_info_t *defrag = NULL; + int ret = 0; + char pidfile[PATH_MAX]; + glusterd_conf_t *priv = NULL; + + priv = THIS->private; + if (!priv) + return 0; + + volinfo = mydata; + if (!volinfo) + return 0; + + defrag = volinfo->defrag; + if (!defrag) + return 0; + + if ((event == RPC_CLNT_DISCONNECT) && defrag->connected) + volinfo->defrag = NULL; + + GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv); + + switch (event) { + case RPC_CLNT_CONNECT: + { + if (defrag->connected) + return 0; + + LOCK (&defrag->lock); + { + defrag->connected = 1; + } + UNLOCK (&defrag->lock); + + gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_CONNECT", + rpc->conn.trans->name); + break; + } + + case RPC_CLNT_DISCONNECT: + { + if (!defrag->connected) + return 0; + + LOCK (&defrag->lock); + { + defrag->connected = 0; + } + UNLOCK (&defrag->lock); + + if (!glusterd_is_service_running (pidfile, NULL)) { + glusterd_defrag_update_state (volinfo, defrag); + } else { + volinfo->defrag_status = GF_DEFRAG_STATUS_FAILED; + } + + /* Success or failure, Reset cmd in volinfo */ + + volinfo->defrag_cmd = 0; + + glusterd_store_volinfo (volinfo, + GLUSTERD_VOLINFO_VER_AC_INCREMENT); + + if (defrag->rpc) { + rpc_clnt_unref (defrag->rpc); + defrag->rpc = NULL; + } + if (defrag->cbk_fn) + defrag->cbk_fn (volinfo, volinfo->defrag_status); + + if (defrag) + GF_FREE (defrag); + gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_DISCONNECT", + rpc->conn.trans->name); + break; + } + default: + gf_log ("", GF_LOG_TRACE, + "got some other RPC event %d", event); + ret = 0; + break; + } + + return ret; +} + int glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, size_t len, int cmd, defrag_cbk_fn_t cbk) @@ -547,6 +309,11 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, glusterd_defrag_info_t *defrag = NULL; runner_t runner = {0,}; glusterd_conf_t *priv = NULL; + char defrag_path[PATH_MAX]; + struct stat buf = {0,}; + char sockfile[PATH_MAX] = {0,}; + char pidfile[PATH_MAX] = {0,}; + dict_t *options = NULL; priv = THIS->private; @@ -567,18 +334,29 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, defrag->cmd = cmd; LOCK_INIT (&defrag->lock); - snprintf (defrag->mount, 1024, "%s/mount/%s", - priv->workdir, volinfo->volname); - /* Create a directory, mount glusterfs over it, start glusterfs-defrag */ - runinit (&runner); - runner_add_args (&runner, "mkdir", "-p", defrag->mount, NULL); - ret = runner_run_reuse (&runner); - if (ret) { - runner_log (&runner, "glusterd", GF_LOG_DEBUG, "command failed"); + + volinfo->defrag_status = GF_DEFRAG_STATUS_STARTED; + + volinfo->defrag_cmd = cmd; + glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); + + GLUSTERD_GET_DEFRAG_DIR (defrag_path, volinfo, priv); + ret = stat (defrag_path, &buf); + if (ret && (errno == ENOENT)) { + runinit (&runner); + runner_add_args (&runner, "mkdir", "-p", defrag_path, NULL); + ret = runner_run_reuse (&runner); + if (ret) { + runner_log (&runner, "glusterd", GF_LOG_DEBUG, + "command failed"); + runner_end (&runner); + goto out; + } runner_end (&runner); - goto out; } - runner_end (&runner); + + GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo, priv); + GLUSTERD_GET_DEFRAG_PID_FILE (pidfile, volinfo, priv); runinit (&runner); runner_add_args (&runner, SBIN_DIR"/glusterfs", @@ -586,34 +364,37 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, "--xlator-option", "*dht.use-readdirp=yes", "--xlator-option", "*dht.lookup-unhashed=yes", "--xlator-option", "*dht.assert-no-child-down=yes", - defrag->mount, NULL); + NULL); + runner_add_arg (&runner, "--xlator-option"); + runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd); + runner_add_arg (&runner, "--socket-file"); + runner_argprintf (&runner, "%s",sockfile); + runner_add_arg (&runner, "--pid-file"); + runner_argprintf (&runner, "%s",pidfile); + ret = runner_run_reuse (&runner); if (ret) { runner_log (&runner, "glusterd", GF_LOG_DEBUG, "command failed"); runner_end (&runner); goto out; } - runner_end (&runner); - volinfo->defrag_status = GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED; - if ((cmd == GF_DEFRAG_CMD_START_MIGRATE_DATA) || - (cmd == GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE)) { - volinfo->defrag_status = GF_DEFRAG_STATUS_MIGRATE_DATA_STARTED; + ret = rpc_clnt_transport_unix_options_build (&options, sockfile); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "Unix options build failed"); + goto out; + } + + ret = glusterd_rpc_create (&defrag->rpc, options, + glusterd_defrag_notify, volinfo); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "RPC create failed"); + goto out; } if (cbk) defrag->cbk_fn = cbk; - ret = pthread_create (&defrag->th, NULL, glusterd_defrag_start, - volinfo); - if (ret) { - runinit (&runner); - runner_add_args (&runner, "umount", "-l", defrag->mount, NULL); - ret = runner_run_reuse (&runner); - if (ret) - runner_log (&runner, "glusterd", GF_LOG_DEBUG, "command failed"); - runner_end (&runner); - } out: gf_log ("", GF_LOG_DEBUG, "Returning %d", ret); return ret; @@ -712,7 +493,13 @@ glusterd_handle_defrag_volume (rpcsvc_request_t *req) if (ret) goto out; - ret = glusterd_op_begin (req, GD_OP_REBALANCE, dict); + if ((cmd == GF_DEFRAG_CMD_STATUS) || + (cmd == GF_DEFRAG_CMD_STOP)) { + ret = glusterd_op_begin (req, GD_OP_DEFRAG_BRICK_VOLUME, + dict); + } + else + ret = glusterd_op_begin (req, GD_OP_REBALANCE, dict); out: @@ -762,8 +549,7 @@ glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr) switch (cmd) { case GF_DEFRAG_CMD_START: case GF_DEFRAG_CMD_START_LAYOUT_FIX: - case GF_DEFRAG_CMD_START_MIGRATE_DATA: - case GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE: + case GF_DEFRAG_CMD_START_FORCE: ret = glusterd_defrag_start_validate (volinfo, msg, sizeof (msg)); if (ret) { @@ -771,6 +557,33 @@ glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr) "start validate failed"); goto out; } + break; + case GF_DEFRAG_CMD_STATUS: + ret = glusterd_is_defrag_on (volinfo); + if (!ret) { + ret = -1; + if (volinfo->defrag_status == + GF_DEFRAG_STATUS_COMPLETE) { + snprintf (msg, sizeof (msg), "Rebalance " + "completed!"); + goto out; + } + snprintf (msg, sizeof(msg), "Rebalance is not running" + " on volume %s", volname); + goto out; + } + break; + + case GF_DEFRAG_CMD_STOP: + ret = glusterd_is_defrag_on (volinfo); + if (!ret) { + gf_log (THIS->name, GF_LOG_DEBUG, + "rebalance is not running"); + ret = -1; + snprintf (msg, sizeof(msg), "Rebalance is not running" + " on volume %s", volname); + goto out; + } default: break; } @@ -792,11 +605,8 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) int32_t cmd = 0; char msg[2048] = {0}; glusterd_volinfo_t *volinfo = NULL; - uint64_t files = 0; - uint64_t size = 0; void *node_uuid = NULL; glusterd_conf_t *priv = NULL; - dict_t *tmp_dict = NULL; priv = THIS->private; @@ -839,39 +649,12 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) switch (cmd) { case GF_DEFRAG_CMD_START: case GF_DEFRAG_CMD_START_LAYOUT_FIX: - case GF_DEFRAG_CMD_START_MIGRATE_DATA: - case GF_DEFRAG_CMD_START_MIGRATE_DATA_FORCE: + case GF_DEFRAG_CMD_START_FORCE: ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg), cmd, NULL); break; - case GF_DEFRAG_CMD_STOP: - ret = glusterd_defrag_stop (volinfo, &files, &size, - msg, sizeof (msg)); - if (!ret && rsp_dict) { - ret = dict_set_uint64 (rsp_dict, "files", files); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set file count"); - - ret = dict_set_uint64 (rsp_dict, "size", size); - if (ret) - gf_log (THIS->name, GF_LOG_WARNING, - "failed to set xfer size"); - - /* Don't want to propagate errors from dict_set() */ - ret = 0; - } - break; + case GF_DEFRAG_CMD_STOP: case GF_DEFRAG_CMD_STATUS: - - if (rsp_dict) - tmp_dict = rsp_dict; - - /* On source node, there will be no 'rsp_dict' */ - if (!tmp_dict) - tmp_dict = glusterd_op_get_ctx (GD_OP_REBALANCE); - - ret = glusterd_defrag_status_get (volinfo, tmp_dict); break; default: break; diff --git a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c index 2cf17b3f..537496f0 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c @@ -82,6 +82,7 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret, break; } case GD_OP_REBALANCE: + case GD_OP_DEFRAG_BRICK_VOLUME: { if (ctx) { ret = dict_get_int32 (ctx, "status", &status); @@ -1058,7 +1059,8 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *rsp_dict) GF_ASSERT (rsp_dict); op = glusterd_op_get_op (); - GF_ASSERT (GD_OP_REBALANCE == op); + GF_ASSERT ((GD_OP_REBALANCE == op) || + (GD_OP_DEFRAG_BRICK_VOLUME == op)); ctx_dict = glusterd_op_get_ctx (op); @@ -1224,10 +1226,7 @@ glusterd3_1_commit_op_cbk (struct rpc_req *req, struct iovec *iov, break; case GD_OP_REBALANCE: - ret = glusterd_volume_rebalance_use_rsp_dict (dict); - if (ret) - goto out; - + case GD_OP_DEFRAG_BRICK_VOLUME: break; default: diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index 4fe8f71c..18d60d0a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -622,6 +622,15 @@ glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo) if (ret) goto out; + if (volinfo->defrag_cmd == GF_DEFRAG_CMD_STATUS) + goto out; + + snprintf (buf, sizeof (buf), "%d", volinfo->defrag_cmd); + ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DEFRAG, + buf); + if (ret) + goto out; + out: if (ret) gf_log ("", GF_LOG_ERROR, "Unable to write volume values" @@ -1860,6 +1869,9 @@ glusterd_store_retrieve_volume (char *volname) } gf_log ("", GF_LOG_DEBUG, "Parsed as "GEOREP" " " slave:key=%s,value:%s", key, value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_DEFRAG, + strlen (GLUSTERD_STORE_KEY_VOL_DEFRAG))) { + volinfo->defrag_cmd = atoi (value); } else { exists = glusterd_check_option_exists (key, NULL); diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index f1413955..f55fb8c2 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -59,6 +59,7 @@ typedef enum glusterd_store_ver_ac_{ #define GLUSTERD_STORE_KEY_RB_STATUS "rb_status" #define GLUSTERD_STORE_KEY_RB_SRC_BRICK "rb_src" #define GLUSTERD_STORE_KEY_RB_DST_BRICK "rb_dst" +#define GLUSTERD_STORE_KEY_VOL_DEFRAG "rebalance_status" #define GLUSTERD_STORE_KEY_BRICK_HOSTNAME "hostname" #define GLUSTERD_STORE_KEY_BRICK_PATH "path" diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 42924a5f..9ec9e16f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -2016,6 +2016,15 @@ glusterd_import_volinfo (dict_t *vols, int count, goto out; } + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.rebalance", count); + ret = dict_get_uint32 (vols, key, &new_volinfo->defrag_cmd); + if (ret) { + snprintf (msg, sizeof (msg), "%s missing in payload for %s", + key, volname); + goto out; + } + uuid_parse (volume_id_str, new_volinfo->volume_id); memset (key, 0, sizeof (key)); @@ -2438,6 +2447,7 @@ glusterd_pending_node_get_rpc (glusterd_pending_node_t *pending_node) struct rpc_clnt *rpc = NULL; glusterd_brickinfo_t *brickinfo = NULL; nodesrv_t *shd = NULL; + glusterd_volinfo_t *volinfo = NULL; GF_VALIDATE_OR_GOTO (THIS->name, pending_node, out); GF_VALIDATE_OR_GOTO (THIS->name, pending_node->node, out); @@ -2449,6 +2459,11 @@ glusterd_pending_node_get_rpc (glusterd_pending_node_t *pending_node) shd = pending_node->node; rpc = shd->rpc; + } else if (pending_node->type == GD_NODE_REBALANCE) { + volinfo = pending_node->node; + if (volinfo->defrag) + rpc = volinfo->defrag->rpc; + } else { GF_ASSERT (0); } @@ -4811,3 +4826,42 @@ glusterd_get_client_filepath (char *filepath, glusterd_volinfo_t *volinfo, snprintf (filepath, PATH_MAX, "%s/%s-fuse.vol", path, volinfo->volname); } + +int +glusterd_volume_defrag_restart (glusterd_volinfo_t *volinfo, char *op_errstr, + size_t len, int cmd, defrag_cbk_fn_t cbk) +{ + glusterd_conf_t *priv = NULL; + char pidfile[PATH_MAX]; + int ret = -1; + pid_t pid; + + priv = THIS->private; + if (!priv) + return ret; + + GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv); + + if (!glusterd_is_service_running (pidfile, &pid)) { + glusterd_handle_defrag_start (volinfo, op_errstr, len, cmd, + cbk); + } + + return ret; +} + +int +glusterd_restart_rebalance (glusterd_conf_t *conf) +{ + glusterd_volinfo_t *volinfo = NULL; + int ret = 0; + char op_errstr[256]; + + list_for_each_entry (volinfo, &conf->volumes, vol_list) { + if (!volinfo->defrag_cmd) + continue; + glusterd_volume_defrag_restart (volinfo, op_errstr, 256, + volinfo->defrag_cmd, NULL); + } + return ret; +} diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index 6f9a5e14..e52b25e3 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -386,4 +386,6 @@ glusterd_chk_peers_connected_befriended (uuid_t skip_uuid); void glusterd_get_client_filepath (char *filepath, glusterd_volinfo_t *volinfo, gf_transport_type type); +int +glusterd_restart_rebalance (glusterd_conf_t *conf); #endif diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c index e9f3bd05..28e80310 100644 --- a/xlators/mgmt/glusterd/src/glusterd.c +++ b/xlators/mgmt/glusterd/src/glusterd.c @@ -999,6 +999,8 @@ init (xlator_t *this) ret = glusterd_restart_gsyncds (conf); if (ret) goto out; + + glusterd_restart_rebalance (conf); ret = 0; out: if (ret < 0) { diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 60dbe61e..e200f49d 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -79,6 +79,7 @@ typedef enum glusterd_op_ { GD_OP_STATEDUMP_VOLUME, GD_OP_LIST_VOLUME, GD_OP_CLEARLOCKS_VOLUME, + GD_OP_DEFRAG_BRICK_VOLUME, GD_OP_MAX, } glusterd_op_t; @@ -164,6 +165,9 @@ struct glusterd_defrag_info_ { gf_lock_t lock; int cmd; pthread_t th; + gf_defrag_status_t defrag_status; + struct rpc_clnt * rpc; + uint32_t connected; char mount[1024]; char databuf[131072]; struct gf_defrag_brickinfo_ *bricks; /* volinfo->brick_count */ @@ -210,6 +214,7 @@ struct glusterd_volinfo_ { uint64_t rebalance_data; uint64_t lookedup_files; glusterd_defrag_info_t *defrag; + gf_cli_defrag_type defrag_cmd; /* Replace brick status */ gf_rb_status_t rb_status; @@ -235,7 +240,8 @@ struct glusterd_volinfo_ { typedef enum gd_node_type_ { GD_NODE_NONE, GD_NODE_BRICK, - GD_NODE_SHD + GD_NODE_SHD, + GD_NODE_REBALANCE, } gd_node_type; typedef struct glusterd_pending_node_ { @@ -315,6 +321,27 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args); STACK_DESTROY (frame->root);\ } while (0) +#define GLUSTERD_GET_DEFRAG_DIR(path, volinfo, priv) do { \ + char vol_path[PATH_MAX]; \ + GLUSTERD_GET_VOLUME_DIR(vol_path, volinfo, priv); \ + snprintf (path, PATH_MAX, "%s/rebalance",vol_path); \ + } while (0) + +#define GLUSTERD_GET_DEFRAG_SOCK_FILE(path, volinfo, priv) do { \ + char defrag_path[PATH_MAX]; \ + GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv); \ + snprintf (path, PATH_MAX, "%s/%s.sock", defrag_path, \ + uuid_utoa(priv->uuid)); \ + } while (0) + +#define GLUSTERD_GET_DEFRAG_PID_FILE(path, volinfo, priv) do { \ + char defrag_path[PATH_MAX]; \ + GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv); \ + snprintf (path, PATH_MAX, "%s/%s.pid", defrag_path, \ + uuid_utoa(priv->uuid)); \ + } while (0) + + int32_t glusterd_brick_from_brickinfo (glusterd_brickinfo_t *brickinfo, char **new_brick); -- cgit