diff options
author | Diogenes Nunez <dnunez@redhat.com> | 2016-07-27 11:09:47 -0400 |
---|---|---|
committer | Dan Lambright <dlambrig@redhat.com> | 2016-09-04 18:37:57 -0700 |
commit | 261c035c7d0cd1639cc8bd0ead82c30efcc0e93f (patch) | |
tree | af3a2e498023e7ad8af417312b83ce2f969ef738 /xlators/cluster | |
parent | 6459fc812219551291e4be426ed8ecf2c90813a4 (diff) |
cluster/tier: Adding compaction option for metadata databases
Problem: As metadata in the database fills up, querying the database
take a long time. As a result, tier migration slows down. To
counteract this, we added a way to enable the compaction methods of
the underlying database. The goal is to reduce the size of the
underlying file by eliminating database fragmentation.
NOTE: There is currently a bug where sometimes a brick will
attempt to activate compaction. This happens even compaction is already
turned on.
The cause is narrowed down to the compact_mode_switch flipping its value.
Changes: libglusterfs/src/gfdb - Added a gfdb function to compact the
underlying database, compact_db() This is a no-op if the database has
no such option.
- Added a compaction function for SQLite3 that does the following
1) Changes the auto_vacuum pragma of the database
2) Compacts the database according to the type of compaction requested
- Compaction type can be changed by changing the macro
GF_SQL_COMPACT_DEF to one of the 4 compaction types in
gfdb_sqlite3.h
It is currently set to GF_SQL_COMPACT_INCR, or incremental
vacuuming.
xlators/cluster/dht/src - Added the following command-line option to
enable SQLite3 compaction.
gluster volume set <vol-name> tier-compact <off|on>
- Added the following command-line option to change the frequency the
hot and cold tier are ordered to compact.
gluster volume set <vol-name> tier-hot-compact-frequency <int>
gluster volume set <vol-name> tier-cold-compact-frequency <int>
- tier daemon periodically sends the (new)
GFDB_IPC_CTR_SET_COMPACT_PRAGMA IPC to the CTR xlator. The IPC
triggers compaction of the database.
The inputs are both gf_boolean_t.
IPC Input:
compact_active: Is compaction currently on for the db.
compact_mode_switched: Did we flip the compaction switch recently?
IPC Output:
0 if the compaction succeeds.
Non-zero otherwise.
xlators/features/changetimerecorder/src/ - When the CTR gets the
compaction IPC, it launches a thread that will perform the
compaction. The IPC ends after the thread is launched. To avoid extra
allocations, the parameters are passed using static variables.
Change-Id: I5e1433becb9eeff2afe8dcb4a5798977bf5ba0dd
Signed-off-by: Diogenes Nunez <dnunez@redhat.com>
Reviewed-on: http://review.gluster.org/15031
Reviewed-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
Tested-by: Dan Lambright <dlambrig@redhat.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 31 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 26 | ||||
-rw-r--r-- | xlators/cluster/dht/src/tier.c | 554 | ||||
-rw-r--r-- | xlators/cluster/dht/src/tier.h | 6 |
4 files changed, 564 insertions, 53 deletions
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 3717a68273c..da1bcb6a4a1 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -310,7 +310,7 @@ typedef struct dht_local dht_local_t; /* du - disk-usage */ struct dht_du { double avail_percent; - double avail_inodes; + double avail_inodes; uint64_t avail_space; uint32_t log; uint32_t chunks; @@ -325,10 +325,10 @@ enum gf_defrag_type { GF_DEFRAG_CMD_START_FORCE = 1 + 4, GF_DEFRAG_CMD_START_TIER = 1 + 5, GF_DEFRAG_CMD_STATUS_TIER = 1 + 6, - GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7, - GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8, - GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9, - GF_DEFRAG_CMD_RESUME_TIER = 1 + 10, + GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7, + GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8, + GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9, + GF_DEFRAG_CMD_RESUME_TIER = 1 + 10, }; typedef enum gf_defrag_type gf_defrag_type; @@ -398,9 +398,26 @@ typedef struct gf_tier_conf { uint64_t max_migrate_bytes; int max_migrate_files; tier_mode_t mode; + /* These flags are only used for tier-compact */ + gf_boolean_t compact_active; + /* These 3 flags are set to true when the client changes the */ + /* compaction mode on the command line. */ + /* When they are set, the daemon will trigger compaction as */ + /* soon as possible to activate or deactivate compaction. */ + /* If in the middle of a compaction, then the switches take */ + /* effect on the next compaction, not the current one. */ + /* If the user switches it off, we want to avoid needless */ + /* compactions. */ + /* If the user switches it on, they want to compact as soon */ + /* as possible. */ + gf_boolean_t compact_mode_switched; + gf_boolean_t compact_mode_switched_hot; + gf_boolean_t compact_mode_switched_cold; int tier_max_promote_size; int tier_promote_frequency; int tier_demote_frequency; + int tier_compact_hot_frequency; + int tier_compact_cold_frequency; uint64_t st_last_promoted_size; uint64_t st_last_demoted_size; tier_pause_state_t pause_state; @@ -1023,9 +1040,9 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata); int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t mode, off_t offset, size_t len, dict_t *xdata); + int32_t mode, off_t offset, size_t len, dict_t *xdata); int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, - off_t offset, size_t len, dict_t *xdata); + off_t offset, size_t len, dict_t *xdata); int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, off_t len, dict_t *xdata); int32_t dht_ipc (call_frame_t *frame, xlator_t *this, int32_t op, diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 873ced53eec..f410f71b5a6 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -148,7 +148,7 @@ dht_priv_dump (xlator_t *this) gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); gf_proc_dump_write("gen", "%d", conf->gen); gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); - gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); @@ -433,14 +433,14 @@ dht_reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("lookup-optimize", conf->lookup_optimize, options, bool, out); - GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, + GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, percent_or_size, out); /* option can be any one of percent or bytes */ conf->disk_unit = 0; if (conf->min_free_disk < 100.0) conf->disk_unit = 'p'; - GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, + GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, percent, out); GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, @@ -711,8 +711,8 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); - GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); + GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + err); GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, err); @@ -901,7 +901,7 @@ struct volume_options options[] = { "process starts balancing out the cluster, and logs will appear " "in log files", }, - { .key = {"min-free-inodes"}, + { .key = {"min-free-inodes"}, .type = GF_OPTION_TYPE_PERCENT, .default_value = "5%", .description = "after system has only N% of inodes, warnings " @@ -1038,6 +1038,20 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_STR, .default_value = "test", }, + { .key = {"tier-compact"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + { .key = {"tier-hot-compact-frequency"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "604800", + .description = "Frequency to compact DBs on hot tier in system" + }, + { .key = {"tier-cold-compact-frequency"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "604800", + .description = "Frequency to compact DBs on cold tier in system" + }, { .key = {"tier-max-mb"}, .type = GF_OPTION_TYPE_INT, .default_value = "4000", diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c index 0d53a62d327..7e5e1004b84 100644 --- a/xlators/cluster/dht/src/tier.c +++ b/xlators/cluster/dht/src/tier.c @@ -1240,15 +1240,15 @@ tier_process_ctr_query (tier_brick_list_t *local_brick, void *args) gfdb_brick_info->time_stamp, sizeof (gfdb_time_t)); - SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, - GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_QUERY_OPS, - ret, out); + SET_DB_PARAM_TO_DICT (this->name, ctr_ipc_in_dict, + GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_QUERY_OPS, + ret, out); - SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, - GFDB_IPC_CTR_GET_QFILE_PATH, - local_brick->qfile_path, - ret, out); + SET_DB_PARAM_TO_DICT (this->name, ctr_ipc_in_dict, + GFDB_IPC_CTR_GET_QFILE_PATH, + local_brick->qfile_path, + ret, out); ret = dict_set_bin (ctr_ipc_in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS, ipc_ctr_params, sizeof (*ipc_ctr_params)); @@ -1360,7 +1360,7 @@ tier_process_brick (tier_brick_list_t *local_brick, void *args) { gf_msg ("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED, "Failed to set %s " "to params dictionary", - GFDB_IPC_CTR_GET_DB_KEY);\ + GFDB_IPC_CTR_GET_DB_KEY); goto out; } @@ -1442,11 +1442,12 @@ tier_build_migration_qfile (migration_args_t *args, } time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; - /* The migration daemon may run a varrying numberof usec after the sleep */ - /* call triggers. A file may be registered in CTR some number of usec X */ - /* after the daemon started and missed in the subsequent cycle if the */ - /* daemon starts Y usec after the period in seconds where Y>X. Normalize */ - /* away this problem by always setting usec to 0. */ + /* The migration daemon may run a varying numberof usec after the */ + /* sleep call triggers. A file may be registered in CTR some number */ + /* of usec X after the daemon started and missed in the subsequent */ + /* cycle if the daemon starts Y usec after the period in seconds */ + /* where Y>X. Normalize away this problem by always setting usec */ + /* to 0. */ time_in_past.tv_usec = 0; gfdb_brick_info.time_stamp = &time_in_past; @@ -1649,6 +1650,265 @@ out: return ret; } + +/* + * Command the CTR on a brick to compact the local database using an IPC + */ +static int +tier_process_self_compact (tier_brick_list_t *local_brick, void *args) +{ + int ret = -1; + char *db_path = NULL; + query_cbk_args_t *query_cbk_args = NULL; + xlator_t *this = NULL; + gfdb_conn_node_t *conn_node = NULL; + dict_t *params_dict = NULL; + dict_t *ctr_ipc_dict = NULL; + gfdb_brick_info_t *gfdb_brick_info = args; + int is_changing = -1; + + /*Init of all the essentials*/ + GF_VALIDATE_OR_GOTO ("tier", gfdb_brick_info , out); + query_cbk_args = gfdb_brick_info->_query_cbk_args; + + GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out); + this = query_cbk_args->this; + + GF_VALIDATE_OR_GOTO (this->name, + gfdb_brick_info->_query_cbk_args, out); + + GF_VALIDATE_OR_GOTO (this->name, local_brick, out); + + GF_VALIDATE_OR_GOTO (this->name, local_brick->xlator, out); + + GF_VALIDATE_OR_GOTO (this->name, local_brick->brick_db_path, out); + + db_path = local_brick->brick_db_path; + + /*Preparing DB parameters before init_db i.e getting db connection*/ + params_dict = dict_new (); + if (!params_dict) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "DB Params cannot initialized"); + goto out; + } + SET_DB_PARAM_TO_DICT (this->name, params_dict, + (char *) gfdb_methods.get_db_path_key(), db_path, + ret, out); + + /*Get the db connection*/ + conn_node = gfdb_methods.init_db ((void *)params_dict, + dht_tier_db_type); + if (!conn_node) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "FATAL: Failed initializing db operations"); + goto out; + } + + ret = 0; + + /*Preparing ctr_ipc_dict*/ + ctr_ipc_dict = dict_new (); + if (!ctr_ipc_dict) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "ctr_ipc_dict cannot initialized"); + goto out; + } + + ret = dict_set_int32 (ctr_ipc_dict, "compact_active", + query_cbk_args->defrag-> + tier_conf.compact_active); + + if (ret) { + gf_msg ("tier", GF_LOG_ERROR, 0, + LG_MSG_SET_PARAM_FAILED, "Failed to set %s " + "to params dictionary", + "compact_active"); + goto out; + } + + ret = dict_set_int32 (ctr_ipc_dict, "compact_mode_switched", + query_cbk_args->defrag-> + tier_conf.compact_mode_switched); + + if (ret) { + gf_msg ("tier", GF_LOG_ERROR, 0, + LG_MSG_SET_PARAM_FAILED, "Failed to set %s " + "to params dictionary", + "compact_mode_switched"); + goto out; + } + + SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, + GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_SET_COMPACT_PRAGMA, + ret, out); + + gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, + "Starting Compaction IPC"); + + ret = syncop_ipc (local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict, + NULL); + + gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, + "Ending Compaction IPC"); + + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, "Failed compaction " + "on db %s error %d", local_brick->brick_db_path, ret); + goto out; + } + + gf_msg (this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, + "SUCCESS: %s Compaction", local_brick->brick_name); + + ret = 0; +out: + if (params_dict) { + dict_unref (params_dict); + params_dict = NULL; + } + + if (ctr_ipc_dict) { + dict_unref (ctr_ipc_dict); + ctr_ipc_dict = NULL; + } + + gfdb_methods.fini_db (conn_node); + + return ret; +} + +/* + * This is the call back function for each brick from hot/cold bricklist. + * It determines the database type on each brick and calls the corresponding + * function to prepare the compaction IPC. + */ +static int +tier_compact_db_brick (tier_brick_list_t *local_brick, void *args) +{ + int ret = -1; + char *strval = NULL; + + GF_VALIDATE_OR_GOTO ("tier", local_brick, out); + + GF_VALIDATE_OR_GOTO ("tier", local_brick->xlator, out); + + ret = tier_process_self_compact (local_brick, args); + if (ret) { + gf_msg ("tier", GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, + "Brick %s did not compact", + local_brick->brick_name); + goto out; + } + + ret = 0; + +out: + + return ret; +} + +static int +tier_send_compact (migration_args_t *args, + query_cbk_args_t *query_cbk_args) +{ + gfdb_time_t current_time; + gfdb_brick_info_t gfdb_brick_info; + gfdb_time_t time_in_past; + int ret = -1; + tier_brick_list_t *local_brick = NULL; + + time_in_past.tv_sec = args->freq_time; + time_in_past.tv_usec = 0; + + ret = gettimeofday (¤t_time, NULL); + if (ret == -1) { + gf_msg (args->this->name, GF_LOG_ERROR, errno, + DHT_MSG_SYS_CALL_GET_TIME_FAILED, + "Failed to get current time"); + goto out; + } + time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; + + /* The migration daemon may run a varying numberof usec after the sleep + call triggers. A file may be registered in CTR some number of usec X + after the daemon started and missed in the subsequent cycle if the + daemon starts Y usec after the period in seconds where Y>X. Normalize + away this problem by always setting usec to 0. */ + time_in_past.tv_usec = 0; + + gfdb_brick_info.time_stamp = &time_in_past; + + /* This is meant to say we are always compacting at this point */ + /* We simply borrow the promotion flag to do this */ + gfdb_brick_info._gfdb_promote = 1; + + gfdb_brick_info._query_cbk_args = query_cbk_args; + + list_for_each_entry (local_brick, args->brick_list, list) { + + gf_msg (args->this->name, GF_LOG_TRACE, 0, + DHT_MSG_LOG_TIER_STATUS, + "Start compaction for %s", + local_brick->brick_name); + + ret = tier_compact_db_brick (local_brick, + &gfdb_brick_info); + if (ret) { + gf_msg (args->this->name, GF_LOG_ERROR, 0, + DHT_MSG_BRICK_QUERY_FAILED, + "Brick %s compaction failed\n", + local_brick->brick_db_path); + } + + gf_msg (args->this->name, GF_LOG_TRACE, 0, + DHT_MSG_LOG_TIER_STATUS, + "End compaction for %s", + local_brick->brick_name); + + } + ret = 0; +out: + return ret; +} + +static int +tier_compact (void *args) +{ + int ret = -1; + query_cbk_args_t query_cbk_args; + migration_args_t *compaction_args = args; + + GF_VALIDATE_OR_GOTO ("tier", compaction_args->this, out); + GF_VALIDATE_OR_GOTO (compaction_args->this->name, + compaction_args->brick_list, out); + GF_VALIDATE_OR_GOTO (compaction_args->this->name, + compaction_args->defrag, out); + + THIS = compaction_args->this; + + query_cbk_args.this = compaction_args->this; + query_cbk_args.defrag = compaction_args->defrag; + query_cbk_args.is_compaction = 1; + + /* Send the compaction pragma out to all the bricks on the bricklist. */ + /* tier_get_bricklist ensures all bricks on the list are local to */ + /* this node. */ + ret = tier_send_compact (compaction_args, &query_cbk_args); + if (ret) + goto out; + + ret = 0; +out: + compaction_args->return_value = ret; + return ret; + } + static int tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head) { @@ -1755,6 +2015,18 @@ tier_get_freq_promote (gf_tier_conf_t *tier_conf) return tier_conf->tier_promote_frequency; } +int +tier_get_freq_compact_hot (gf_tier_conf_t *tier_conf) +{ + return tier_conf->tier_compact_hot_frequency; +} + +int +tier_get_freq_compact_cold (gf_tier_conf_t *tier_conf) +{ + return tier_conf->tier_compact_cold_frequency; +} + static int tier_check_demote (gfdb_time_t current_time, int freq) { @@ -1776,8 +2048,21 @@ tier_check_promote (gf_tier_conf_t *tier_conf, _gf_true : _gf_false; } +static gf_boolean_t +tier_check_compact (gf_tier_conf_t *tier_conf, + gfdb_time_t current_time, + int freq_compact) +{ + + if (!(tier_conf->compact_active || + tier_conf->compact_mode_switched)) + return _gf_false; + return ((current_time.tv_sec % freq_compact) == 0) ? + _gf_true : _gf_false; +} + void clear_bricklist (struct list_head *brick_list) @@ -1824,6 +2109,72 @@ out: return; } +static int +tier_prepare_compact (migration_args_t *args, gfdb_time_t current_time) +{ + xlator_t *this = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + gf_tier_conf_t *tier_conf = NULL; + gf_boolean_t is_hot_tier = _gf_false; + int freq = 0; + int ret = -1; + const char *tier_type = is_hot_tier ? "hot" : "cold"; + + this = args->this; + + conf = this->private; + + defrag = conf->defrag; + + tier_conf = &defrag->tier_conf; + + is_hot_tier = args->is_hot_tier; + + freq = is_hot_tier ? tier_get_freq_compact_hot (tier_conf) : + tier_get_freq_compact_cold (tier_conf); + + defrag->tier_conf.compact_mode_switched = is_hot_tier ? + defrag->tier_conf.compact_mode_switched_hot : + defrag->tier_conf.compact_mode_switched_cold; + + gf_msg(this->name, GF_LOG_TRACE, 0, + DHT_MSG_LOG_TIER_STATUS, + "Compact mode %i", + defrag->tier_conf.compact_mode_switched); + + if (tier_check_compact (tier_conf, current_time, + freq)) { + gf_msg (this->name, GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, + "Start compaction on %s tier", + tier_type); + + args->freq_time = freq; + ret = tier_compact (args); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, "Compaction failed on " + "%s tier", tier_type); + goto out; + } + + gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, + "End compaction on %s tier", tier_type); + + if (is_hot_tier) { + defrag->tier_conf.compact_mode_switched_hot = + _gf_false; + } else { + defrag->tier_conf.compact_mode_switched_cold = + _gf_false; + } + } + +out: + return ret; +} + /* * Main tiering loop. This is called from the promotion and the * demotion threads spawned in tier_start(). @@ -1846,8 +2197,9 @@ static void int check_watermark = 0; gf_defrag_info_t *defrag = NULL; xlator_t *this = NULL; + struct list_head *bricklist_temp = NULL; migration_args_t *args = in_args; - + gf_boolean_t compacted = _gf_false; GF_VALIDATE_OR_GOTO ("tier", args, out); GF_VALIDATE_OR_GOTO ("tier", args->brick_list, out); @@ -1884,7 +2236,8 @@ static void if (xlator != this) { gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, - "Detected graph switch. Exiting migration daemon."); + "Detected graph switch. Exiting migration " + "daemon."); goto out; } @@ -1912,10 +2265,10 @@ static void goto out; } - if (gf_defrag_get_pause_state (&defrag->tier_conf) != TIER_RUNNING) + if (gf_defrag_get_pause_state (&defrag->tier_conf) != + TIER_RUNNING) continue; - /* To have proper synchronization amongst all * brick holding nodes, so that promotion and demotions * start atomicly w.r.t promotion/demotion frequency @@ -1950,7 +2303,6 @@ static void } if (args->is_promotion) { - freq = tier_get_freq_promote (tier_conf); if (tier_check_promote (tier_conf, current_time, freq)) { @@ -1962,21 +2314,22 @@ static void "Promotion failed"); } } - + } else if (args->is_compaction) { + tier_prepare_compact (args, current_time); } else { - freq = tier_get_freq_demote (tier_conf); if (tier_check_demote (current_time, freq)) { args->freq_time = freq; ret = tier_demote (args); if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, + gf_msg (this->name, + GF_LOG_ERROR, + 0, DHT_MSG_LOG_TIER_ERROR, "Demotion failed"); } } - } /* Check the statfs immediately after the processing threads @@ -1997,11 +2350,15 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) { pthread_t promote_thread; pthread_t demote_thread; + pthread_t hot_compact_thread; + pthread_t cold_compact_thread; int ret = -1; struct list_head bricklist_hot = { 0 }; struct list_head bricklist_cold = { 0 }; migration_args_t promotion_args = { 0 }; migration_args_t demotion_args = { 0 }; + migration_args_t hot_compaction_args = { 0 }; + migration_args_t cold_compaction_args = { 0 }; dht_conf_t *conf = NULL; INIT_LIST_HEAD ((&bricklist_hot)); @@ -2016,6 +2373,7 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) demotion_args.brick_list = &bricklist_hot; demotion_args.defrag = defrag; demotion_args.is_promotion = _gf_false; + demotion_args.is_compaction = _gf_false; ret = pthread_create (&demote_thread, NULL, &tier_run, @@ -2047,6 +2405,47 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) goto waitforspawned; } + hot_compaction_args.this = this; + hot_compaction_args.brick_list = &bricklist_hot; + hot_compaction_args.defrag = defrag; + hot_compaction_args.is_promotion = _gf_false; + hot_compaction_args.is_compaction = _gf_true; + hot_compaction_args.is_hot_tier = _gf_true; + + ret = pthread_create (&hot_compact_thread, + NULL, &tier_run, + &hot_compaction_args); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "Failed to start compaction thread."); + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + goto waitforspawnedpromote; + } + + cold_compaction_args.this = this; + cold_compaction_args.brick_list = &bricklist_cold; + cold_compaction_args.defrag = defrag; + cold_compaction_args.is_promotion = _gf_false; + cold_compaction_args.is_compaction = _gf_true; + cold_compaction_args.is_hot_tier = _gf_false; + + ret = pthread_create (&cold_compact_thread, + NULL, &tier_run, + &cold_compaction_args); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "Failed to start compaction thread."); + defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; + goto waitforspawnedhotcompact; + } + pthread_join (cold_compact_thread, NULL); + +waitforspawnedhotcompact: + pthread_join (hot_compact_thread, NULL); + +waitforspawnedpromote: pthread_join (promote_thread, NULL); waitforspawned: @@ -2055,7 +2454,6 @@ waitforspawned: cleanup: clear_bricklist (&bricklist_cold); clear_bricklist (&bricklist_hot); - return ret; } @@ -2167,8 +2565,8 @@ out: return ret; } -static -int tier_validate_mode (char *mode) +static int +tier_validate_mode (char *mode) { int ret = -1; @@ -2181,6 +2579,26 @@ int tier_validate_mode (char *mode) return ret; } +static gf_boolean_t +tier_validate_compact_mode (char *mode) +{ + gf_boolean_t ret = _gf_false; + + gf_msg ("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, + "tier_validate_compact_mode: mode = %s", mode); + + if (!strcmp (mode, "on")) { + ret = _gf_true; + } else { + ret = _gf_false; + } + + gf_msg ("tier", GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_STATUS, + "tier_validate_compact_mode: ret = %i", ret); + + return ret; +} int tier_init_methods (xlator_t *this) @@ -2205,8 +2623,6 @@ err: return ret; } - - int tier_init (xlator_t *this) { @@ -2291,6 +2707,22 @@ tier_init (xlator_t *this) defrag->tier_conf.tier_demote_frequency = freq; ret = dict_get_int32 (this->options, + "tier-hot-compact-frequency", &freq); + if (ret) { + freq = DEFAULT_HOT_COMPACT_FREQ_SEC; + } + + defrag->tier_conf.tier_compact_hot_frequency = freq; + + ret = dict_get_int32 (this->options, + "tier-cold-compact-frequency", &freq); + if (ret) { + freq = DEFAULT_COLD_COMPACT_FREQ_SEC; + } + + defrag->tier_conf.tier_compact_cold_frequency = freq; + + ret = dict_get_int32 (this->options, "watermark-hi", &freq); if (ret) { freq = DEFAULT_WM_HI; @@ -2339,6 +2771,29 @@ tier_init (xlator_t *this) defrag->tier_conf.max_migrate_files = freq; ret = dict_get_str (this->options, + "tier-compact", &mode); + + if (ret) { + defrag->tier_conf.compact_active = DEFAULT_COMP_MODE; + } else { + ret = tier_validate_compact_mode (mode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "tier_init failed - invalid compaction mode"); + goto out; + } + + /* If compaction is now active, we need to inform the bricks on + the hot and cold tier of this. See dht-common.h for more. */ + defrag->tier_conf.compact_active = ret; + if (ret) { + defrag->tier_conf.compact_mode_switched_hot = _gf_true; + defrag->tier_conf.compact_mode_switched_cold = _gf_true; + } + } + + ret = dict_get_str (this->options, "tier-mode", &mode); if (ret) { defrag->tier_conf.mode = DEFAULT_TIER_MODE; @@ -2361,7 +2816,8 @@ tier_init (xlator_t *this) "tier-pause", &paused); if (paused && strcmp (paused, "on") == 0) - gf_defrag_set_pause_state (&defrag->tier_conf, TIER_REQUEST_PAUSE); + gf_defrag_set_pause_state (&defrag->tier_conf, + TIER_REQUEST_PAUSE); ret = gf_asprintf(&voldir, "%s/%s", DEFAULT_VAR_RUN_DIRECTORY, @@ -2411,7 +2867,6 @@ out: return ret; } - int tier_cli_pause_done (int op_ret, call_frame_t *sync_frame, void *data) { @@ -2445,17 +2900,17 @@ exit: return ret; } - int tier_reconfigure (xlator_t *this, dict_t *options) { - dht_conf_t *conf = NULL; - gf_defrag_info_t *defrag = NULL; - char *mode = NULL; - int migrate_mb = 0; - gf_boolean_t req_pause = _gf_false; - int ret = 0; - call_frame_t *frame = NULL; + dht_conf_t *conf = NULL; + gf_defrag_info_t *defrag = NULL; + char *mode = NULL; + int migrate_mb = 0; + gf_boolean_t req_pause = _gf_false; + int ret = 0; + call_frame_t *frame = NULL; + gf_boolean_t last_compact_setting = _gf_false; conf = this->private; @@ -2489,6 +2944,28 @@ tier_reconfigure (xlator_t *this, dict_t *options) defrag->tier_conf.watermark_low, options, int32, out); + last_compact_setting = defrag->tier_conf.compact_active; + + GF_OPTION_RECONF ("tier-compact", + defrag->tier_conf.compact_active, options, + bool, out); + + if (last_compact_setting != defrag->tier_conf.compact_active) { + defrag->tier_conf.compact_mode_switched_hot = _gf_true; + defrag->tier_conf.compact_mode_switched_cold = _gf_true; + gf_msg (this->name, GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, + "compact mode switched"); + } + + GF_OPTION_RECONF ("tier-hot-compact-frequency", + defrag->tier_conf.tier_compact_hot_frequency, + options, int32, out); + + GF_OPTION_RECONF ("tier-cold-compact-frequency", + defrag->tier_conf.tier_compact_cold_frequency, + options, int32, out); + GF_OPTION_RECONF ("tier-mode", mode, options, str, out); @@ -2558,7 +3035,6 @@ class_methods_t class_methods = { .notify = dht_notify }; - struct xlator_fops fops = { .lookup = dht_lookup, @@ -2611,9 +3087,7 @@ struct xlator_fops fops = { .zerofill = dht_zerofill, }; - struct xlator_cbks cbks = { .release = dht_release, .forget = dht_forget }; - diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h index 0807608fda2..ffb04173bd5 100644 --- a/xlators/cluster/dht/src/tier.h +++ b/xlators/cluster/dht/src/tier.h @@ -54,6 +54,7 @@ typedef struct _query_cbk_args { /* This is write */ int query_fd; int is_promotion; + int is_compaction; /* This is for read */ tier_qfile_array_t *qfile_array; } query_cbk_args_t; @@ -82,6 +83,8 @@ typedef struct _dm_thread_args { int freq_time; int return_value; int is_promotion; + int is_compaction; + gf_boolean_t is_hot_tier; } migration_args_t; typedef enum tier_watermark_op_ { @@ -93,12 +96,15 @@ typedef enum tier_watermark_op_ { #define DEFAULT_PROMOTE_FREQ_SEC 120 #define DEFAULT_DEMOTE_FREQ_SEC 120 +#define DEFAULT_HOT_COMPACT_FREQ_SEC 604800 +#define DEFAULT_COLD_COMPACT_FREQ_SEC 604800 #define DEFAULT_DEMOTE_DEGRADED 10 #define DEFAULT_WRITE_FREQ_SEC 0 #define DEFAULT_READ_FREQ_SEC 0 #define DEFAULT_WM_LOW 75 #define DEFAULT_WM_HI 90 #define DEFAULT_TIER_MODE TIER_MODE_TEST +#define DEFAULT_COMP_MODE _gf_true #define DEFAULT_TIER_MAX_MIGRATE_MB 1000 #define DEFAULT_TIER_MAX_MIGRATE_FILES 5000 |