diff options
author | Dan Lambright <dlambrig@redhat.com> | 2015-09-18 00:49:06 -0400 |
---|---|---|
committer | Dan Lambright <dlambrig@redhat.com> | 2015-10-10 06:00:04 -0700 |
commit | 672baab88fb7f32e844cd4be22e0924e4e0e83fc (patch) | |
tree | 8a05387b3508cc8f15ec448ab2e7b2baac897413 /xlators/cluster/dht | |
parent | a4f982be9b21323038704069a56fb2448369d6a0 (diff) |
cluster/tier: add watermarks and policy driver
This fix introduces infrastructure to support different
policies for promotion and demotion.
Currently the tier feature automatically promotes and demotes
files periodically based on access. This is good for testing
but too stringent for most real workloads. It makes it
difficult to fully utilize a hot tier: data will be demoted
before it is touched, because it is unlikely a 100GB hot SSD will have
all its data touched in a window of time.
A new parameter "mode" allows the user to pick promotion/demotion
policies.
The "test mode" will be used for *.t and other general testing.
This is the current mechanism.
The "cache mode" introduces watermarks. The watermarks
represent levels of data residing on the hot tier.
"cache mode" policy:
The % the hot tier is full is called P.
Do not promote or demote more than D MB or F files.
A random number in the range [0-99] is called R.
Rules for migration:
if (P < watermark_low) don't demote, always promote.
if (P >= watermark_low) && (P < watermark_hi) demote if R <= P; promote if R > P.
if (P >= watermark_hi) always demote, don't promote.
gluster volume set {vol} cluster.watermark-hi %
gluster volume set {vol} cluster.watermark-low %
gluster volume set {vol} cluster.tier-max-mb {D}
gluster volume set {vol} cluster.tier-max-files {F}
gluster volume set {vol} cluster.tier-mode {test|cache}
Change-Id: I157f19667ec95aa1d53406041c1e3b073be127c2
BUG: 1257911
Signed-off-by: Dan Lambright <dlambrig@redhat.com>
Reviewed-on: http://review.gluster.org/12039
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
Diffstat (limited to 'xlators/cluster/dht')
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 28 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 8 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 25 | ||||
-rw-r--r-- | xlators/cluster/dht/src/tier.c | 471 | ||||
-rw-r--r-- | xlators/cluster/dht/src/tier.h | 24 |
5 files changed, 456 insertions, 100 deletions
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index c48bf5800b9..95ca7067806 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -332,6 +332,29 @@ struct dht_container { dict_t *migrate_data; }; +typedef enum tier_mode_ { + TIER_MODE_NONE = 0, + TIER_MODE_TEST, + TIER_MODE_WM +} tier_mode_t; + +typedef struct gf_tier_conf { + int is_tier; + int watermark_hi; + int watermark_low; + int watermark_last; + fsblkcnt_t blocks_total; + fsblkcnt_t blocks_used; + int percent_full; + uint64_t max_migrate_bytes; + int max_migrate_files; + tier_mode_t mode; + int tier_promote_frequency; + int tier_demote_frequency; + uint64_t st_last_promoted_size; + uint64_t st_last_demoted_size; +} gf_tier_conf_t; + struct gf_defrag_info_ { uint64_t total_files; uint64_t total_data; @@ -352,8 +375,7 @@ struct gf_defrag_info_ { gf_boolean_t stats; uint32_t new_commit_hash; gf_defrag_pattern_list_t *defrag_pattern; - int tier_promote_frequency; - int tier_demote_frequency; + gf_tier_conf_t tier_conf; /*Data Tiering params for scanner*/ uint64_t total_files_promoted; @@ -1088,5 +1110,7 @@ int32_t dht_set_local_rebalance (xlator_t *this, dht_local_t *local, struct iatt *stbuf, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata); +void +dht_build_root_loc (inode_t *inode, loc_t *loc); #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index c53c7a99882..7dc89d8a069 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -1326,6 +1326,14 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } } + /* store size of previous migrated file */ + if (defrag->tier_conf.is_tier) { + if (from == conf->subvolumes[0]) { + defrag->tier_conf.st_last_promoted_size = stbuf.ia_size; + } else { + defrag->tier_conf.st_last_demoted_size = stbuf.ia_size; + } + } /* The src file is being 
unlinked after this so we don't need to clean it up */ diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 2436eba2a0c..4d700482919 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -978,17 +978,32 @@ struct volume_options options[] = { { .key = {"write-freq-threshold"}, .type = GF_OPTION_TYPE_INT, .default_value = "0", - .description = "Defines the write fequency " - "that would be considered hot" }, { .key = {"read-freq-threshold"}, .type = GF_OPTION_TYPE_INT, .default_value = "0", - .description = "Defines the read fequency " - "that would be considered hot" }, - + { .key = {"watermark-hi"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "90", + }, + { .key = {"watermark-low"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "75", + }, + { .key = {"tier-mode"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "test", + }, + { .key = {"tier-max-mb"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "1000", + }, + { .key = {"tier-max-files"}, + .type = GF_OPTION_TYPE_INT, + .default_value = "5000", + }, /* switch option */ { .key = {"pattern.switch.case"}, .type = GF_OPTION_TYPE_ANY diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c index ff01862bed9..860b1f7da9a 100644 --- a/xlators/cluster/dht/src/tier.c +++ b/xlators/cluster/dht/src/tier.c @@ -114,6 +114,120 @@ out: return ret; } +int +tier_do_migration (xlator_t *this, int promote) +{ + gf_defrag_info_t *defrag = NULL; + dht_conf_t *conf = NULL; + long rand = 0; + int migrate = 0; + gf_tier_conf_t *tier_conf = NULL; + + conf = this->private; + if (!conf) + goto exit; + + defrag = conf->defrag; + if (!defrag) + goto exit; + + if (defrag->tier_conf.mode != TIER_MODE_WM) { + migrate = 1; + goto exit; + } + + tier_conf = &defrag->tier_conf; + + switch (tier_conf->watermark_last) { + case TIER_WM_LOW: + migrate = promote ? 1 : 0; + break; + case TIER_WM_HI: + migrate = promote ? 
0 : 1; + break; + case TIER_WM_MID: + rand = random() % 100; + if (promote) { + migrate = (rand > tier_conf->percent_full); + } else { + migrate = (rand <= tier_conf->percent_full); + } + break; + } + +exit: + return migrate; +} + +int +tier_check_watermark (xlator_t *this, loc_t *root_loc) +{ + tier_watermark_op_t wm = TIER_WM_NONE; + int ret = -1; + gf_defrag_info_t *defrag = NULL; + dht_conf_t *conf = NULL; + dict_t *xdata = NULL; + struct statvfs statfs = {0, }; + gf_tier_conf_t *tier_conf = NULL; + + conf = this->private; + if (!conf) + goto exit; + + defrag = conf->defrag; + if (!defrag) + goto exit; + + tier_conf = &defrag->tier_conf; + + if (tier_conf->mode != TIER_MODE_WM) { + ret = 0; + goto exit; + } + + /* Find how much free space is on the hot subvolume. Then see if that value */ + /* is less than or greater than user defined watermarks. Stash results in */ + /* the tier_conf data structure. */ + ret = syncop_statfs (conf->subvolumes[1], root_loc, &statfs, + xdata, NULL); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, -ret, + DHT_MSG_LOG_TIER_STATUS, + "Unable to obtain statfs."); + goto exit; + } + + pthread_mutex_lock (&dm_stat_mutex); + + tier_conf->blocks_total = statfs.f_blocks; + tier_conf->blocks_used = statfs.f_blocks - statfs.f_bfree; + + tier_conf->percent_full = (100 * tier_conf->blocks_used) / + statfs.f_blocks; + pthread_mutex_unlock (&dm_stat_mutex); + + if (tier_conf->percent_full < tier_conf->watermark_low) { + wm = TIER_WM_LOW; + + } else if (tier_conf->percent_full < tier_conf->watermark_hi) { + wm = TIER_WM_MID; + + } else { + wm = TIER_WM_HI; + } + + if (wm != tier_conf->watermark_last) { + + tier_conf->watermark_last = wm; + gf_msg (this->name, GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, + "Tier watermark now %d", wm); + } + +exit: + return ret; +} + static int tier_migrate_using_query_file (void *_args) { @@ -141,6 +255,8 @@ tier_migrate_using_query_file (void *_args) char *link_str = NULL; xlator_t *src_subvol = NULL; dht_conf_t 
*conf = NULL; + uint64_t total_migrated_bytes = 0; + int total_files = 0; GF_VALIDATE_OR_GOTO ("tier", query_cbk_args, out); GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->this, out); @@ -155,14 +271,20 @@ tier_migrate_using_query_file (void *_args) queryFILE = query_cbk_args->queryFILE; - query_record = gfdb_query_record_init(); + query_record = gfdb_query_record_init (); if (!query_record) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "Call to gfdb_query_record_init() failed."); goto out; } query_record->_link_info_str = GF_CALLOC (1, DB_QUERY_RECORD_SIZE, gf_common_mt_char); if (!query_record->_link_info_str) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "Allocating query record link info string failed."); goto out; } link_buffer = query_record->_link_info_str; @@ -191,13 +313,14 @@ tier_migrate_using_query_file (void *_args) continue; } + if (!tier_do_migration (this, query_cbk_args->is_promotion)) + continue; + gf_uuid_parse (gfid_str, query_record->gfid); - if (dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY)) - dict_del(migrate_data, GF_XATTR_FILE_MIGRATE_KEY); + dict_del (migrate_data, GF_XATTR_FILE_MIGRATE_KEY); - if (dict_get(migrate_data, "from.migrator")) - dict_del(migrate_data, "from.migrator"); + dict_del (migrate_data, "from.migrator"); token_str = strtok (link_buffer, delimiter); if (token_str != NULL) { @@ -235,6 +358,7 @@ tier_migrate_using_query_file (void *_args) } per_link_status = 0; + /* Per link of file */ while (token_str != NULL) { @@ -270,9 +394,9 @@ tier_migrate_using_query_file (void *_args) ret = syncop_lookup (this, &p_loc, &par_stbuf, NULL, NULL, NULL); if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, + gf_msg (this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR, - " ERROR in parent lookup\n"); + " Error in parent lookup\n"); per_link_status = -1; goto abort; } @@ -284,7 +408,7 @@ tier_migrate_using_query_file (void *_args) gf_uuid_copy (loc.gfid, query_record->gfid); loc.inode = 
inode_new (defrag->root_inode->table); gf_uuid_copy (loc.pargfid, link_info->pargfid); - loc.parent = inode_ref(p_loc.inode); + loc.parent = inode_ref (p_loc.inode); loc.name = gf_strdup (link_info->file_name); if (!loc.name) { @@ -325,7 +449,10 @@ tier_migrate_using_query_file (void *_args) * should be. It means another brick moved the file * so is not an error. */ - src_subvol = dht_subvol_get_cached(this, loc.inode); + src_subvol = dht_subvol_get_cached (this, loc.inode); + + if (src_subvol == NULL) + goto abort; if (query_cbk_args->is_promotion && src_subvol == conf->subvolumes[1]) { @@ -363,18 +490,48 @@ tier_migrate_using_query_file (void *_args) goto abort; } - if (query_cbk_args->is_promotion) + if (query_cbk_args->is_promotion) { defrag->total_files_promoted++; - else + total_migrated_bytes += + defrag->tier_conf.st_last_promoted_size; + pthread_mutex_lock (&dm_stat_mutex); + defrag->tier_conf.blocks_used += + defrag->tier_conf.st_last_promoted_size; + pthread_mutex_unlock (&dm_stat_mutex); + } else { defrag->total_files_demoted++; + total_migrated_bytes += + defrag->tier_conf.st_last_demoted_size; + pthread_mutex_lock (&dm_stat_mutex); + defrag->tier_conf.blocks_used -= + defrag->tier_conf.st_last_demoted_size; + pthread_mutex_unlock (&dm_stat_mutex); + } + if (defrag->tier_conf.blocks_total) { + pthread_mutex_lock (&dm_stat_mutex); + defrag->tier_conf.percent_full = + (100 * defrag->tier_conf.blocks_used) / + defrag->tier_conf.blocks_total; + pthread_mutex_unlock (&dm_stat_mutex); + } abort: - loc_wipe(&loc); loc_wipe(&p_loc); token_str = NULL; token_str = strtok (NULL, delimiter); GF_FREE (link_str); + + if ((++total_files > defrag->tier_conf.max_migrate_files) || + (total_migrated_bytes > defrag->tier_conf.max_migrate_bytes)) { + gf_msg (this->name, GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, + "Reached cycle migration limit." 
+ "migrated bytes %"PRId64" files %d", + total_migrated_bytes, + total_files); + goto out; + } } per_file_status = per_link_status; per_file_out: @@ -417,7 +574,7 @@ tier_gf_query_callback (gfdb_query_record_t *gfdb_query_record, GF_VALIDATE_OR_GOTO ("tier", query_cbk_args->queryFILE, out); gf_uuid_unparse (gfdb_query_record->gfid, gfid_str); - fprintf (query_cbk_args->queryFILE, "%s|%s|%ld\n", gfid_str, + fprintf (query_cbk_args->queryFILE, "%s|%s|%zd\n", gfid_str, gfdb_query_record->_link_info_str, gfdb_query_record->link_info_size); @@ -435,7 +592,7 @@ out: /*Create query file in tier process*/ static int -tier_process_self_query (brick_list_t *local_brick, void *args) +tier_process_self_query (tier_brick_list_t *local_brick, void *args) { int ret = -1; char *db_path = NULL; @@ -477,7 +634,7 @@ tier_process_self_query (brick_list_t *local_brick, void *args) db_path, ret, out); /*Get the db connection*/ - conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type); + conn_node = gfdb_methods.init_db ((void *)params_dict, dht_tier_db_type); if (!conn_node) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, @@ -486,8 +643,8 @@ tier_process_self_query (brick_list_t *local_brick, void *args) } /*Query for eligible files from db*/ - query_cbk_args->queryFILE = fopen(GET_QFILE_PATH - (gfdb_brick_dict_info->_gfdb_promote), "a+"); + query_cbk_args->queryFILE = fopen ( + GET_QFILE_PATH (gfdb_brick_dict_info->_gfdb_promote), "a+"); if (!query_cbk_args->queryFILE) { gf_msg (this->name, GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR, @@ -592,7 +749,7 @@ out: /*Ask CTR to create the query file*/ static int -tier_process_ctr_query (brick_list_t *local_brick, void *args) +tier_process_ctr_query (tier_brick_list_t *local_brick, void *args) { int ret = -1; query_cbk_args_t *query_cbk_args = NULL; @@ -720,7 +877,7 @@ out: * It picks up each bricks db and queries for eligible files for migration. 
* The list of eligible files are populated in appropriate query files*/ static int -tier_process_brick (brick_list_t *local_brick, void *args) { +tier_process_brick (tier_brick_list_t *local_brick, void *args) { int ret = -1; dict_t *ctr_ipc_in_dict = NULL; dict_t *ctr_ipc_out_dict = NULL; @@ -834,7 +991,7 @@ tier_build_migration_qfile (demotion_args_t *args, _gfdb_brick_dict_info_t gfdb_brick_dict_info; gfdb_time_t time_in_past; int ret = -1; - brick_list_t *local_brick = NULL; + tier_brick_list_t *local_brick = NULL; /* * The first time this function is called, query file will @@ -929,8 +1086,8 @@ tier_demote (void *args) query_cbk_args.is_promotion = 0; /*Build the query file using bricklist*/ - ret = tier_build_migration_qfile(demotion_args, &query_cbk_args, - _gf_false); + ret = tier_build_migration_qfile (demotion_args, &query_cbk_args, + _gf_false); if (ret) goto out; @@ -967,8 +1124,8 @@ static void query_cbk_args.is_promotion = 1; /*Build the query file using bricklist*/ - ret = tier_build_migration_qfile(promotion_args, &query_cbk_args, - _gf_true); + ret = tier_build_migration_qfile (promotion_args, &query_cbk_args, + _gf_true); if (ret) goto out; @@ -994,7 +1151,7 @@ tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head) char *brickname = NULL; char db_name[PATH_MAX] = ""; int ret = 0; - brick_list_t *local_brick = NULL; + tier_brick_list_t *local_brick = NULL; GF_VALIDATE_OR_GOTO ("tier", xl, out); GF_VALIDATE_OR_GOTO ("tier", local_bricklist_head, out); @@ -1006,19 +1163,19 @@ tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head) * those running on the same node as the tier daemon. 
*/ if (strcmp(xl->type, "protocol/client") == 0) { - ret = dict_get_str(xl->options, "remote-host", &rh); + ret = dict_get_str (xl->options, "remote-host", &rh); if (ret < 0) goto out; - if (gf_is_local_addr (rh)) { + if (gf_is_local_addr (rh)) { - local_brick = GF_CALLOC (1, sizeof(brick_list_t), + local_brick = GF_CALLOC (1, sizeof(tier_brick_list_t), gf_tier_mt_bricklist_t); if (!local_brick) { goto out; } - ret = dict_get_str(xl->options, "remote-subvolume", + ret = dict_get_str (xl->options, "remote-subvolume", &rv); if (ret < 0) goto out; @@ -1051,7 +1208,7 @@ tier_get_bricklist (xlator_t *xl, struct list_head *local_bricklist_head) } for (child = xl->children; child; child = child->next) { - ret = tier_get_bricklist(child->xlator, local_bricklist_head); + ret = tier_get_bricklist (child->xlator, local_bricklist_head); if (ret) { goto out; } @@ -1070,11 +1227,50 @@ out: return ret; } +int +tier_get_freq_demote (gf_tier_conf_t *tier_conf) +{ + if ((tier_conf->mode == TIER_MODE_WM) && + (tier_conf->watermark_last == TIER_WM_HI)) + return DEFAULT_DEMOTE_DEGRADED; + else + return tier_conf->tier_demote_frequency; +} + +int +tier_get_freq_promote (gf_tier_conf_t *tier_conf) +{ + return tier_conf->tier_promote_frequency; +} + +static int +tier_check_demote (gfdb_time_t current_time, + int freq_demote) +{ + return ((current_time.tv_sec % freq_demote) == 0) ? + _gf_true : _gf_false; +} + +static gf_boolean_t +tier_check_promote (gf_tier_conf_t *tier_conf, + gfdb_time_t current_time, + int freq_promote) +{ + if ((tier_conf->mode == TIER_MODE_WM) && + (tier_conf->watermark_last == TIER_WM_HI)) + return _gf_false; + + else + return ((current_time.tv_sec % freq_promote) == 0) ? 
+ _gf_true : _gf_false; +} + + void clear_bricklist (struct list_head *brick_list) { - brick_list_t *local_brick = NULL; - brick_list_t *temp = NULL; + tier_brick_list_t *local_brick = NULL; + tier_brick_list_t *temp = NULL; if (list_empty(brick_list)) { return; @@ -1105,9 +1301,11 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) pthread_t promote_thread; pthread_t demote_thread; gf_boolean_t is_promotion_triggered = _gf_false; - gf_boolean_t is_demotion_triggered = _gf_false; - xlator_t *any = NULL; - xlator_t *xlator = NULL; + gf_boolean_t is_demotion_triggered = _gf_false; + xlator_t *any = NULL; + xlator_t *xlator = NULL; + gf_tier_conf_t *tier_conf = NULL; + loc_t root_loc = { 0 }; conf = this->private; @@ -1122,6 +1320,9 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) " demote %d", freq_promote, freq_demote); defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; + tier_conf = &defrag->tier_conf; + + dht_build_root_loc (defrag->root_inode, &root_loc); while (1) { @@ -1130,7 +1331,7 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) * thread. It will need to be restarted manually. */ any = THIS->ctx->active->first; - xlator = xlator_search_by_name(any, this->name); + xlator = xlator_search_by_name (any, this->name); if (xlator != this) { gf_msg (this->name, GF_LOG_INFO, 0, @@ -1160,10 +1361,6 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) goto out; } - freq_promote = defrag->tier_promote_frequency; - freq_demote = defrag->tier_demote_frequency; - - /* To have proper synchronization amongst all * brick holding nodes, so that promotion and demotions * start atomicly w.r.t promotion/demotion frequency @@ -1178,18 +1375,29 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag) goto out; } - is_demotion_triggered = ((current_time.tv_sec % - freq_demote) == 0) ? _gf_true : - _gf_false; - is_promotion_triggered = ((current_time.tv_sec % - freq_promote) == 0) ? 
_gf_true : - _gf_false; + freq_demote = tier_get_freq_demote (tier_conf); + + is_demotion_triggered = tier_check_demote (current_time, + freq_demote); + + freq_promote = tier_get_freq_promote(tier_conf); + + is_promotion_triggered = tier_check_promote (tier_conf, + current_time, + freq_promote); /* If no promotion and no demotion is - * scheduled/triggered skip a iteration */ + * scheduled/triggered skip an iteration */ if (!is_promotion_triggered && !is_demotion_triggered) continue; + ret = tier_check_watermark (this, &root_loc); + if (ret != 0) { + gf_msg (this->name, GF_LOG_CRITICAL, errno, + DHT_MSG_LOG_TIER_ERROR, + "Failed to get watermark"); + goto out; + } ret_promotion = -1; ret_demotion = -1; @@ -1297,8 +1505,8 @@ tier_migration_get_dst (xlator_t *this, dht_local_t *local) int32_t ret = -1; gf_defrag_info_t *defrag = NULL; - GF_VALIDATE_OR_GOTO("tier", this, out); - GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO ("tier", this, out); + GF_VALIDATE_OR_GOTO (this->name, this->private, out); conf = this->private; @@ -1332,10 +1540,10 @@ tier_search (xlator_t *this, dht_layout_t *layout, const char *name) int layout_cold = 0; int layout_hot = 1; - GF_VALIDATE_OR_GOTO("tier", this, out); - GF_VALIDATE_OR_GOTO(this->name, layout, out); - GF_VALIDATE_OR_GOTO(this->name, name, out); - GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO ("tier", this, out); + GF_VALIDATE_OR_GOTO (this->name, layout, out); + GF_VALIDATE_OR_GOTO (this->name, name, out); + GF_VALIDATE_OR_GOTO (this->name, this->private, out); conf = this->private; @@ -1389,7 +1597,7 @@ tier_load_externals (xlator_t *this) char *libpathfull = (LIBDIR "/libgfdb.so.0"); get_gfdb_methods_t get_gfdb_methods; - GF_VALIDATE_OR_GOTO("this", this, out); + GF_VALIDATE_OR_GOTO ("this", this, out); libhandle = dlopen (libpathfull, RTLD_NOW); if (!libhandle) { @@ -1420,6 +1628,20 @@ out: return ret; } +static +int tier_validate_mode (char *mode) +{ + int ret = -1; 
+ + if (strcmp (mode, "test") == 0) { + ret = TIER_MODE_TEST; + } else { + ret = TIER_MODE_WM; + } + + return ret; +} + int tier_init (xlator_t *this) { @@ -1428,10 +1650,11 @@ tier_init (xlator_t *this) dht_conf_t *conf = NULL; gf_defrag_info_t *defrag = NULL; char *voldir = NULL; + char *mode = NULL; - ret = dht_init(this); + ret = dht_init (this); if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, + gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, "dht_init failed"); goto out; @@ -1442,7 +1665,7 @@ tier_init (xlator_t *this) conf->methods = &tier_methods; if (conf->subvolume_cnt != 2) { - gf_msg(this->name, GF_LOG_ERROR, 0, + gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, "Invalid number of subvolumes %d", conf->subvolume_cnt); goto out; @@ -1455,7 +1678,7 @@ tier_init (xlator_t *this) } /* if instatiated from server side, load db libraries */ - ret = tier_load_externals(this); + ret = tier_load_externals (this); if (ret) { gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, @@ -1465,13 +1688,15 @@ tier_init (xlator_t *this) defrag = conf->defrag; + defrag->tier_conf.is_tier = 1; + ret = dict_get_int32 (this->options, "tier-promote-frequency", &freq); if (ret) { freq = DEFAULT_PROMOTE_FREQ_SEC; } - defrag->tier_promote_frequency = freq; + defrag->tier_conf.tier_promote_frequency = freq; ret = dict_get_int32 (this->options, "tier-demote-frequency", &freq); @@ -1479,7 +1704,23 @@ tier_init (xlator_t *this) freq = DEFAULT_DEMOTE_FREQ_SEC; } - defrag->tier_demote_frequency = freq; + defrag->tier_conf.tier_demote_frequency = freq; + + ret = dict_get_int32 (this->options, + "watermark-hi", &freq); + if (ret) { + freq = DEFAULT_WM_HI; + } + + defrag->tier_conf.watermark_hi = freq; + + ret = dict_get_int32 (this->options, + "watermark-low", &freq); + if (ret) { + freq = DEFAULT_WM_LOW; + } + + defrag->tier_conf.watermark_low = freq; ret = dict_get_int32 (this->options, "write-freq-threshold", &freq); @@ -1497,7 +1738,38 @@ tier_init 
(xlator_t *this) defrag->read_freq_threshold = freq; - ret = gf_asprintf(&voldir, "%s/%s", + ret = dict_get_int32 (this->options, + "tier-max-mb", &freq); + if (ret) { + freq = DEFAULT_TIER_MAX_MIGRATE_MB; + } + + defrag->tier_conf.max_migrate_bytes = freq * 1024 * 1024; + + ret = dict_get_int32 (this->options, + "tier-max-files", &freq); + if (ret) { + freq = DEFAULT_TIER_MAX_MIGRATE_FILES; + } + + defrag->tier_conf.max_migrate_files = freq; + + ret = dict_get_str (this->options, + "tier-mode", &mode); + if (ret) { + defrag->tier_conf.mode = DEFAULT_TIER_MODE; + } else { + ret = tier_validate_mode (mode); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, + DHT_MSG_LOG_TIER_ERROR, + "tier_init failed - invalid mode"); + goto out; + } + defrag->tier_conf.mode = ret; + } + + ret = gf_asprintf (&voldir, "%s/%s", DEFAULT_VAR_RUN_DIRECTORY, this->name); if (ret < 0) @@ -1505,7 +1777,7 @@ tier_init (xlator_t *this) ret = mkdir_p(voldir, 0777, _gf_true); if (ret == -1 && errno != EEXIST) { - gf_msg(this->name, GF_LOG_ERROR, 0, + gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, "tier_init failed"); @@ -1515,37 +1787,37 @@ tier_init (xlator_t *this) GF_FREE(voldir); - ret = gf_asprintf(&promotion_qfile, "%s/%s/%s-%s", - DEFAULT_VAR_RUN_DIRECTORY, - this->name, - PROMOTION_QFILE, - this->name); + ret = gf_asprintf (&promotion_qfile, "%s/%s/%s-%s", + DEFAULT_VAR_RUN_DIRECTORY, + this->name, + PROMOTION_QFILE, + this->name); if (ret < 0) goto out; - ret = gf_asprintf(&demotion_qfile, "%s/%s/%s-%s", - DEFAULT_VAR_RUN_DIRECTORY, - this->name, - DEMOTION_QFILE, - this->name); + ret = gf_asprintf (&demotion_qfile, "%s/%s/%s-%s", + DEFAULT_VAR_RUN_DIRECTORY, + this->name, + DEMOTION_QFILE, + this->name); if (ret < 0) { - GF_FREE(promotion_qfile); + GF_FREE (promotion_qfile); goto out; } - unlink(promotion_qfile); - unlink(demotion_qfile); + unlink (promotion_qfile); + unlink (demotion_qfile); - gf_msg(this->name, GF_LOG_INFO, 0, - DHT_MSG_LOG_TIER_STATUS, + gf_msg 
(this->name, GF_LOG_INFO, 0, + DHT_MSG_LOG_TIER_STATUS, "Promote/demote frequency %d/%d " "Write/Read freq thresholds %d/%d", - defrag->tier_promote_frequency, - defrag->tier_demote_frequency, + defrag->tier_conf.tier_promote_frequency, + defrag->tier_conf.tier_demote_frequency, defrag->write_freq_threshold, defrag->read_freq_threshold); - gf_msg(this->name, GF_LOG_INFO, 0, + gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, "Promote file %s demote file %s", promotion_qfile, demotion_qfile); @@ -1563,18 +1835,19 @@ tier_reconfigure (xlator_t *this, dict_t *options) { dht_conf_t *conf = NULL; gf_defrag_info_t *defrag = NULL; - + char *mode = NULL; + int migrate_mb = 0; conf = this->private; if (conf->defrag) { defrag = conf->defrag; GF_OPTION_RECONF ("tier-promote-frequency", - defrag->tier_promote_frequency, options, - int32, out); + defrag->tier_conf.tier_promote_frequency, + options, int32, out); GF_OPTION_RECONF ("tier-demote-frequency", - defrag->tier_demote_frequency, options, - int32, out); + defrag->tier_conf.tier_demote_frequency, + options, int32, out); GF_OPTION_RECONF ("write-freq-threshold", defrag->write_freq_threshold, options, @@ -1583,6 +1856,28 @@ tier_reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("read-freq-threshold", defrag->read_freq_threshold, options, int32, out); + + GF_OPTION_RECONF ("watermark-hi", + defrag->tier_conf.watermark_hi, options, + int32, out); + + GF_OPTION_RECONF ("watermark-low", + defrag->tier_conf.watermark_low, options, + int32, out); + + GF_OPTION_RECONF ("tier-mode", + mode, options, + str, out); + defrag->tier_conf.mode = tier_validate_mode (mode); + + GF_OPTION_RECONF ("tier-max-mb", + migrate_mb, options, + int32, out); + defrag->tier_conf.max_migrate_bytes = migrate_mb*1024*1024; + + GF_OPTION_RECONF ("tier-max-files", + defrag->tier_conf.max_migrate_files, options, + int32, out); } out: @@ -1593,10 +1888,10 @@ void tier_fini (xlator_t *this) { if (libhandle) - dlclose(libhandle); + 
dlclose (libhandle); - GF_FREE(demotion_qfile); - GF_FREE(promotion_qfile); + GF_FREE (demotion_qfile); + GF_FREE (promotion_qfile); dht_fini(this); } diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h index d168221fe1d..18ca3269f8b 100644 --- a/xlators/cluster/dht/src/tier.h +++ b/xlators/cluster/dht/src/tier.h @@ -20,10 +20,6 @@ #include <fnmatch.h> #include <signal.h> -#define DEFAULT_PROMOTE_FREQ_SEC 120 -#define DEFAULT_DEMOTE_FREQ_SEC 120 -#define DEFAULT_WRITE_FREQ_SEC 0 -#define DEFAULT_READ_FREQ_SEC 0 /* * Size of timer wheel. We would not promote or demote less * frequently than this number. @@ -60,7 +56,7 @@ typedef struct brick_list { xlator_t *xlator; char *brick_db_path; struct list_head list; -} brick_list_t; +} tier_brick_list_t; typedef struct _dm_thread_args { xlator_t *this; @@ -70,4 +66,22 @@ typedef struct _dm_thread_args { int return_value; } promotion_args_t, demotion_args_t; +typedef enum tier_watermark_op_ { + TIER_WM_NONE = 0, + TIER_WM_LOW, + TIER_WM_HI, + TIER_WM_MID +} tier_watermark_op_t; + +#define DEFAULT_PROMOTE_FREQ_SEC 120 +#define DEFAULT_DEMOTE_FREQ_SEC 120 +#define DEFAULT_DEMOTE_DEGRADED 10 +#define DEFAULT_WRITE_FREQ_SEC 0 +#define DEFAULT_READ_FREQ_SEC 0 +#define DEFAULT_WM_LOW 75 +#define DEFAULT_WM_HI 90 +#define DEFAULT_TIER_MODE TIER_MODE_TEST +#define DEFAULT_TIER_MAX_MIGRATE_MB 1000 +#define DEFAULT_TIER_MAX_MIGRATE_FILES 5000 + #endif |