From 243d61575c093c03b9beb014bf9d097646836e95 Mon Sep 17 00:00:00 2001 From: Jeff Darcy Date: Wed, 7 May 2014 19:31:30 +0000 Subject: dht: make lookup-unhashed=auto do something actually useful The key concept here is to determine whether a directory is "clean" by comparing its last-known-good topology to the current one for the volume. These are stored as "commit hashes" on the directory and the volume root respectively. The volume's commit hash changes whenever a brick is added or removed, and a fix-layout is done. A directory's commit hash changes only when a full rebalance (not just fix-layout) is done on it. If all bricks are present and have a directory commit hash that matches the volume commit hash, then we can assume that every file is in its "proper" place. Therefore, if we look for a file in that proper place and don't find it, we can assume it's not on any other subvolume and *safely* skip the global (broadcast to all) lookup. Change-Id: Id6ce4593ba1f7daffa74cfab591cb45960629ae3 BUG: 1220064 Reviewed-on-master: http://review.gluster.org/#/c/7702/ Signed-off-by: Jeff Darcy Signed-off-by: Shyam Reviewed-on: http://review.gluster.org/10729 Tested-by: Gluster Build System Reviewed-by: Krishnan Parthasarathi Reviewed-by: Vijay Bellur --- xlators/cluster/dht/src/dht-common.c | 87 ++++++++- xlators/cluster/dht/src/dht-common.h | 29 ++- xlators/cluster/dht/src/dht-layout.c | 69 ++++--- xlators/cluster/dht/src/dht-rebalance.c | 109 ++++++++++- xlators/cluster/dht/src/dht-selfheal.c | 316 +++++++++++++++++++++++++++++++- xlators/cluster/dht/src/dht-shared.c | 15 ++ 6 files changed, 577 insertions(+), 48 deletions(-) (limited to 'xlators/cluster/dht') diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 6c0afdbec90..37e07ad77da 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -210,6 +210,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) int ret = -1; 
dht_layout_t *layout = NULL; dht_conf_t *conf = NULL; + uint32_t vol_commit_hash = 0; local = discover_frame->local; layout = local->layout; @@ -279,6 +280,15 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) dht_layout_set (this, local->inode, layout); } + if (!conf->vch_forced) { + ret = dict_get_uint32 (local->xattr, + conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } + DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -459,6 +469,12 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc) "%s: Failed to set dictionary value:key = %s", loc->path, conf->link_xattr_name); + if (__is_root_gfid(local->loc.gfid)) { + ret = dict_set_uint32 (local->xattr_req, + conf->commithash_xattr_name, + sizeof(uint32_t)); + } + call_cnt = conf->subvolume_cnt; local->call_cnt = call_cnt; @@ -655,6 +671,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *copy = NULL; dht_local_t *copy_local = NULL; char gfid[GF_UUID_BUF_SIZE] = {0}; + uint32_t vol_commit_hash = 0; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -667,6 +684,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!conf) goto out; + if (!conf->vch_forced) { + ret = dict_get_uint32 (xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } + gf_uuid_unparse (local->loc.gfid, gfid); LOCK (&frame->lock); @@ -1852,6 +1877,7 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *prev = NULL; int ret = 0; dht_layout_t *parent_layout = NULL; + uint32_t vol_commit_hash = 0; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -1875,6 +1901,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
"fresh_lookup returned for %s with op_ret %d and " "op_errno %d", loc->path, op_ret, op_errno); + if (!conf->vch_forced) { + ret = dict_get_uint32 (xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } + if (ENTRY_MISSING (op_ret, op_errno)) { gf_msg_debug (this->name, 0, "Entry %s missing on subvol %s", @@ -1891,7 +1925,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, &parent_layout); if (ret || !parent_layout) goto out; - if (parent_layout->search_unhashed) { + if (parent_layout->commit_hash + != conf->vol_commit_hash) { + gf_log (this->name, GF_LOG_DEBUG, + "hashes don't match, do global lookup"); local->op_errno = ENOENT; dht_lookup_everywhere (frame, this, loc); return 0; @@ -2078,6 +2115,12 @@ dht_lookup (call_frame_t *frame, xlator_t *this, return 0; } + if (__is_root_gfid(loc->gfid)) { + ret = dict_set_uint32 (local->xattr_req, + conf->commithash_xattr_name, + sizeof(uint32_t)); + } + if (!hashed_subvol) hashed_subvol = dht_subvol_get_hashed (this, loc); local->hashed_subvol = hashed_subvol; @@ -3238,8 +3281,9 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this, conf = this->private; - GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, - op_errno, err); + if (!conf->defrag) + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR); if (!local) { @@ -3338,6 +3382,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, char value[4096] = {0,}; gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; int call_cnt = 0; + uint32_t new_hash = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -3350,8 +3395,10 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, methods = conf->methods; GF_VALIDATE_OR_GOTO (this->name, conf->methods, err); - GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, - op_errno, err); + /* Rebalance daemon is allowed to set 
internal keys */ + if (!conf->defrag) + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR); if (!local) { @@ -3489,6 +3536,22 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_INFO, "fixing the layout of %s", loc->path); + ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash); + if (ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "updating commit hash for %s from %u to %u", + uuid_utoa(loc->gfid), + layout->commit_hash, new_hash); + layout->commit_hash = new_hash; + + ret = dht_update_commit_hash_for_layout (frame); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; + } + ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk, layout); if (ret) { @@ -5377,6 +5440,8 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, goto err; } + local->layout->commit_hash = conf->vol_commit_hash; + STACK_WIND (frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol->fops->mkdir, @@ -6570,10 +6635,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc, ret = snprintf (string, max_string_len, "[Subvol_name: %s, Err: %d , Start: " - "%"PRIu32 " , Stop: %"PRIu32 " ], ", + "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %" + PRIu32 " ], ", layout->list[i].xlator->name, layout->list[i].err, layout->list[i].start, - layout->list[i].stop); + layout->list[i].stop, + layout->list[i].commit_hash); if (ret < 0) return; @@ -6602,10 +6669,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc, ret = snprintf (output_string + off, len - off, "[Subvol_name: %s, Err: %d , Start: " - "%"PRIu32 " , Stop: %"PRIu32 " ], ", + "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %" + PRIu32 " ], ", layout->list[i].xlator->name, layout->list[i].err, layout->list[i].start, - layout->list[i].stop); + layout->list[i].stop, + layout->list[i].commit_hash); if (ret < 0) goto err; diff --git a/xlators/cluster/dht/src/dht-common.h 
b/xlators/cluster/dht/src/dht-common.h index 9a6ed1a889a..45b6cc9e80b 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -31,6 +31,7 @@ #define DHT_PATHINFO_HEADER "DISTRIBUTE:" #define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate" #define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal" +#define DHT_LAYOUT_HASH_INVALID 1 #include @@ -48,6 +49,20 @@ struct dht_layout { special key */ int cnt; int preset; + /* + * The last *configuration* state for which this directory was known + * to be in balance. The corresponding vol_commit_hash changes + * whenever bricks are added or removed. This value changes when a + * (full) rebalance is complete. If they match, it's safe to assume + * that every file is where it should be and there's no need to do + * lookups for files elsewhere. If they don't, then we have to do a + * global lookup to be sure. + */ + uint32_t commit_hash; + /* + * The *runtime* state of the volume, changes when connections to + * bricks are made or lost. + */ int gen; int type; int ref; /* use with dht_conf_t->layout_lock */ @@ -59,6 +74,7 @@ struct dht_layout { */ uint32_t start; uint32_t stop; + uint32_t commit_hash; xlator_t *xlator; } list[]; }; @@ -325,6 +341,7 @@ struct gf_defrag_info_ { uuid_t node_uuid; struct timeval start_time; gf_boolean_t stats; + uint32_t new_commit_hash; gf_defrag_pattern_list_t *defrag_pattern; int tier_promote_frequency; int tier_demote_frequency; @@ -422,6 +439,7 @@ struct dht_conf { /* Support variable xattr names. */ char *xattr_name; char *link_xattr_name; + char *commithash_xattr_name; char *wild_xattr_name; /* Support size-weighted rebalancing (heterogeneous bricks). */ @@ -436,6 +454,13 @@ struct dht_conf { /*local subvol storage for rebalance*/ xlator_t **local_subvols; int32_t local_subvols_cnt; + + /* + * "Commit hash" for this volume topology. Changed whenever bricks + * are added or removed. 
+ */ + uint32_t vol_commit_hash; + gf_boolean_t vch_forced; }; typedef struct dht_conf dht_conf_t; @@ -576,7 +601,7 @@ int dht_layouts_init (xlator_t *this, dht_conf_t *conf); int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int op_ret, int op_errno, dict_t *xattr); -int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, +int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, int pos, int32_t **disk_layout_p); int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, int pos, void *disk_layout_raw, int disk_layout_len); @@ -631,6 +656,7 @@ xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); +int dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol); int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);; void dht_layout_unref (xlator_t *this, dht_layout_t *layout); dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); @@ -649,6 +675,7 @@ int dht_rename_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata); +int dht_update_commit_hash_for_layout (call_frame_t *frame); int dht_fix_directory_layout (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, dht_layout_t *layout); diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 2ed15c5e43c..f88c786a55b 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -267,7 +267,7 @@ dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, goto out; } - disk_layout[0] = hton32 (1); + disk_layout[0] = hton32 (layout->list[pos].commit_hash); disk_layout[1] = hton32 (layout->type); disk_layout[2] = hton32 (layout->list[pos].start); disk_layout[3] = hton32 (layout->list[pos].stop); @@ -288,10 +288,10 @@ int 
dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, int pos, void *disk_layout_raw, int disk_layout_len) { - int cnt = 0; int type = 0; int start_off = 0; int stop_off = 0; + int commit_hash = 0; int disk_layout[4]; if (!disk_layout_raw) { @@ -305,14 +305,6 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, memcpy (disk_layout, disk_layout_raw, disk_layout_len); - cnt = ntoh32 (disk_layout[0]); - if (cnt != 1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - DHT_MSG_INVALID_DISK_LAYOUT, - "Invalid disk layout: Invalid count %d", cnt); - return -1; - } - type = ntoh32 (disk_layout[1]); switch (type) { case DHT_HASH_TYPE_DM_USER: @@ -330,21 +322,22 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, return -1; } + commit_hash = ntoh32 (disk_layout[0]); start_off = ntoh32 (disk_layout[2]); stop_off = ntoh32 (disk_layout[3]); + layout->list[pos].commit_hash = commit_hash; layout->list[pos].start = start_off; layout->list[pos].stop = stop_off; gf_msg_trace (this->name, 0, - "merged to layout: %u - %u (type %d) from %s", - start_off, stop_off, type, + "merged to layout: %u - %u (type %d, hash %d) from %s", + start_off, stop_off, commit_hash, type, layout->list[pos].xlator->name); return 0; } - int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int op_ret, int op_errno, dict_t *xattr) @@ -397,6 +390,13 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, subvol->name); goto out; } + + if (layout->commit_hash == 0) { + layout->commit_hash = layout->list[i].commit_hash; + } else if (layout->commit_hash != layout->list[i].commit_hash) { + layout->commit_hash = DHT_LAYOUT_HASH_INVALID; + } + layout->list[i].err = 0; out: @@ -409,6 +409,7 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j) { uint32_t start_swap = 0; uint32_t stop_swap = 0; + uint32_t commit_hash_swap = 0; xlator_t *xlator_swap = 0; int err_swap = 0; @@ -416,16 +417,19 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, 
int j) stop_swap = layout->list[i].stop; xlator_swap = layout->list[i].xlator; err_swap = layout->list[i].err; + commit_hash_swap = layout->list[i].commit_hash; layout->list[i].start = layout->list[j].start; layout->list[i].stop = layout->list[j].stop; layout->list[i].xlator = layout->list[j].xlator; layout->list[i].err = layout->list[j].err; + layout->list[i].commit_hash = layout->list[j].commit_hash; layout->list[j].start = start_swap; layout->list[j].stop = stop_swap; layout->list[j].xlator = xlator_swap; layout->list[j].err = err_swap; + layout->list[j].commit_hash = commit_hash_swap; } void @@ -728,9 +732,9 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int dict_ret = 0; int32_t disk_layout[4]; void *disk_layout_raw = NULL; - int32_t count = -1; uint32_t start_off = -1; uint32_t stop_off = -1; + uint32_t commit_hash = -1; dht_conf_t *conf = this->private; char gfid[GF_UUID_BUF_SIZE] = {0}; @@ -779,27 +783,21 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); - count = ntoh32 (disk_layout[0]); - if (count != 1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - DHT_MSG_INVALID_DISK_LAYOUT, - "Invalid disk layout: invalid count %d," - "path = %s, gfid = %s ", count, loc->path, gfid); - ret = -1; - goto out; - } - start_off = ntoh32 (disk_layout[2]); stop_off = ntoh32 (disk_layout[3]); + commit_hash = ntoh32 (disk_layout[0]); if ((layout->list[pos].start != start_off) - || (layout->list[pos].stop != stop_off)) { + || (layout->list[pos].stop != stop_off) + || (layout->list[pos].commit_hash != commit_hash)) { gf_log (this->name, GF_LOG_INFO, - "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; " - "disk layout - %"PRIu32" - %"PRIu32, + "subvol: %s; inode layout - %"PRIu32" - %"PRIu32 + " - %"PRIu32"; " + "disk layout - %"PRIu32" - %"PRIu32" - %"PRIu32, layout->list[pos].xlator->name, layout->list[pos].start, layout->list[pos].stop, - start_off, 
stop_off); + layout->list[pos].commit_hash, + start_off, stop_off, commit_hash); ret = 1; } else { ret = 0; @@ -839,3 +837,18 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode) out: return ret; } + +int +dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol) +{ + int i = 0, ret = -1; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + ret = i; + break; + } + } + + return ret; +} diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 60f7314efe0..fae856d969f 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -2336,6 +2336,46 @@ out: return ret; } +int +gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag, + loc_t *loc, dict_t *fix_layout) +{ + int ret; + + /* + * Now we're ready to update the directory commit hash for the volume + * root, so that hash miscompares and broadcast lookups can stop. + * However, we want to skip that if fix-layout is all we did. In + * that case, we want the miscompares etc. to continue until a real + * rebalance is complete. 
+ */ + if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX + || defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER + || defrag->cmd == GF_DEFRAG_CMD_START_TIER) { + return 0; + } + + ret = dict_set_uint32 (fix_layout, "new-commit-hash", + defrag->new_commit_hash); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set new-commit-hash"); + return -1; + } + + ret = syncop_setxattr (this, loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fix layout on %s failed", loc->path); + return -1; + } + + /* TBD: find more efficient solution than adding/deleting every time */ + dict_del(fix_layout, "new-commit-hash"); + + return 0; +} + int gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, dict_t *fix_layout, dict_t *migrate_data) @@ -2422,6 +2462,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, if (ret) { gf_log (this->name, GF_LOG_ERROR, "Child loc" " build failed"); + ret = -1; goto out; } @@ -2487,9 +2528,16 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, "Fix layout failed for %s", entry_loc.path); defrag->total_failures++; + ret = -1; goto out; } + if (gf_defrag_settle_hash (this, defrag, &entry_loc, + fix_layout) != 0) { + defrag->total_failures++; + ret = -1; + goto out; + } } gf_dirent_free (&entries); free_entries = _gf_false; @@ -2573,6 +2621,36 @@ gf_defrag_start_crawl (void *data) goto out; } + /* + * Unfortunately, we can't do special xattrs (like fix.layout) and + * real ones in the same call currently, and changing it seems + * riskier than just doing two calls. 
+ */ + + gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u", + __func__, conf->vol_commit_hash); + + ret = dict_set_uint32 (fix_layout, conf->commithash_xattr_name, + conf->vol_commit_hash); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set %s", conf->commithash_xattr_name); + defrag->total_failures++; + ret = -1; + goto out; + } + + ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed", + loc.path); + defrag->total_failures++; + ret = -1; + goto out; + } + + /* We now return to our regularly scheduled program. */ + ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2580,10 +2658,13 @@ gf_defrag_start_crawl (void *data) "Failed to start rebalance:" "Failed to set dictionary value: key = %s", GF_XATTR_FIX_LAYOUT_KEY); + defrag->total_failures++; ret = -1; goto out; } + defrag->new_commit_hash = conf->vol_commit_hash; + ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2599,19 +2680,18 @@ gf_defrag_start_crawl (void *data) (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) { migrate_data = dict_new (); if (!migrate_data) { + defrag->total_failures++; ret = -1; goto out; } - if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) - ret = dict_set_str (migrate_data, - GF_XATTR_FILE_MIGRATE_KEY, - "force"); - else - ret = dict_set_str (migrate_data, - GF_XATTR_FILE_MIGRATE_KEY, - "non-force"); - if (ret) + ret = dict_set_str (migrate_data, GF_XATTR_FILE_MIGRATE_KEY, + (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) + ? 
"force" : "non-force"); + if (ret) { + defrag->total_failures++; + ret = -1; goto out; + } /* Find local subvolumes */ ret = syncop_getxattr (this, &loc, &dict, @@ -2670,6 +2750,17 @@ gf_defrag_start_crawl (void *data) ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout, migrate_data); + if (ret) { + defrag->total_failures++; + ret = -1; + goto out; + } + + if (gf_defrag_settle_hash (this, defrag, &loc, fix_layout) != 0) { + defrag->total_failures++; + ret = -1; + goto out; + } if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) { methods = conf->methods; diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index cc093e1199f..c881a361804 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -23,11 +23,14 @@ #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \ layout->list[i].start = srt; \ layout->list[i].stop = srt + chunk - 1; \ + layout->list[i].commit_hash = layout->commit_hash; \ \ gf_msg_trace (this->name, 0, \ - "gave fix: %u - %u on %s for %s", \ + "gave fix: %u - %u, with commit-hash %u" \ + " on %s for %s", \ layout->list[i].start, \ layout->list[i].stop, \ + layout->list[i].commit_hash, \ layout->list[i].xlator->name, path); \ } while (0) @@ -448,6 +451,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem, dht_layout_t **ondisk) { gf_boolean_t fixit = _gf_true; + dht_local_t *local = NULL; int layout_span = 0; int decommissioned_bricks = 0; @@ -482,6 +486,10 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem, if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt) goto out; + /* If commit hashes are being updated, let it through */ + if ((*inmem)->commit_hash != (*ondisk)->commit_hash) + goto out; + layout_span = dht_layout_span (*ondisk); decommissioned_bricks @@ -497,6 +505,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem, fixit = _gf_false; out: + return fixit; } @@ -756,6 +765,7 @@ 
dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) dummy = dht_layout_new (this, 1); if (!dummy) goto out; + dummy->commit_hash = layout->commit_hash; for (i = 0; i < conf->subvolume_cnt; i++) { if (_gf_false == dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { @@ -1474,6 +1484,8 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, new_layout->list[i].xlator = layout->list[i].xlator; } + new_layout->commit_hash = layout->commit_hash; + if (priv->du_stats) { for (i = 0; i < priv->subvolume_cnt; ++i) { gf_log (this->name, GF_LOG_INFO, @@ -1653,6 +1665,11 @@ dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, overlaps = local->selfheal.overlaps_cnt; if (holes || overlaps) { + /* If the layout has anomalies which would change the hash + * ranges, then we need to reset the commit_hash for this + * directory, as the layout would change and things may not + * be in place as expected */ + layout->commit_hash = DHT_LAYOUT_HASH_INVALID; dht_selfheal_layout_new_directory (frame, loc, layout); ret = 0; } @@ -1934,3 +1951,300 @@ dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) DHT_STACK_DESTROY (sync_frame); return 0; } + +/* EXIT: dht_update_commit_hash_for_layout */ +int +dht_update_commit_hash_for_layout_done (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + + /* preserve oldest error */ + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, NULL); + + return 0; +} + +int +dht_update_commit_hash_for_layout_unlock (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int ret = 0; + + local = frame->local; + + ret = dht_unlock_inodelk (frame, local->lock.locks, + local->lock.lk_count, + dht_update_commit_hash_for_layout_done); + if (ret < 0) { + /* preserve oldest 
error, just ... */ + if (!local->op_ret) { + local->op_errno = errno; + local->op_ret = -1; + } + + gf_msg (this->name, GF_LOG_WARNING, errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Winding unlock failed: stale locks left on brick" + " %s", local->loc.path); + + dht_update_commit_hash_for_layout_done (frame, NULL, this, + 0, 0, NULL); + } + + return 0; +} + +int +dht_update_commit_hash_for_layout_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + local = frame->local; + + LOCK (&frame->lock); + /* store first failure, just because */ + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_update_commit_hash_for_layout_unlock (frame, this); + } + + return 0; +} + +int +dht_update_commit_hash_for_layout_resume (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0, j = 0; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int32_t *disk_layout = NULL; + dict_t **xattr = NULL; + + local = frame->local; + conf = frame->this->private; + count = conf->local_subvols_cnt; + layout = local->layout; + + if (op_ret < 0) { + goto err_done; + } + + /* We precreate the xattr list as we cannot change call count post the + * first wind as we may never continue from there. 
So we finish prep + * work before winding the setxattrs */ + xattr = GF_CALLOC (count, sizeof (*xattr), gf_common_mt_char); + if (!xattr) { + local->op_errno = errno; + + gf_msg (this->name, GF_LOG_WARNING, errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory commit hash update failed:" + " %s: Allocation failed", local->loc.path); + + goto err; + } + + for (i = 0; i < count; i++) { + /* find the layout index for the subvolume */ + ret = dht_layout_index_for_subvol (layout, + conf->local_subvols[i]); + if (ret < 0) { + local->op_errno = ENOENT; + + gf_msg (this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory commit hash update failed:" + " %s: (subvol %s) Failed to find disk layout", + local->loc.path, conf->local_subvols[i]->name); + + goto err; + } + j = ret; + + /* update the commit hash for the layout */ + layout->list[j].commit_hash = layout->commit_hash; + + /* extract the current layout */ + ret = dht_disk_layout_extract (this, layout, j, &disk_layout); + if (ret == -1) { + local->op_errno = errno; + + gf_msg (this->name, GF_LOG_WARNING, errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory commit hash update failed:" + " %s: (subvol %s) Failed to extract disk" + " layout", local->loc.path, + conf->local_subvols[i]->name); + + goto err; + } + + xattr[i] = get_new_dict (); + if (!xattr[i]) { + local->op_errno = errno; + + gf_msg (this->name, GF_LOG_WARNING, errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory commit hash update failed:" + " %s: Allocation failed", local->loc.path); + + goto err; + } + + ret = dict_set_bin (xattr[i], conf->xattr_name, + disk_layout, 4 * 4); + if (ret != 0) { + local->op_errno = ENOMEM; + + gf_msg (this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory self heal xattr failed:" + "%s: (subvol %s) Failed to set xattr" + " dictionary,", local->loc.path, + conf->local_subvols[i]->name); + + goto err; + } + disk_layout = NULL; + + gf_msg_trace (this->name, 0, + 
"setting commit hash %u on subvolume %s" + " for %s", layout->list[j].commit_hash, + conf->local_subvols[i]->name, local->loc.path); + } + + /* wind the setting of the commit hash across the local subvols */ + local->call_cnt = count; + local->op_ret = 0; + local->op_errno = 0; + for (i = 0; i < count; i++) { + dict_ref (xattr[i]); + + STACK_WIND (frame, dht_update_commit_hash_for_layout_cbk, + conf->local_subvols[i], + conf->local_subvols[i]->fops->setxattr, + &local->loc, xattr[i], 0, NULL); + + dict_unref (xattr[i]); + } + + return 0; +err: + if (xattr) { + for (i = 0; i < count; i++) { + if (xattr[i]) + dict_destroy (xattr[i]); + } + + GF_FREE (xattr); + } + + GF_FREE (disk_layout); + + local->op_ret = -1; + + dht_update_commit_hash_for_layout_unlock (frame, this); + + return 0; +err_done: + local->op_ret = -1; + + dht_update_commit_hash_for_layout_done (frame, NULL, this, 0, 0, NULL); + + return 0; +} + +/* ENTER: dht_update_commit_hash_for_layout (see EXIT above) + * This function is invoked from rebalance only. + * As a result, the check here is simple enough to see if defrag is present + * in the conf, as other data would be populated appropriately if so. + * If ever this was to be used in other code paths, checks would need to + * change. + * + * Functional details: + * - Lock the inodes on the subvols that we want the commit hash updated + * - Update each layout with the inode layout, modified to take in the new + * commit hash. + * - Unlock and return. 
+ */ +int +dht_update_commit_hash_for_layout (call_frame_t *frame) +{ + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0; + dht_lock_t **lk_array = NULL; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err); + + local = frame->local; + conf = frame->this->private; + + if (!conf->defrag) + goto err; + + count = conf->local_subvols_cnt; + lk_array = GF_CALLOC (count, sizeof (*lk_array), + gf_common_mt_char); + if (lk_array == NULL) + goto err; + + for (i = 0; i < count; i++) { + lk_array[i] = dht_lock_new (frame->this, + conf->local_subvols[i], + &local->loc, F_WRLCK, + DHT_LAYOUT_HEAL_DOMAIN); + if (lk_array[i] == NULL) + goto err; + } + + local->lock.locks = lk_array; + local->lock.lk_count = count; + + ret = dht_blocking_inodelk (frame, lk_array, count, + dht_update_commit_hash_for_layout_resume); + if (ret < 0) { + local->lock.locks = NULL; + local->lock.lk_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + int tmp_count = 0, i = 0; + + for (i = 0; (i < count) && (lk_array[i]); i++, tmp_count++) { + ; + } + + dht_lock_array_free (lk_array, tmp_count); + GF_FREE (lk_array); + } + + return -1; +} diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index ffabc820d70..a1f72a85112 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -569,6 +569,7 @@ dht_init (xlator_t *this) int cmd = 0; char *node_uuid = NULL; int throttle_count = 0; + uint32_t commit_hash = 0; GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -590,6 +591,15 @@ dht_init (xlator_t *this) goto err; } + /* We get the commit-hash to set only for rebalance process */ + if (dict_get_uint32 (this->options, + "commit-hash", &commit_hash) == 0) { + gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u", + __func__, commit_hash); + conf->vol_commit_hash = commit_hash; + conf->vch_forced = _gf_true; + } + ret = 
dict_get_int32 (this->options, "rebalance-cmd", &cmd); if (cmd) { @@ -760,6 +770,8 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err); gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR, conf->xattr_name); + gf_asprintf (&conf->commithash_xattr_name, "%s."DHT_COMMITHASH_STR, + conf->xattr_name); gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name); if (!conf->link_xattr_name || !conf->wild_xattr_name) { goto err; @@ -871,6 +883,9 @@ struct volume_options options[] = { { .key = {"rebalance-cmd"}, .type = GF_OPTION_TYPE_INT, }, + { .key = {"commit-hash"}, + .type = GF_OPTION_TYPE_INT, + }, { .key = {"node-uuid"}, .type = GF_OPTION_TYPE_STR, }, -- cgit