diff options
author | Jeff Darcy <jdarcy@redhat.com> | 2014-05-07 19:31:30 +0000 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2015-05-10 06:17:22 -0700 |
commit | 4eaaf5188fe24a4707dc2cf2934525083cf8e64f (patch) | |
tree | 119e440d7ba0bbd85a929294915ef54085b74ffb /xlators | |
parent | 4b7914384e2613e5ec7c618071cb89187ed6f870 (diff) |
dht: make lookup-unhashed=auto do something actually useful
The key concept here is to determine whether a directory is "clean" by
comparing its last-known-good topology to the current one for the
volume. These are stored as "commit hashes" on the directory and the
volume root respectively. The volume's commit hash changes whenever a
brick is added or removed, and a fix-layout is done. A directory's
commit hash changes only when a full rebalance (not just fix-layout)
is done on it. If all bricks are present and have a directory
commit hash that matches the volume commit hash, then we can assume
that every file is in its "proper" place. Therefore, if we look for
a file in that proper place and don't find it, we can assume it's not
on any other subvolume and *safely* skip the global (broadcast to all)
lookup.
Change-Id: Id6ce4593ba1f7daffa74cfab591cb45960629ae3
BUG: 1219637
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Signed-off-by: Shyam <srangana@redhat.com>
Reviewed-on: http://review.gluster.org/7702
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Tested-by: NetBSD Build System
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r-- | xlators/cluster/dht/src/dht-common.c | 87 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 29 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-layout.c | 78 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 109 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-selfheal.c | 316 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-shared.c | 15 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 5 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-op-sm.c | 105 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-rebalance.c | 7 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd.h | 1 |
10 files changed, 664 insertions, 88 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 54ad68246fb..8309b317017 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -210,6 +210,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) int ret = -1; dht_layout_t *layout = NULL; dht_conf_t *conf = NULL; + uint32_t vol_commit_hash = 0; local = discover_frame->local; layout = local->layout; @@ -279,6 +280,15 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) dht_layout_set (this, local->inode, layout); } + if (!conf->vch_forced) { + ret = dict_get_uint32 (local->xattr, + conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } + DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, &local->postparent); @@ -459,6 +469,12 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc) "%s: Failed to set dictionary value:key = %s", loc->path, conf->link_xattr_name); + if (__is_root_gfid(local->loc.gfid)) { + ret = dict_set_uint32 (local->xattr_req, + conf->commithash_xattr_name, + sizeof(uint32_t)); + } + call_cnt = conf->subvolume_cnt; local->call_cnt = call_cnt; @@ -655,6 +671,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *copy = NULL; dht_local_t *copy_local = NULL; char gfid[GF_UUID_BUF_SIZE] = {0}; + uint32_t vol_commit_hash = 0; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -667,6 +684,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!conf) goto out; + if (!conf->vch_forced) { + ret = dict_get_uint32 (xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } + gf_uuid_unparse (local->loc.gfid, gfid); LOCK (&frame->lock); @@ -1852,6 +1877,7 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *prev = NULL; int ret = 0; dht_layout_t *parent_layout = NULL; + uint32_t vol_commit_hash = 0; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -1875,6 +1901,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fresh_lookup returned for %s with op_ret %d and " "op_errno %d", loc->path, op_ret, op_errno); + if (!conf->vch_forced) { + ret = dict_get_uint32 (xattr, conf->commithash_xattr_name, + &vol_commit_hash); + if (ret == 0) { + conf->vol_commit_hash = vol_commit_hash; + } + } + if (ENTRY_MISSING (op_ret, op_errno)) { gf_msg_debug (this->name, 0, "Entry %s missing on subvol %s", @@ -1891,7 +1925,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, &parent_layout); if (ret || !parent_layout) goto out; - if (parent_layout->search_unhashed) { + if (parent_layout->commit_hash + != conf->vol_commit_hash) { + gf_log (this->name, GF_LOG_DEBUG, + "hashes don't match, do global lookup"); local->op_errno = ENOENT; dht_lookup_everywhere (frame, this, loc); return 0; @@ -2078,6 +2115,12 @@ dht_lookup (call_frame_t *frame, xlator_t *this, return 0; } + if (__is_root_gfid(loc->gfid)) { + ret = dict_set_uint32 (local->xattr_req, + conf->commithash_xattr_name, + sizeof(uint32_t)); + } + if (!hashed_subvol) hashed_subvol = dht_subvol_get_hashed (this, loc); local->hashed_subvol = hashed_subvol; @@ -3238,8 +3281,9 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this, conf = this->private; - GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, - op_errno, err); + if (!conf->defrag) + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR); if (!local) { @@ -3338,6 +3382,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, char value[4096] = {0,}; gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA; int call_cnt = 0; + uint32_t new_hash = 0; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -3350,8 +3395,10 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, methods = conf->methods; GF_VALIDATE_OR_GOTO (this->name, conf->methods, err); - GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, - op_errno, err); + /* Rebalance daemon is allowed to set internal keys */ + if (!conf->defrag) + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, + op_errno, err); local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR); if (!local) { @@ -3491,6 +3538,22 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_INFO, "fixing the layout of %s", loc->path); + ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash); + if (ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "updating commit hash for %s from %u to %u", + uuid_utoa(loc->gfid), + layout->commit_hash, new_hash); + layout->commit_hash = new_hash; + + ret = dht_update_commit_hash_for_layout (frame); + if (ret) { + op_errno = ENOTCONN; + goto err; + } + return ret; + } + ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk, layout); if (ret) { @@ -5379,6 +5442,8 @@ dht_mkdir (call_frame_t *frame, xlator_t *this, goto err; } + local->layout->commit_hash = conf->vol_commit_hash; + STACK_WIND (frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol->fops->mkdir, @@ -6573,10 +6638,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc, ret = snprintf (string, max_string_len, "[Subvol_name: %s, Err: %d , Start: " - "%"PRIu32 " , Stop: %"PRIu32 " ], ", + "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %" + PRIu32 " ], ", layout->list[i].xlator->name, layout->list[i].err, layout->list[i].start, - layout->list[i].stop); + layout->list[i].stop, + layout->list[i].commit_hash); if (ret < 0) return; @@ -6605,10 +6672,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc, ret = snprintf (output_string + off, len - off, "[Subvol_name: %s, Err: %d , Start: " - "%"PRIu32 " , Stop: %"PRIu32 " ], ", + "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %" + PRIu32 " ], ", layout->list[i].xlator->name, layout->list[i].err, layout->list[i].start, - layout->list[i].stop); + layout->list[i].stop, + layout->list[i].commit_hash); if (ret < 0) goto err; diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 3e2d5d725e9..5eb65bf0397 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -32,6 +32,7 @@ #define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate" #define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal" #define TIERING_MIGRATION_KEY "tiering.migration" +#define DHT_LAYOUT_HASH_INVALID 1 #include <fnmatch.h> @@ -49,6 +50,20 @@ struct dht_layout { special key */ int cnt; int preset; + /* + * The last *configuration* state for which this directory was known + * to be in balance. The corresponding vol_commit_hash changes + * whenever bricks are added or removed. This value changes when a + * (full) rebalance is complete. If they match, it's safe to assume + * that every file is where it should be and there's no need to do + * lookups for files elsewhere. If they don't, then we have to do a + * global lookup to be sure. + */ + uint32_t commit_hash; + /* + * The *runtime* state of the volume, changes when connections to + * bricks are made or lost. + */ int gen; int type; int ref; /* use with dht_conf_t->layout_lock */ @@ -60,6 +75,7 @@ struct dht_layout { */ uint32_t start; uint32_t stop; + uint32_t commit_hash; xlator_t *xlator; } list[]; }; @@ -326,6 +342,7 @@ struct gf_defrag_info_ { uuid_t node_uuid; struct timeval start_time; gf_boolean_t stats; + uint32_t new_commit_hash; gf_defrag_pattern_list_t *defrag_pattern; int tier_promote_frequency; int tier_demote_frequency; @@ -423,6 +440,7 @@ struct dht_conf { /* Support variable xattr names. */ char *xattr_name; char *link_xattr_name; + char *commithash_xattr_name; char *wild_xattr_name; /* Support size-weighted rebalancing (heterogeneous bricks). */ @@ -437,6 +455,13 @@ struct dht_conf { /*local subvol storage for rebalance*/ xlator_t **local_subvols; int32_t local_subvols_cnt; + + /* + * "Commit hash" for this volume topology. Changed whenever bricks + * are added or removed. + */ + uint32_t vol_commit_hash; + gf_boolean_t vch_forced; }; typedef struct dht_conf dht_conf_t; @@ -577,7 +602,7 @@ int dht_layouts_init (xlator_t *this, dht_conf_t *conf); int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int op_ret, int op_errno, dict_t *xattr); -int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, +int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, int pos, int32_t **disk_layout_p); int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, int pos, void *disk_layout_raw, int disk_layout_len); @@ -632,6 +657,7 @@ xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); +int dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol); int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);; void dht_layout_unref (xlator_t *this, dht_layout_t *layout); dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout); @@ -650,6 +676,7 @@ int dht_rename_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata); +int dht_update_commit_hash_for_layout (call_frame_t *frame); int dht_fix_directory_layout (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, dht_layout_t *layout); diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 6ef28472307..da8f13fc428 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -267,7 +267,7 @@ dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout, goto out; } - disk_layout[0] = hton32 (1); + disk_layout[0] = hton32 (layout->list[pos].commit_hash); disk_layout[1] = hton32 (layout->type); disk_layout[2] = hton32 (layout->list[pos].start); disk_layout[3] = hton32 (layout->list[pos].stop); @@ -288,10 +288,10 @@ int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, int pos, void *disk_layout_raw, int disk_layout_len) { - int cnt = 0; int type = 0; int start_off = 0; int stop_off = 0; + int commit_hash = 0; int disk_layout[4]; if (!disk_layout_raw) { @@ -305,14 +305,6 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, memcpy (disk_layout, disk_layout_raw, disk_layout_len); - cnt = ntoh32 (disk_layout[0]); - if (cnt != 1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - DHT_MSG_INVALID_DISK_LAYOUT, - "Invalid disk layout: Invalid count %d", cnt); - return -1; - } - type = ntoh32 (disk_layout[1]); switch (type) { case DHT_HASH_TYPE_DM_USER: @@ -330,21 +322,22 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout, return -1; } + commit_hash = ntoh32 (disk_layout[0]); start_off = ntoh32 (disk_layout[2]); stop_off = ntoh32 (disk_layout[3]); + layout->list[pos].commit_hash = commit_hash; layout->list[pos].start = start_off; layout->list[pos].stop = stop_off; gf_msg_trace (this->name, 0, - "merged to layout: %u - %u (type %d) from %s", - start_off, stop_off, type, + "merged to layout: %u - %u (type %d, hash %d) from %s", + start_off, stop_off, commit_hash, type, layout->list[pos].xlator->name); return 0; } - int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int op_ret, int op_errno, dict_t *xattr) @@ -397,6 +390,13 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, subvol->name); goto out; } + + if (layout->commit_hash == 0) { + layout->commit_hash = layout->list[i].commit_hash; + } else if (layout->commit_hash != layout->list[i].commit_hash) { + layout->commit_hash = DHT_LAYOUT_HASH_INVALID; + } + layout->list[i].err = 0; out: @@ -409,6 +409,7 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j) { uint32_t start_swap = 0; uint32_t stop_swap = 0; + uint32_t commit_hash_swap = 0; xlator_t *xlator_swap = 0; int err_swap = 0; @@ -416,16 +417,19 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j) stop_swap = layout->list[i].stop; xlator_swap = layout->list[i].xlator; err_swap = layout->list[i].err; + commit_hash_swap = layout->list[i].commit_hash; layout->list[i].start = layout->list[j].start; layout->list[i].stop = layout->list[j].stop; layout->list[i].xlator = layout->list[j].xlator; layout->list[i].err = layout->list[j].err; + layout->list[i].commit_hash = layout->list[j].commit_hash; layout->list[j].start = start_swap; layout->list[j].stop = stop_swap; layout->list[j].xlator = xlator_swap; layout->list[j].err = err_swap; + layout->list[j].commit_hash = commit_hash_swap; } void @@ -728,9 +732,9 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int dict_ret = 0; int32_t disk_layout[4]; void *disk_layout_raw = NULL; - int32_t count = -1; uint32_t start_off = -1; uint32_t stop_off = -1; + uint32_t commit_hash = -1; dht_conf_t *conf = this->private; char gfid[GF_UUID_BUF_SIZE] = {0}; @@ -795,36 +799,21 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout)); - count = ntoh32 (disk_layout[0]); - if (count != 1) { - if (loc) { - gf_msg (this->name, GF_LOG_ERROR, 0, - DHT_MSG_INVALID_DISK_LAYOUT, - "Invalid disk layout: invalid count %d," - "path = %s, gfid = %s ", - count, loc->path, gfid); - } else { - gf_msg (this->name, GF_LOG_ERROR, 0, - DHT_MSG_INVALID_DISK_LAYOUT, - "Invalid disk layout: invalid count %d," - "path not found, gfid = %s ", - count, gfid); - } - ret = -1; - goto out; - } - start_off = ntoh32 (disk_layout[2]); stop_off = ntoh32 (disk_layout[3]); + commit_hash = ntoh32 (disk_layout[0]); if ((layout->list[pos].start != start_off) - || (layout->list[pos].stop != stop_off)) { + || (layout->list[pos].stop != stop_off) + || (layout->list[pos].commit_hash != commit_hash)) { gf_log (this->name, GF_LOG_INFO, - "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; " - "disk layout - %"PRIu32" - %"PRIu32, + "subvol: %s; inode layout - %"PRIu32" - %"PRIu32 + " - %"PRIu32"; " + "disk layout - %"PRIu32" - %"PRIu32" - %"PRIu32, layout->list[pos].xlator->name, layout->list[pos].start, layout->list[pos].stop, - start_off, stop_off); + layout->list[pos].commit_hash, + start_off, stop_off, commit_hash); ret = 1; } else { ret = 0; @@ -864,3 +853,18 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode) out: return ret; } + +int +dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol) +{ + int i = 0, ret = -1; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + ret = i; + break; + } + } + + return ret; +} diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 767b457ce77..fcb005ecc3d 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -2336,6 +2336,46 @@ out: } int +gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag, + loc_t *loc, dict_t *fix_layout) +{ + int ret; + + /* + * Now we're ready to update the directory commit hash for the volume + * root, so that hash miscompares and broadcast lookups can stop. + * However, we want to skip that if fix-layout is all we did. In + * that case, we want the miscompares etc. to continue until a real + * rebalance is complete. + */ + if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX + || defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER + || defrag->cmd == GF_DEFRAG_CMD_START_TIER) { + return 0; + } + + ret = dict_set_uint32 (fix_layout, "new-commit-hash", + defrag->new_commit_hash); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set new-commit-hash"); + return -1; + } + + ret = syncop_setxattr (this, loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fix layout on %s failed", loc->path); + return -1; + } + + /* TBD: find more efficient solution than adding/deleting every time */ + dict_del(fix_layout, "new-commit-hash"); + + return 0; +} + +int gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, dict_t *fix_layout, dict_t *migrate_data) { @@ -2421,6 +2461,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, if (ret) { gf_log (this->name, GF_LOG_ERROR, "Child loc" " build failed"); + ret = -1; goto out; } @@ -2486,9 +2527,16 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, "Fix layout failed for %s", entry_loc.path); defrag->total_failures++; + ret = -1; goto out; } + if (gf_defrag_settle_hash (this, defrag, &entry_loc, + fix_layout) != 0) { + defrag->total_failures++; + ret = -1; + goto out; + } } gf_dirent_free (&entries); free_entries = _gf_false; @@ -2572,6 +2620,36 @@ gf_defrag_start_crawl (void *data) goto out; } + /* + * Unfortunately, we can't do special xattrs (like fix.layout) and + * real ones in the same call currently, and changing it seems + * riskier than just doing two calls. + */ + + gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u", + __func__, conf->vol_commit_hash); + + ret = dict_set_uint32 (fix_layout, conf->commithash_xattr_name, + conf->vol_commit_hash); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set %s", conf->commithash_xattr_name); + defrag->total_failures++; + ret = -1; + goto out; + } + + ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed", + loc.path); + defrag->total_failures++; + ret = -1; + goto out; + } + + /* We now return to our regularly scheduled program. */ + ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes"); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2579,10 +2657,13 @@ gf_defrag_start_crawl (void *data) "Failed to start rebalance:" "Failed to set dictionary value: key = %s", GF_XATTR_FIX_LAYOUT_KEY); + defrag->total_failures++; ret = -1; goto out; } + defrag->new_commit_hash = conf->vol_commit_hash; + ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL); if (ret) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -2598,19 +2679,18 @@ gf_defrag_start_crawl (void *data) (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) { migrate_data = dict_new (); if (!migrate_data) { + defrag->total_failures++; ret = -1; goto out; } - if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) - ret = dict_set_str (migrate_data, - GF_XATTR_FILE_MIGRATE_KEY, - "force"); - else - ret = dict_set_str (migrate_data, - GF_XATTR_FILE_MIGRATE_KEY, - "non-force"); - if (ret) + ret = dict_set_str (migrate_data, GF_XATTR_FILE_MIGRATE_KEY, + (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) + ? "force" : "non-force"); + if (ret) { + defrag->total_failures++; + ret = -1; goto out; + } /* Find local subvolumes */ ret = syncop_getxattr (this, &loc, &dict, @@ -2669,6 +2749,17 @@ gf_defrag_start_crawl (void *data) ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout, migrate_data); + if (ret) { + defrag->total_failures++; + ret = -1; + goto out; + } + + if (gf_defrag_settle_hash (this, defrag, &loc, fix_layout) != 0) { + defrag->total_failures++; + ret = -1; + goto out; + } if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) { methods = conf->methods; diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index cc093e1199f..c881a361804 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -23,11 +23,14 @@ #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \ layout->list[i].start = srt; \ layout->list[i].stop = srt + chunk - 1; \ + layout->list[i].commit_hash = layout->commit_hash; \ \ gf_msg_trace (this->name, 0, \ - "gave fix: %u - %u on %s for %s", \ + "gave fix: %u - %u, with commit-hash %u" \ + " on %s for %s", \ layout->list[i].start, \ layout->list[i].stop, \ + layout->list[i].commit_hash, \ layout->list[i].xlator->name, path); \ } while (0) @@ -448,6 +451,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem, dht_layout_t **ondisk) { gf_boolean_t fixit = _gf_true; + dht_local_t *local = NULL; int layout_span = 0; int decommissioned_bricks = 0; @@ -482,6 +486,10 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem, if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt) goto out; + /* If commit hashes are being updated, let it through */ + if ((*inmem)->commit_hash != (*ondisk)->commit_hash) + goto out; + layout_span = dht_layout_span (*ondisk); decommissioned_bricks @@ -497,6 +505,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem, fixit = _gf_false; out: + return fixit; } @@ -756,6 +765,7 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) dummy = dht_layout_new (this, 1); if (!dummy) goto out; + dummy->commit_hash = layout->commit_hash; for (i = 0; i < conf->subvolume_cnt; i++) { if (_gf_false == dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { @@ -1474,6 +1484,8 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc, new_layout->list[i].xlator = layout->list[i].xlator; } + new_layout->commit_hash = layout->commit_hash; + if (priv->du_stats) { for (i = 0; i < priv->subvolume_cnt; ++i) { gf_log (this->name, GF_LOG_INFO, @@ -1653,6 +1665,11 @@ dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, overlaps = local->selfheal.overlaps_cnt; if (holes || overlaps) { + /* If the layout has anomolies which would change the hash + * ranges, then we need to reset the commit_hash for this + * directory, as the layout would change and things may not + * be in place as expected */ + layout->commit_hash = DHT_LAYOUT_HASH_INVALID; dht_selfheal_layout_new_directory (frame, loc, layout); ret = 0; } @@ -1934,3 +1951,300 @@ dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data) DHT_STACK_DESTROY (sync_frame); return 0; } + +/* EXIT: dht_update_commit_hash_for_layout */ +int +dht_update_commit_hash_for_layout_done (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + dht_local_t *local = NULL; + + local = frame->local; + + /* preserve oldest error */ + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + + DHT_STACK_UNWIND (setxattr, frame, local->op_ret, + local->op_errno, NULL); + + return 0; +} + +int +dht_update_commit_hash_for_layout_unlock (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + int ret = 0; + + local = frame->local; + + ret = dht_unlock_inodelk (frame, local->lock.locks, + local->lock.lk_count, + dht_update_commit_hash_for_layout_done); + if (ret < 0) { + /* preserve oldest error, just ... */ + if (!local->op_ret) { + local->op_errno = errno; + local->op_ret = -1; + } + + gf_msg (this->name, GF_LOG_WARNING, errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Winding unlock failed: stale locks left on brick" + " %s", local->loc.path); + + dht_update_commit_hash_for_layout_done (frame, NULL, this, + 0, 0, NULL); + } + + return 0; +} + +int +dht_update_commit_hash_for_layout_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + local = frame->local; + + LOCK (&frame->lock); + /* store first failure, just because */ + if (op_ret && !local->op_ret) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_update_commit_hash_for_layout_unlock (frame, this); + } + + return 0; +} + +int +dht_update_commit_hash_for_layout_resume (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0, j = 0; + dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + int32_t *disk_layout = NULL; + dict_t **xattr = NULL; + + local = frame->local; + conf = frame->this->private; + count = conf->local_subvols_cnt; + layout = local->layout; + + if (op_ret < 0) { + goto err_done; + } + + /* We precreate the xattr list as we cannot change call count post the + * first wind as we may never continue from there. So we finish prep + * work before winding the setxattrs */ + xattr = GF_CALLOC (count, sizeof (*xattr), gf_common_mt_char); + if (!xattr) { + local->op_errno = errno; + + gf_msg (this->name, GF_LOG_WARNING, errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory commit hash update failed:" + " %s: Allocation failed", local->loc.path); + + goto err; + } + + for (i = 0; i < count; i++) { + /* find the layout index for the subvolume */ + ret = dht_layout_index_for_subvol (layout, + conf->local_subvols[i]); + if (ret < 0) { + local->op_errno = ENOENT; + + gf_msg (this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory commit hash update failed:" + " %s: (subvol %s) Failed to find disk layout", + local->loc.path, conf->local_subvols[i]->name); + + goto err; + } + j = ret; + + /* update the commit hash for the layout */ + layout->list[j].commit_hash = layout->commit_hash; + + /* extract the current layout */ + ret = dht_disk_layout_extract (this, layout, j, &disk_layout); + if (ret == -1) { + local->op_errno = errno; + + gf_msg (this->name, GF_LOG_WARNING, errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory commit hash update failed:" + " %s: (subvol %s) Failed to extract disk" + " layout", local->loc.path, + conf->local_subvols[i]->name); + + goto err; + } + + xattr[i] = get_new_dict (); + if (!xattr[i]) { + local->op_errno = errno; + + gf_msg (this->name, GF_LOG_WARNING, errno, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory commit hash update failed:" + " %s: Allocation failed", local->loc.path); + + goto err; + } + + ret = dict_set_bin (xattr[i], conf->xattr_name, + disk_layout, 4 * 4); + if (ret != 0) { + local->op_errno = ENOMEM; + + gf_msg (this->name, GF_LOG_WARNING, 0, + DHT_MSG_DIR_SELFHEAL_XATTR_FAILED, + "Directory self heal xattr failed:" + "%s: (subvol %s) Failed to set xattr" + " dictionary,", local->loc.path, + conf->local_subvols[i]->name); + + goto err; + } + disk_layout = NULL; + + gf_msg_trace (this->name, 0, + "setting commit hash %u on subvolume %s" + " for %s", layout->list[j].commit_hash, + conf->local_subvols[i]->name, local->loc.path); + } + + /* wind the setting of the commit hash across the local subvols */ + local->call_cnt = count; + local->op_ret = 0; + local->op_errno = 0; + for (i = 0; i < count; i++) { + dict_ref (xattr[i]); + + STACK_WIND (frame, dht_update_commit_hash_for_layout_cbk, + conf->local_subvols[i], + conf->local_subvols[i]->fops->setxattr, + &local->loc, xattr[i], 0, NULL); + + dict_unref (xattr[i]); + } + + return 0; +err: + if (xattr) { + for (i = 0; i < count; i++) { + if (xattr[i]) + dict_destroy (xattr[i]); + } + + GF_FREE (xattr); + } + + GF_FREE (disk_layout); + + local->op_ret = -1; + + dht_update_commit_hash_for_layout_unlock (frame, this); + + return 0; +err_done: + local->op_ret = -1; + + dht_update_commit_hash_for_layout_done (frame, NULL, this, 0, 0, NULL); + + return 0; +} + +/* ENTER: dht_update_commit_hash_for_layout (see EXIT above) + * This function is invoked from rebalance only. + * As a result, the check here is simple enough to see if defrag is present + * in the conf, as other data would be populated appropriately if so. + * If ever this was to be used in other code paths, checks would need to + * change. + * + * Functional details: + * - Lock the inodes on the subvols that we want the commit hash updated + * - Update each layout with the inode layout, modified to take in the new + * commit hash. + * - Unlock and return. + */ +int +dht_update_commit_hash_for_layout (call_frame_t *frame) +{ + dht_local_t *local = NULL; + int count = 1, ret = -1, i = 0; + dht_lock_t **lk_array = NULL; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err); + + local = frame->local; + conf = frame->this->private; + + if (!conf->defrag) + goto err; + + count = conf->local_subvols_cnt; + lk_array = GF_CALLOC (count, sizeof (*lk_array), + gf_common_mt_char); + if (lk_array == NULL) + goto err; + + for (i = 0; i < count; i++) { + lk_array[i] = dht_lock_new (frame->this, + conf->local_subvols[i], + &local->loc, F_WRLCK, + DHT_LAYOUT_HEAL_DOMAIN); + if (lk_array[i] == NULL) + goto err; + } + + local->lock.locks = lk_array; + local->lock.lk_count = count; + + ret = dht_blocking_inodelk (frame, lk_array, count, + dht_update_commit_hash_for_layout_resume); + if (ret < 0) { + local->lock.locks = NULL; + local->lock.lk_count = 0; + goto err; + } + + return 0; +err: + if (lk_array != NULL) { + int tmp_count = 0, i = 0; + + for (i = 0; (i < count) && (lk_array[i]); i++, tmp_count++) { + ; + } + + dht_lock_array_free (lk_array, tmp_count); + GF_FREE (lk_array); + } + + return -1; +} diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 04e96127ec6..2fef13adbd1 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -569,6 +569,7 @@ dht_init (xlator_t *this) int cmd = 0; char *node_uuid = NULL; int throttle_count = 0; + uint32_t commit_hash = 0; GF_VALIDATE_OR_GOTO ("dht", this, err); @@ -590,6 +591,15 @@ dht_init (xlator_t *this) goto err; } + /* We get the commit-hash to set only for rebalance process */ + if (dict_get_uint32 (this->options, + "commit-hash", &commit_hash) == 0) { + gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u", + __func__, commit_hash); + conf->vol_commit_hash = commit_hash; + conf->vch_forced = _gf_true; + } + ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); if (cmd) { @@ -760,6 +770,8 @@ dht_init (xlator_t *this) GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err); gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR, conf->xattr_name); + gf_asprintf (&conf->commithash_xattr_name, "%s."DHT_COMMITHASH_STR, + conf->xattr_name); gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name); if (!conf->link_xattr_name || !conf->wild_xattr_name) { goto err; @@ -871,6 +883,9 @@ struct volume_options options[] = { { .key = {"rebalance-cmd"}, .type = GF_OPTION_TYPE_INT, }, + { .key = {"commit-hash"}, + .type = GF_OPTION_TYPE_INT, + }, { .key = {"node-uuid"}, .type = GF_OPTION_TYPE_STR, }, diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 75c7926e49e..f2e7d3a9d3e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -2022,6 +2022,8 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) dict_t *bricks_dict = NULL; char *brick_tmpstr = NULL; int start_remove = 0; + uint32_t commit_hash = 0; + this = THIS; GF_ASSERT (this); @@ -2287,6 +2289,9 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) break; } if (!force && need_rebalance) { + if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) { + volinfo->rebal.commit_hash = commit_hash; + } /* perform the rebalance operations */ ret = glusterd_handle_defrag_start (volinfo, err_str, sizeof (err_str), diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index d90b392546a..969548b7600 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -3404,6 +3404,36 @@ out: } int +gd_set_commit_hash (dict_t *dict) +{ + struct timeval tv; + uint32_t hash; + + /* + * We need a commit hash that won't conflict with others we might have + * set, or zero which is the implicit value if we never have. Using + * seconds<<3 like this ensures that we'll only get a collision if two + * consecutive rebalances are separated by exactly 2^29 seconds - about + * 17 years - and even then there's only a 1/8 chance of a collision in + * the low order bits. It's far more likely that this code will have + * changed completely by then. If not, call me in 2031. + * + * P.S. Time zone changes? Yeah, right. + */ + gettimeofday (&tv, NULL); + hash = tv.tv_sec << 3; + + /* + * Make sure at least one of those low-order bits is set. The extra + * shifting is because not all machines have sub-millisecond time + * resolution. + */ + hash |= 1 << ((tv.tv_usec >> 10) % 3); + + return dict_set_uint32 (dict, "commit-hash", hash); +} + +int glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) { int ret = -1; @@ -3415,6 +3445,7 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) uint32_t status_cmd = GF_CLI_STATUS_NONE; char *errstr = NULL; xlator_t *this = NULL; + gf_boolean_t do_common = _gf_false; GF_ASSERT (req); @@ -3503,12 +3534,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) } break; - case GD_OP_SYNC_VOLUME: - { - dict_copy (dict, req_dict); - break; - } - case GD_OP_REMOVE_BRICK: { dict_t *dict = ctx; @@ -3525,6 +3550,10 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) if (ret) goto out; + if (gd_set_commit_hash(dict) != 0) { + goto out; + } + dict_destroy (req_dict); req_dict = dict_ref (dict); } @@ -3544,8 +3573,10 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) dict_copy (dict, req_dict); break; } + do_common = _gf_true; } - /*fall-through*/ + break; + case GD_OP_DELETE_VOLUME: case GD_OP_START_VOLUME: case GD_OP_STOP_VOLUME: @@ -3555,7 +3586,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) case GD_OP_LOG_ROTATE: case GD_OP_QUOTA: case GD_OP_PROFILE_VOLUME: - case GD_OP_REBALANCE: case GD_OP_HEAL_VOLUME: case GD_OP_STATEDUMP_VOLUME: case GD_OP_CLEARLOCKS_VOLUME: @@ -3563,49 +3593,62 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx) case GD_OP_BARRIER: case GD_OP_BITROT: { - ret = dict_get_str (dict, "volname", &volname); - if (ret) { - gf_log (this->name, GF_LOG_CRITICAL, - "volname is not present in " - "operation ctx"); - goto out; - } - - if (strcasecmp (volname, "all")) { - ret = glusterd_dict_set_volid (dict, - volname, - op_errstr); - if (ret) - goto out; - } - dict_copy (dict, req_dict); + do_common = _gf_true; } break; - case GD_OP_COPY_FILE: + case GD_OP_REBALANCE: { - dict_copy (dict, req_dict); - break; + if (gd_set_commit_hash(dict) != 0) { + goto out; + } + do_common = _gf_true; } + break; + case GD_OP_SYNC_VOLUME: + case GD_OP_COPY_FILE: case GD_OP_SYS_EXEC: { dict_copy (dict, req_dict); - break; } + break; case GD_OP_GANESHA: { dict_copy (dict, req_dict); - break; } + break; default: break; } - *req = req_dict; - ret = 0; + /* + * This has been moved out of the switch so that multiple ops with + * other special needs can all "fall through" to it. + */ + if (do_common) { + ret = dict_get_str (dict, "volname", &volname); + if (ret) { + gf_log (this->name, GF_LOG_CRITICAL, + "volname is not present in " + "operation ctx"); + goto out; + } + + if (strcasecmp (volname, "all")) { + ret = glusterd_dict_set_volid (dict, + volname, + op_errstr); + if (ret) + goto out; + } + dict_copy (dict, req_dict); + } + + *req = req_dict; + ret = 0; out: return ret; diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index 1566f54695f..9111c07b8fc 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -284,6 +284,9 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr, runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd); runner_add_arg (&runner, "--xlator-option"); runner_argprintf (&runner, "*dht.node-uuid=%s", uuid_utoa(MY_UUID)); + runner_add_arg (&runner, "--xlator-option"); + runner_argprintf (&runner, "*dht.commit-hash=%u", + volinfo->rebal.commit_hash); runner_add_arg (&runner, "--socket-file"); runner_argprintf (&runner, "%s",sockfile); runner_add_arg (&runner, "--pid-file"); @@ -725,6 +728,7 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) char *task_id_str = NULL; dict_t *ctx = NULL; xlator_t *this = NULL; + uint32_t commit_hash; this = THIS; GF_ASSERT (this); @@ -813,6 +817,9 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict) glusterd_store_perform_node_state_store (volinfo); break; } + if (dict_get_uint32 (dict, "commit-hash", &commit_hash) == 0) { + volinfo->rebal.commit_hash = commit_hash; + } ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg), cmd, NULL, GD_OP_REBALANCE); break; diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 335513b4934..384b6f4186e 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -286,6 +286,7 @@ struct glusterd_rebalance_ { glusterd_op_t op; dict_t *dict; /* Dict to store misc information * like list of bricks being removed */ + uint32_t commit_hash; }; typedef struct glusterd_rebalance_ glusterd_rebalance_t; |