summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/dht
diff options
context:
space:
mode:
authorJeff Darcy <jdarcy@redhat.com>2014-05-07 19:31:30 +0000
committerVijay Bellur <vbellur@redhat.com>2015-05-09 21:55:09 -0700
commit243d61575c093c03b9beb014bf9d097646836e95 (patch)
treeafaccb59310013c4f7c6bb867231c4d8988a697c /xlators/cluster/dht
parent58ef6a233f43bc644be55d2b5510b12718a6835e (diff)
dht: make lookup-unhashed=auto do something actually useful
The key concept here is to determine whether a directory is "clean" by comparing its last-known-good topology to the current one for the volume. These are stored as "commit hashes" on the directory and the volume root respectively. The volume's commit hash changes whenever a brick is added or removed, and a fix-layout is done. A directory's commit hash changes only when a full rebalance (not just fix-layout) is done on it. If all bricks are present and have a directory commit hash that matches the volume commit hash, then we can assume that every file is in its "proper" place. Therefore, if we look for a file in that proper place and don't find it, we can assume it's not on any other subvolume and *safely* skip the global (broadcast to all) lookup. Change-Id: Id6ce4593ba1f7daffa74cfab591cb45960629ae3 BUG: 1220064 Reviewed-on-master: http://review.gluster.org/#/c/7702/ Signed-off-by: Jeff Darcy <jdarcy@redhat.com> Signed-off-by: Shyam <srangana@redhat.com> Reviewed-on: http://review.gluster.org/10729 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators/cluster/dht')
-rw-r--r--xlators/cluster/dht/src/dht-common.c87
-rw-r--r--xlators/cluster/dht/src/dht-common.h29
-rw-r--r--xlators/cluster/dht/src/dht-layout.c69
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c109
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c316
-rw-r--r--xlators/cluster/dht/src/dht-shared.c15
6 files changed, 577 insertions, 48 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 6c0afdbec90..37e07ad77da 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -210,6 +210,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
int ret = -1;
dht_layout_t *layout = NULL;
dht_conf_t *conf = NULL;
+ uint32_t vol_commit_hash = 0;
local = discover_frame->local;
layout = local->layout;
@@ -279,6 +280,15 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
dht_layout_set (this, local->inode, layout);
}
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (local->xattr,
+ conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
local->inode, &local->stbuf, local->xattr,
&local->postparent);
@@ -459,6 +469,12 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc)
"%s: Failed to set dictionary value:key = %s",
loc->path, conf->link_xattr_name);
+ if (__is_root_gfid(local->loc.gfid)) {
+ ret = dict_set_uint32 (local->xattr_req,
+ conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
+
call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
@@ -655,6 +671,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *copy = NULL;
dht_local_t *copy_local = NULL;
char gfid[GF_UUID_BUF_SIZE] = {0};
+ uint32_t vol_commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, err);
@@ -667,6 +684,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!conf)
goto out;
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
gf_uuid_unparse (local->loc.gfid, gfid);
LOCK (&frame->lock);
@@ -1852,6 +1877,7 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *prev = NULL;
int ret = 0;
dht_layout_t *parent_layout = NULL;
+ uint32_t vol_commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -1875,6 +1901,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"fresh_lookup returned for %s with op_ret %d and "
"op_errno %d", loc->path, op_ret, op_errno);
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
if (ENTRY_MISSING (op_ret, op_errno)) {
gf_msg_debug (this->name, 0,
"Entry %s missing on subvol %s",
@@ -1891,7 +1925,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
&parent_layout);
if (ret || !parent_layout)
goto out;
- if (parent_layout->search_unhashed) {
+ if (parent_layout->commit_hash
+ != conf->vol_commit_hash) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "hashes don't match, do global lookup");
local->op_errno = ENOENT;
dht_lookup_everywhere (frame, this, loc);
return 0;
@@ -2078,6 +2115,12 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
return 0;
}
+ if (__is_root_gfid(loc->gfid)) {
+ ret = dict_set_uint32 (local->xattr_req,
+ conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
+
if (!hashed_subvol)
hashed_subvol = dht_subvol_get_hashed (this, loc);
local->hashed_subvol = hashed_subvol;
@@ -3238,8 +3281,9 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this,
conf = this->private;
- GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
- op_errno, err);
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
+ op_errno, err);
local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR);
if (!local) {
@@ -3338,6 +3382,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
char value[4096] = {0,};
gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;
int call_cnt = 0;
+ uint32_t new_hash = 0;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -3350,8 +3395,10 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
methods = conf->methods;
GF_VALIDATE_OR_GOTO (this->name, conf->methods, err);
- GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
- op_errno, err);
+ /* Rebalance daemon is allowed to set internal keys */
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
+ op_errno, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR);
if (!local) {
@@ -3489,6 +3536,22 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_INFO,
"fixing the layout of %s", loc->path);
+ ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "updating commit hash for %s from %u to %u",
+ uuid_utoa(loc->gfid),
+ layout->commit_hash, new_hash);
+ layout->commit_hash = new_hash;
+
+ ret = dht_update_commit_hash_for_layout (frame);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
+
ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk,
layout);
if (ret) {
@@ -5377,6 +5440,8 @@ dht_mkdir (call_frame_t *frame, xlator_t *this,
goto err;
}
+ local->layout->commit_hash = conf->vol_commit_hash;
+
STACK_WIND (frame, dht_mkdir_hashed_cbk,
hashed_subvol,
hashed_subvol->fops->mkdir,
@@ -6570,10 +6635,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
ret = snprintf (string, max_string_len,
"[Subvol_name: %s, Err: %d , Start: "
- "%"PRIu32 " , Stop: %"PRIu32 " ], ",
+ "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %"
+ PRIu32 " ], ",
layout->list[i].xlator->name,
layout->list[i].err, layout->list[i].start,
- layout->list[i].stop);
+ layout->list[i].stop,
+ layout->list[i].commit_hash);
if (ret < 0)
return;
@@ -6602,10 +6669,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
ret = snprintf (output_string + off, len - off,
"[Subvol_name: %s, Err: %d , Start: "
- "%"PRIu32 " , Stop: %"PRIu32 " ], ",
+ "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %"
+ PRIu32 " ], ",
layout->list[i].xlator->name,
layout->list[i].err, layout->list[i].start,
- layout->list[i].stop);
+ layout->list[i].stop,
+ layout->list[i].commit_hash);
if (ret < 0)
goto err;
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 9a6ed1a889a..45b6cc9e80b 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -31,6 +31,7 @@
#define DHT_PATHINFO_HEADER "DISTRIBUTE:"
#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate"
#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal"
+#define DHT_LAYOUT_HASH_INVALID 1
#include <fnmatch.h>
@@ -48,6 +49,20 @@ struct dht_layout {
special key */
int cnt;
int preset;
+ /*
+ * The last *configuration* state for which this directory was known
+ * to be in balance. The corresponding vol_commit_hash changes
+ * whenever bricks are added or removed. This value changes when a
+ * (full) rebalance is complete. If they match, it's safe to assume
+ * that every file is where it should be and there's no need to do
+ * lookups for files elsewhere. If they don't, then we have to do a
+ * global lookup to be sure.
+ */
+ uint32_t commit_hash;
+ /*
+ * The *runtime* state of the volume, changes when connections to
+ * bricks are made or lost.
+ */
int gen;
int type;
int ref; /* use with dht_conf_t->layout_lock */
@@ -59,6 +74,7 @@ struct dht_layout {
*/
uint32_t start;
uint32_t stop;
+ uint32_t commit_hash;
xlator_t *xlator;
} list[];
};
@@ -325,6 +341,7 @@ struct gf_defrag_info_ {
uuid_t node_uuid;
struct timeval start_time;
gf_boolean_t stats;
+ uint32_t new_commit_hash;
gf_defrag_pattern_list_t *defrag_pattern;
int tier_promote_frequency;
int tier_demote_frequency;
@@ -422,6 +439,7 @@ struct dht_conf {
/* Support variable xattr names. */
char *xattr_name;
char *link_xattr_name;
+ char *commithash_xattr_name;
char *wild_xattr_name;
/* Support size-weighted rebalancing (heterogeneous bricks). */
@@ -436,6 +454,13 @@ struct dht_conf {
/*local subvol storage for rebalance*/
xlator_t **local_subvols;
int32_t local_subvols_cnt;
+
+ /*
+ * "Commit hash" for this volume topology. Changed whenever bricks
+ * are added or removed.
+ */
+ uint32_t vol_commit_hash;
+ gf_boolean_t vch_forced;
};
typedef struct dht_conf dht_conf_t;
@@ -576,7 +601,7 @@ int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr);
-int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
+int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
int pos, int32_t **disk_layout_p);
int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
int pos, void *disk_layout_raw, int disk_layout_len);
@@ -631,6 +656,7 @@ xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
+int dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol);
int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);;
void dht_layout_unref (xlator_t *this, dht_layout_t *layout);
dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout);
@@ -649,6 +675,7 @@ int dht_rename_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preparent, struct iatt *postparent,
dict_t *xdata);
+int dht_update_commit_hash_for_layout (call_frame_t *frame);
int dht_fix_directory_layout (call_frame_t *frame,
dht_selfheal_dir_cbk_t dir_cbk,
dht_layout_t *layout);
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 2ed15c5e43c..f88c786a55b 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -267,7 +267,7 @@ dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
goto out;
}
- disk_layout[0] = hton32 (1);
+ disk_layout[0] = hton32 (layout->list[pos].commit_hash);
disk_layout[1] = hton32 (layout->type);
disk_layout[2] = hton32 (layout->list[pos].start);
disk_layout[3] = hton32 (layout->list[pos].stop);
@@ -288,10 +288,10 @@ int
dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
int pos, void *disk_layout_raw, int disk_layout_len)
{
- int cnt = 0;
int type = 0;
int start_off = 0;
int stop_off = 0;
+ int commit_hash = 0;
int disk_layout[4];
if (!disk_layout_raw) {
@@ -305,14 +305,6 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
memcpy (disk_layout, disk_layout_raw, disk_layout_len);
- cnt = ntoh32 (disk_layout[0]);
- if (cnt != 1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: Invalid count %d", cnt);
- return -1;
- }
-
type = ntoh32 (disk_layout[1]);
switch (type) {
case DHT_HASH_TYPE_DM_USER:
@@ -330,21 +322,22 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
return -1;
}
+ commit_hash = ntoh32 (disk_layout[0]);
start_off = ntoh32 (disk_layout[2]);
stop_off = ntoh32 (disk_layout[3]);
+ layout->list[pos].commit_hash = commit_hash;
layout->list[pos].start = start_off;
layout->list[pos].stop = stop_off;
gf_msg_trace (this->name, 0,
- "merged to layout: %u - %u (type %d) from %s",
- start_off, stop_off, type,
+ "merged to layout: %u - %u (type %d, hash %d) from %s",
+ start_off, stop_off, commit_hash, type,
layout->list[pos].xlator->name);
return 0;
}
-
int
dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr)
@@ -397,6 +390,13 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
subvol->name);
goto out;
}
+
+ if (layout->commit_hash == 0) {
+ layout->commit_hash = layout->list[i].commit_hash;
+ } else if (layout->commit_hash != layout->list[i].commit_hash) {
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+ }
+
layout->list[i].err = 0;
out:
@@ -409,6 +409,7 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
{
uint32_t start_swap = 0;
uint32_t stop_swap = 0;
+ uint32_t commit_hash_swap = 0;
xlator_t *xlator_swap = 0;
int err_swap = 0;
@@ -416,16 +417,19 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
stop_swap = layout->list[i].stop;
xlator_swap = layout->list[i].xlator;
err_swap = layout->list[i].err;
+ commit_hash_swap = layout->list[i].commit_hash;
layout->list[i].start = layout->list[j].start;
layout->list[i].stop = layout->list[j].stop;
layout->list[i].xlator = layout->list[j].xlator;
layout->list[i].err = layout->list[j].err;
+ layout->list[i].commit_hash = layout->list[j].commit_hash;
layout->list[j].start = start_swap;
layout->list[j].stop = stop_swap;
layout->list[j].xlator = xlator_swap;
layout->list[j].err = err_swap;
+ layout->list[j].commit_hash = commit_hash_swap;
}
void
@@ -728,9 +732,9 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int dict_ret = 0;
int32_t disk_layout[4];
void *disk_layout_raw = NULL;
- int32_t count = -1;
uint32_t start_off = -1;
uint32_t stop_off = -1;
+ uint32_t commit_hash = -1;
dht_conf_t *conf = this->private;
char gfid[GF_UUID_BUF_SIZE] = {0};
@@ -779,27 +783,21 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout));
- count = ntoh32 (disk_layout[0]);
- if (count != 1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: invalid count %d,"
- "path = %s, gfid = %s ", count, loc->path, gfid);
- ret = -1;
- goto out;
- }
-
start_off = ntoh32 (disk_layout[2]);
stop_off = ntoh32 (disk_layout[3]);
+ commit_hash = ntoh32 (disk_layout[0]);
if ((layout->list[pos].start != start_off)
- || (layout->list[pos].stop != stop_off)) {
+ || (layout->list[pos].stop != stop_off)
+ || (layout->list[pos].commit_hash != commit_hash)) {
gf_log (this->name, GF_LOG_INFO,
- "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; "
- "disk layout - %"PRIu32" - %"PRIu32,
+ "subvol: %s; inode layout - %"PRIu32" - %"PRIu32
+ " - %"PRIu32"; "
+ "disk layout - %"PRIu32" - %"PRIu32" - %"PRIu32,
layout->list[pos].xlator->name,
layout->list[pos].start, layout->list[pos].stop,
- start_off, stop_off);
+ layout->list[pos].commit_hash,
+ start_off, stop_off, commit_hash);
ret = 1;
} else {
ret = 0;
@@ -839,3 +837,18 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode)
out:
return ret;
}
+
+int
+dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol)
+{
+ int i = 0, ret = -1;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol) {
+ ret = i;
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 60f7314efe0..fae856d969f 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -2337,6 +2337,46 @@ out:
}
int
+gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag,
+ loc_t *loc, dict_t *fix_layout)
+{
+ int ret;
+
+ /*
+ * Now we're ready to update the directory commit hash for the volume
+ * root, so that hash miscompares and broadcast lookups can stop.
+ * However, we want to skip that if fix-layout is all we did. In
+ * that case, we want the miscompares etc. to continue until a real
+ * rebalance is complete.
+ */
+ if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX
+ || defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER
+ || defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
+ return 0;
+ }
+
+ ret = dict_set_uint32 (fix_layout, "new-commit-hash",
+ defrag->new_commit_hash);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set new-commit-hash");
+ return -1;
+ }
+
+ ret = syncop_setxattr (this, loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "fix layout on %s failed", loc->path);
+ return -1;
+ }
+
+ /* TBD: find more efficient solution than adding/deleting every time */
+ dict_del(fix_layout, "new-commit-hash");
+
+ return 0;
+}
+
+int
gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *fix_layout, dict_t *migrate_data)
{
@@ -2422,6 +2462,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Child loc"
" build failed");
+ ret = -1;
goto out;
}
@@ -2487,9 +2528,16 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
"Fix layout failed for %s",
entry_loc.path);
defrag->total_failures++;
+ ret = -1;
goto out;
}
+ if (gf_defrag_settle_hash (this, defrag, &entry_loc,
+ fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
}
gf_dirent_free (&entries);
free_entries = _gf_false;
@@ -2573,6 +2621,36 @@ gf_defrag_start_crawl (void *data)
goto out;
}
+ /*
+ * Unfortunately, we can't do special xattrs (like fix.layout) and
+ * real ones in the same call currently, and changing it seems
+ * riskier than just doing two calls.
+ */
+
+ gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u",
+ __func__, conf->vol_commit_hash);
+
+ ret = dict_set_uint32 (fix_layout, conf->commithash_xattr_name,
+ conf->vol_commit_hash);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set %s", conf->commithash_xattr_name);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed",
+ loc.path);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ /* We now return to our regularly scheduled program. */
+
ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -2580,10 +2658,13 @@ gf_defrag_start_crawl (void *data)
"Failed to start rebalance:"
"Failed to set dictionary value: key = %s",
GF_XATTR_FIX_LAYOUT_KEY);
+ defrag->total_failures++;
ret = -1;
goto out;
}
+ defrag->new_commit_hash = conf->vol_commit_hash;
+
ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -2599,19 +2680,18 @@ gf_defrag_start_crawl (void *data)
(defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
migrate_data = dict_new ();
if (!migrate_data) {
+ defrag->total_failures++;
ret = -1;
goto out;
}
- if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)
- ret = dict_set_str (migrate_data,
- GF_XATTR_FILE_MIGRATE_KEY,
- "force");
- else
- ret = dict_set_str (migrate_data,
- GF_XATTR_FILE_MIGRATE_KEY,
- "non-force");
- if (ret)
+ ret = dict_set_str (migrate_data, GF_XATTR_FILE_MIGRATE_KEY,
+ (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)
+ ? "force" : "non-force");
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
goto out;
+ }
/* Find local subvolumes */
ret = syncop_getxattr (this, &loc, &dict,
@@ -2670,6 +2750,17 @@ gf_defrag_start_crawl (void *data)
ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout,
migrate_data);
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ if (gf_defrag_settle_hash (this, defrag, &loc, fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
methods = conf->methods;
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index cc093e1199f..c881a361804 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -23,11 +23,14 @@
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \
layout->list[i].start = srt; \
layout->list[i].stop = srt + chunk - 1; \
+ layout->list[i].commit_hash = layout->commit_hash; \
\
gf_msg_trace (this->name, 0, \
- "gave fix: %u - %u on %s for %s", \
+ "gave fix: %u - %u, with commit-hash %u" \
+ " on %s for %s", \
layout->list[i].start, \
layout->list[i].stop, \
+ layout->list[i].commit_hash, \
layout->list[i].xlator->name, path); \
} while (0)
@@ -448,6 +451,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
dht_layout_t **ondisk)
{
gf_boolean_t fixit = _gf_true;
+
dht_local_t *local = NULL;
int layout_span = 0;
int decommissioned_bricks = 0;
@@ -482,6 +486,10 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt)
goto out;
+ /* If commit hashes are being updated, let it through */
+ if ((*inmem)->commit_hash != (*ondisk)->commit_hash)
+ goto out;
+
layout_span = dht_layout_span (*ondisk);
decommissioned_bricks
@@ -497,6 +505,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
fixit = _gf_false;
out:
+
return fixit;
}
@@ -756,6 +765,7 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
dummy = dht_layout_new (this, 1);
if (!dummy)
goto out;
+ dummy->commit_hash = layout->commit_hash;
for (i = 0; i < conf->subvolume_cnt; i++) {
if (_gf_false ==
dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
@@ -1474,6 +1484,8 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
new_layout->list[i].xlator = layout->list[i].xlator;
}
+ new_layout->commit_hash = layout->commit_hash;
+
if (priv->du_stats) {
for (i = 0; i < priv->subvolume_cnt; ++i) {
gf_log (this->name, GF_LOG_INFO,
@@ -1653,6 +1665,11 @@ dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
overlaps = local->selfheal.overlaps_cnt;
if (holes || overlaps) {
+ /* If the layout has anomolies which would change the hash
+ * ranges, then we need to reset the commit_hash for this
+ * directory, as the layout would change and things may not
+ * be in place as expected */
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
dht_selfheal_layout_new_directory (frame, loc, layout);
ret = 0;
}
@@ -1934,3 +1951,300 @@ dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data)
DHT_STACK_DESTROY (sync_frame);
return 0;
}
+
+/* EXIT: dht_update_commit_hash_for_layout */
+int
+dht_update_commit_hash_for_layout_done (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ /* preserve oldest error */
+ if (op_ret && !local->op_ret) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+
+ DHT_STACK_UNWIND (setxattr, frame, local->op_ret,
+ local->op_errno, NULL);
+
+ return 0;
+}
+
+int
+dht_update_commit_hash_for_layout_unlock (call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+
+ ret = dht_unlock_inodelk (frame, local->lock.locks,
+ local->lock.lk_count,
+ dht_update_commit_hash_for_layout_done);
+ if (ret < 0) {
+ /* preserve oldest error, just ... */
+ if (!local->op_ret) {
+ local->op_errno = errno;
+ local->op_ret = -1;
+ }
+
+ gf_msg (this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ "Winding unlock failed: stale locks left on brick"
+ " %s", local->loc.path);
+
+ dht_update_commit_hash_for_layout_done (frame, NULL, this,
+ 0, 0, NULL);
+ }
+
+ return 0;
+}
+
+int
+dht_update_commit_hash_for_layout_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ /* store first failure, just because */
+ if (op_ret && !local->op_ret) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+ UNLOCK (&frame->lock);
+
+ this_call_cnt = dht_frame_return (frame);
+
+ if (is_last_call (this_call_cnt)) {
+ dht_update_commit_hash_for_layout_unlock (frame, this);
+ }
+
+ return 0;
+}
+
+int
+dht_update_commit_hash_for_layout_resume (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1, i = 0, j = 0;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int32_t *disk_layout = NULL;
+ dict_t **xattr = NULL;
+
+ local = frame->local;
+ conf = frame->this->private;
+ count = conf->local_subvols_cnt;
+ layout = local->layout;
+
+ if (op_ret < 0) {
+ goto err_done;
+ }
+
+ /* We precreate the xattr list as we cannot change call count post the
+ * first wind as we may never continue from there. So we finish prep
+ * work before winding the setxattrs */
+ xattr = GF_CALLOC (count, sizeof (*xattr), gf_common_mt_char);
+ if (!xattr) {
+ local->op_errno = errno;
+
+ gf_msg (this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ "Directory commit hash update failed:"
+ " %s: Allocation failed", local->loc.path);
+
+ goto err;
+ }
+
+ for (i = 0; i < count; i++) {
+ /* find the layout index for the subvolume */
+ ret = dht_layout_index_for_subvol (layout,
+ conf->local_subvols[i]);
+ if (ret < 0) {
+ local->op_errno = ENOENT;
+
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ "Directory commit hash update failed:"
+ " %s: (subvol %s) Failed to find disk layout",
+ local->loc.path, conf->local_subvols[i]->name);
+
+ goto err;
+ }
+ j = ret;
+
+ /* update the commit hash for the layout */
+ layout->list[j].commit_hash = layout->commit_hash;
+
+ /* extract the current layout */
+ ret = dht_disk_layout_extract (this, layout, j, &disk_layout);
+ if (ret == -1) {
+ local->op_errno = errno;
+
+ gf_msg (this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ "Directory commit hash update failed:"
+ " %s: (subvol %s) Failed to extract disk"
+ " layout", local->loc.path,
+ conf->local_subvols[i]->name);
+
+ goto err;
+ }
+
+ xattr[i] = get_new_dict ();
+ if (!xattr[i]) {
+ local->op_errno = errno;
+
+ gf_msg (this->name, GF_LOG_WARNING, errno,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ "Directory commit hash update failed:"
+ " %s: Allocation failed", local->loc.path);
+
+ goto err;
+ }
+
+ ret = dict_set_bin (xattr[i], conf->xattr_name,
+ disk_layout, 4 * 4);
+ if (ret != 0) {
+ local->op_errno = ENOMEM;
+
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ "Directory self heal xattr failed:"
+ "%s: (subvol %s) Failed to set xattr"
+ " dictionary,", local->loc.path,
+ conf->local_subvols[i]->name);
+
+ goto err;
+ }
+ disk_layout = NULL;
+
+ gf_msg_trace (this->name, 0,
+ "setting commit hash %u on subvolume %s"
+ " for %s", layout->list[j].commit_hash,
+ conf->local_subvols[i]->name, local->loc.path);
+ }
+
+ /* wind the setting of the commit hash across the local subvols */
+ local->call_cnt = count;
+ local->op_ret = 0;
+ local->op_errno = 0;
+ for (i = 0; i < count; i++) {
+ dict_ref (xattr[i]);
+
+ STACK_WIND (frame, dht_update_commit_hash_for_layout_cbk,
+ conf->local_subvols[i],
+ conf->local_subvols[i]->fops->setxattr,
+ &local->loc, xattr[i], 0, NULL);
+
+ dict_unref (xattr[i]);
+ }
+
+ return 0;
+err:
+ if (xattr) {
+ for (i = 0; i < count; i++) {
+ if (xattr[i])
+ dict_destroy (xattr[i]);
+ }
+
+ GF_FREE (xattr);
+ }
+
+ GF_FREE (disk_layout);
+
+ local->op_ret = -1;
+
+ dht_update_commit_hash_for_layout_unlock (frame, this);
+
+ return 0;
+err_done:
+ local->op_ret = -1;
+
+ dht_update_commit_hash_for_layout_done (frame, NULL, this, 0, 0, NULL);
+
+ return 0;
+}
+
+/* ENTER: dht_update_commit_hash_for_layout (see EXIT above)
+ * This function is invoked from rebalance only.
+ * As a result, the check here is simple enough to see if defrag is present
+ * in the conf, as other data would be populated appropriately if so.
+ * If ever this was to be used in other code paths, checks would need to
+ * change.
+ *
+ * Functional details:
+ * - Lock the inodes on the subvols that we want the commit hash updated
+ * - Update each layout with the inode layout, modified to take in the new
+ * commit hash.
+ * - Unlock and return.
+ */
+int
+dht_update_commit_hash_for_layout (call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1, i = 0;
+ dht_lock_t **lk_array = NULL;
+ dht_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err);
+
+ local = frame->local;
+ conf = frame->this->private;
+
+ if (!conf->defrag)
+ goto err;
+
+ count = conf->local_subvols_cnt;
+ lk_array = GF_CALLOC (count, sizeof (*lk_array),
+ gf_common_mt_char);
+ if (lk_array == NULL)
+ goto err;
+
+ for (i = 0; i < count; i++) {
+ lk_array[i] = dht_lock_new (frame->this,
+ conf->local_subvols[i],
+ &local->loc, F_WRLCK,
+ DHT_LAYOUT_HEAL_DOMAIN);
+ if (lk_array[i] == NULL)
+ goto err;
+ }
+
+ local->lock.locks = lk_array;
+ local->lock.lk_count = count;
+
+ ret = dht_blocking_inodelk (frame, lk_array, count,
+ dht_update_commit_hash_for_layout_resume);
+ if (ret < 0) {
+ local->lock.locks = NULL;
+ local->lock.lk_count = 0;
+ goto err;
+ }
+
+ return 0;
+err:
+ if (lk_array != NULL) {
+ int tmp_count = 0, i = 0;
+
+ for (i = 0; (i < count) && (lk_array[i]); i++, tmp_count++) {
+ ;
+ }
+
+ dht_lock_array_free (lk_array, tmp_count);
+ GF_FREE (lk_array);
+ }
+
+ return -1;
+}
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index ffabc820d70..a1f72a85112 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -569,6 +569,7 @@ dht_init (xlator_t *this)
int cmd = 0;
char *node_uuid = NULL;
int throttle_count = 0;
+ uint32_t commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", this, err);
@@ -590,6 +591,15 @@ dht_init (xlator_t *this)
goto err;
}
+ /* We get the commit-hash to set only for rebalance process */
+ if (dict_get_uint32 (this->options,
+ "commit-hash", &commit_hash) == 0) {
+ gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u",
+ __func__, commit_hash);
+ conf->vol_commit_hash = commit_hash;
+ conf->vch_forced = _gf_true;
+ }
+
ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
if (cmd) {
@@ -760,6 +770,8 @@ dht_init (xlator_t *this)
GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);
gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR,
conf->xattr_name);
+ gf_asprintf (&conf->commithash_xattr_name, "%s."DHT_COMMITHASH_STR,
+ conf->xattr_name);
gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);
if (!conf->link_xattr_name || !conf->wild_xattr_name) {
goto err;
@@ -871,6 +883,9 @@ struct volume_options options[] = {
{ .key = {"rebalance-cmd"},
.type = GF_OPTION_TYPE_INT,
},
+ { .key = {"commit-hash"},
+ .type = GF_OPTION_TYPE_INT,
+ },
{ .key = {"node-uuid"},
.type = GF_OPTION_TYPE_STR,
},