summary refs log tree commit diff stats
path: root/xlators/cluster/dht/src
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/dht/src')
-rw-r--r-- xlators/cluster/dht/src/dht-common.c 87
-rw-r--r-- xlators/cluster/dht/src/dht-common.h 29
-rw-r--r-- xlators/cluster/dht/src/dht-layout.c 78
-rw-r--r-- xlators/cluster/dht/src/dht-rebalance.c 109
-rw-r--r-- xlators/cluster/dht/src/dht-selfheal.c 316
-rw-r--r-- xlators/cluster/dht/src/dht-shared.c 15
6 files changed, 577 insertions, 57 deletions
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 54ad68246fb..8309b317017 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -210,6 +210,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
int ret = -1;
dht_layout_t *layout = NULL;
dht_conf_t *conf = NULL;
+ uint32_t vol_commit_hash = 0;
local = discover_frame->local;
layout = local->layout;
@@ -279,6 +280,15 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
dht_layout_set (this, local->inode, layout);
}
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (local->xattr,
+ conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
local->inode, &local->stbuf, local->xattr,
&local->postparent);
@@ -459,6 +469,12 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc)
"%s: Failed to set dictionary value:key = %s",
loc->path, conf->link_xattr_name);
+ if (__is_root_gfid(local->loc.gfid)) {
+ ret = dict_set_uint32 (local->xattr_req,
+ conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
+
call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
@@ -655,6 +671,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *copy = NULL;
dht_local_t *copy_local = NULL;
char gfid[GF_UUID_BUF_SIZE] = {0};
+ uint32_t vol_commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, err);
@@ -667,6 +684,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!conf)
goto out;
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
gf_uuid_unparse (local->loc.gfid, gfid);
LOCK (&frame->lock);
@@ -1852,6 +1877,7 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *prev = NULL;
int ret = 0;
dht_layout_t *parent_layout = NULL;
+ uint32_t vol_commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -1875,6 +1901,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"fresh_lookup returned for %s with op_ret %d and "
"op_errno %d", loc->path, op_ret, op_errno);
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
if (ENTRY_MISSING (op_ret, op_errno)) {
gf_msg_debug (this->name, 0,
"Entry %s missing on subvol %s",
@@ -1891,7 +1925,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
&parent_layout);
if (ret || !parent_layout)
goto out;
- if (parent_layout->search_unhashed) {
+ if (parent_layout->commit_hash
+ != conf->vol_commit_hash) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "hashes don't match, do global lookup");
local->op_errno = ENOENT;
dht_lookup_everywhere (frame, this, loc);
return 0;
@@ -2078,6 +2115,12 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
return 0;
}
+ if (__is_root_gfid(loc->gfid)) {
+ ret = dict_set_uint32 (local->xattr_req,
+ conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
+
if (!hashed_subvol)
hashed_subvol = dht_subvol_get_hashed (this, loc);
local->hashed_subvol = hashed_subvol;
@@ -3238,8 +3281,9 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this,
conf = this->private;
- GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
- op_errno, err);
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
+ op_errno, err);
local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR);
if (!local) {
@@ -3338,6 +3382,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
char value[4096] = {0,};
gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;
int call_cnt = 0;
+ uint32_t new_hash = 0;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -3350,8 +3395,10 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
methods = conf->methods;
GF_VALIDATE_OR_GOTO (this->name, conf->methods, err);
- GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
- op_errno, err);
+ /* Rebalance daemon is allowed to set internal keys */
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
+ op_errno, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR);
if (!local) {
@@ -3491,6 +3538,22 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_INFO,
"fixing the layout of %s", loc->path);
+ ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "updating commit hash for %s from %u to %u",
+ uuid_utoa(loc->gfid),
+ layout->commit_hash, new_hash);
+ layout->commit_hash = new_hash;
+
+ ret = dht_update_commit_hash_for_layout (frame);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
+
ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk,
layout);
if (ret) {
@@ -5379,6 +5442,8 @@ dht_mkdir (call_frame_t *frame, xlator_t *this,
goto err;
}
+ local->layout->commit_hash = conf->vol_commit_hash;
+
STACK_WIND (frame, dht_mkdir_hashed_cbk,
hashed_subvol,
hashed_subvol->fops->mkdir,
@@ -6573,10 +6638,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
ret = snprintf (string, max_string_len,
"[Subvol_name: %s, Err: %d , Start: "
- "%"PRIu32 " , Stop: %"PRIu32 " ], ",
+ "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %"
+ PRIu32 " ], ",
layout->list[i].xlator->name,
layout->list[i].err, layout->list[i].start,
- layout->list[i].stop);
+ layout->list[i].stop,
+ layout->list[i].commit_hash);
if (ret < 0)
return;
@@ -6605,10 +6672,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
ret = snprintf (output_string + off, len - off,
"[Subvol_name: %s, Err: %d , Start: "
- "%"PRIu32 " , Stop: %"PRIu32 " ], ",
+ "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %"
+ PRIu32 " ], ",
layout->list[i].xlator->name,
layout->list[i].err, layout->list[i].start,
- layout->list[i].stop);
+ layout->list[i].stop,
+ layout->list[i].commit_hash);
if (ret < 0)
goto err;
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 3e2d5d725e9..5eb65bf0397 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -32,6 +32,7 @@
#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate"
#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal"
#define TIERING_MIGRATION_KEY "tiering.migration"
+#define DHT_LAYOUT_HASH_INVALID 1
#include <fnmatch.h>
@@ -49,6 +50,20 @@ struct dht_layout {
special key */
int cnt;
int preset;
+ /*
+ * The last *configuration* state for which this directory was known
+ * to be in balance. The corresponding vol_commit_hash changes
+ * whenever bricks are added or removed. This value changes when a
+ * (full) rebalance is complete. If they match, it's safe to assume
+ * that every file is where it should be and there's no need to do
+ * lookups for files elsewhere. If they don't, then we have to do a
+ * global lookup to be sure.
+ */
+ uint32_t commit_hash;
+ /*
+ * The *runtime* state of the volume, changes when connections to
+ * bricks are made or lost.
+ */
int gen;
int type;
int ref; /* use with dht_conf_t->layout_lock */
@@ -60,6 +75,7 @@ struct dht_layout {
*/
uint32_t start;
uint32_t stop;
+ uint32_t commit_hash;
xlator_t *xlator;
} list[];
};
@@ -326,6 +342,7 @@ struct gf_defrag_info_ {
uuid_t node_uuid;
struct timeval start_time;
gf_boolean_t stats;
+ uint32_t new_commit_hash;
gf_defrag_pattern_list_t *defrag_pattern;
int tier_promote_frequency;
int tier_demote_frequency;
@@ -423,6 +440,7 @@ struct dht_conf {
/* Support variable xattr names. */
char *xattr_name;
char *link_xattr_name;
+ char *commithash_xattr_name;
char *wild_xattr_name;
/* Support size-weighted rebalancing (heterogeneous bricks). */
@@ -437,6 +455,13 @@ struct dht_conf {
/*local subvol storage for rebalance*/
xlator_t **local_subvols;
int32_t local_subvols_cnt;
+
+ /*
+ * "Commit hash" for this volume topology. Changed whenever bricks
+ * are added or removed.
+ */
+ uint32_t vol_commit_hash;
+ gf_boolean_t vch_forced;
};
typedef struct dht_conf dht_conf_t;
@@ -577,7 +602,7 @@ int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr);
-int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
+int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
int pos, int32_t **disk_layout_p);
int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
int pos, void *disk_layout_raw, int disk_layout_len);
@@ -632,6 +657,7 @@ xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
+int dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol);
int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);;
void dht_layout_unref (xlator_t *this, dht_layout_t *layout);
dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout);
@@ -650,6 +676,7 @@ int dht_rename_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preparent, struct iatt *postparent,
dict_t *xdata);
+int dht_update_commit_hash_for_layout (call_frame_t *frame);
int dht_fix_directory_layout (call_frame_t *frame,
dht_selfheal_dir_cbk_t dir_cbk,
dht_layout_t *layout);
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 6ef28472307..da8f13fc428 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -267,7 +267,7 @@ dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
goto out;
}
- disk_layout[0] = hton32 (1);
+ disk_layout[0] = hton32 (layout->list[pos].commit_hash);
disk_layout[1] = hton32 (layout->type);
disk_layout[2] = hton32 (layout->list[pos].start);
disk_layout[3] = hton32 (layout->list[pos].stop);
@@ -288,10 +288,10 @@ int
dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
int pos, void *disk_layout_raw, int disk_layout_len)
{
- int cnt = 0;
int type = 0;
int start_off = 0;
int stop_off = 0;
+ int commit_hash = 0;
int disk_layout[4];
if (!disk_layout_raw) {
@@ -305,14 +305,6 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
memcpy (disk_layout, disk_layout_raw, disk_layout_len);
- cnt = ntoh32 (disk_layout[0]);
- if (cnt != 1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: Invalid count %d", cnt);
- return -1;
- }
-
type = ntoh32 (disk_layout[1]);
switch (type) {
case DHT_HASH_TYPE_DM_USER:
@@ -330,21 +322,22 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
return -1;
}
+ commit_hash = ntoh32 (disk_layout[0]);
start_off = ntoh32 (disk_layout[2]);
stop_off = ntoh32 (disk_layout[3]);
+ layout->list[pos].commit_hash = commit_hash;
layout->list[pos].start = start_off;
layout->list[pos].stop = stop_off;
+ /* arguments must follow the format string: type first, then hash */
gf_msg_trace (this->name, 0,
- "merged to layout: %u - %u (type %d) from %s",
- start_off, stop_off, type,
+ "merged to layout: %u - %u (type %d, hash %d) from %s",
+ start_off, stop_off, type, commit_hash,
layout->list[pos].xlator->name);
return 0;
}
-
int
dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr)
@@ -397,6 +390,13 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
subvol->name);
goto out;
}
+
+ if (layout->commit_hash == 0) {
+ layout->commit_hash = layout->list[i].commit_hash;
+ } else if (layout->commit_hash != layout->list[i].commit_hash) {
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+ }
+
layout->list[i].err = 0;
out:
@@ -409,6 +409,7 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
{
uint32_t start_swap = 0;
uint32_t stop_swap = 0;
+ uint32_t commit_hash_swap = 0;
xlator_t *xlator_swap = 0;
int err_swap = 0;
@@ -416,16 +417,19 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
stop_swap = layout->list[i].stop;
xlator_swap = layout->list[i].xlator;
err_swap = layout->list[i].err;
+ commit_hash_swap = layout->list[i].commit_hash;
layout->list[i].start = layout->list[j].start;
layout->list[i].stop = layout->list[j].stop;
layout->list[i].xlator = layout->list[j].xlator;
layout->list[i].err = layout->list[j].err;
+ layout->list[i].commit_hash = layout->list[j].commit_hash;
layout->list[j].start = start_swap;
layout->list[j].stop = stop_swap;
layout->list[j].xlator = xlator_swap;
layout->list[j].err = err_swap;
+ layout->list[j].commit_hash = commit_hash_swap;
}
void
@@ -728,9 +732,9 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int dict_ret = 0;
int32_t disk_layout[4];
void *disk_layout_raw = NULL;
- int32_t count = -1;
uint32_t start_off = -1;
uint32_t stop_off = -1;
+ uint32_t commit_hash = -1;
dht_conf_t *conf = this->private;
char gfid[GF_UUID_BUF_SIZE] = {0};
@@ -795,36 +799,21 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout));
- count = ntoh32 (disk_layout[0]);
- if (count != 1) {
- if (loc) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: invalid count %d,"
- "path = %s, gfid = %s ",
- count, loc->path, gfid);
- } else {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: invalid count %d,"
- "path not found, gfid = %s ",
- count, gfid);
- }
- ret = -1;
- goto out;
- }
-
start_off = ntoh32 (disk_layout[2]);
stop_off = ntoh32 (disk_layout[3]);
+ commit_hash = ntoh32 (disk_layout[0]);
if ((layout->list[pos].start != start_off)
- || (layout->list[pos].stop != stop_off)) {
+ || (layout->list[pos].stop != stop_off)
+ || (layout->list[pos].commit_hash != commit_hash)) {
gf_log (this->name, GF_LOG_INFO,
- "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; "
- "disk layout - %"PRIu32" - %"PRIu32,
+ "subvol: %s; inode layout - %"PRIu32" - %"PRIu32
+ " - %"PRIu32"; "
+ "disk layout - %"PRIu32" - %"PRIu32" - %"PRIu32,
layout->list[pos].xlator->name,
layout->list[pos].start, layout->list[pos].stop,
- start_off, stop_off);
+ layout->list[pos].commit_hash,
+ start_off, stop_off, commit_hash);
ret = 1;
} else {
ret = 0;
@@ -864,3 +853,18 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode)
out:
return ret;
}
+
+/*
+ * Find the position of @subvol inside @layout->list.
+ *
+ * Returns the index of the first entry whose xlator pointer equals
+ * @subvol, or -1 when no entry in the first layout->cnt slots matches.
+ */
+int
+dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol)
+{
+        int     idx = 0;
+
+        while (idx < layout->cnt) {
+                if (layout->list[idx].xlator == subvol)
+                        return idx;
+                idx++;
+        }
+
+        /* subvol is not part of this layout */
+        return -1;
+}
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 767b457ce77..fcb005ecc3d 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -2336,6 +2336,46 @@ out:
}
int
+gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag,
+                       loc_t *loc, dict_t *fix_layout)
+{
+        int     ret = -1;
+
+        /*
+         * Now we're ready to update the directory commit hash for the volume
+         * root, so that hash miscompares and broadcast lookups can stop.
+         * However, we want to skip that if fix-layout is all we did.  In
+         * that case, we want the miscompares etc. to continue until a real
+         * rebalance is complete.
+         */
+        if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX
+            || defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER
+            || defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
+                return 0;
+        }
+
+        /* Temporarily piggy-back the new hash on the caller's dict. */
+        ret = dict_set_uint32 (fix_layout, "new-commit-hash",
+                               defrag->new_commit_hash);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set new-commit-hash");
+                return -1;
+        }
+
+        ret = syncop_setxattr (this, loc, fix_layout, 0, NULL, NULL);
+
+        /*
+         * The key is only meant to live for the duration of this one
+         * setxattr.  fix_layout is the caller's dict and is reused, so
+         * remove the key on the failure path as well as on success.
+         *
+         * TBD: find more efficient solution than adding/deleting every time
+         */
+        dict_del (fix_layout, "new-commit-hash");
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "fix layout on %s failed", loc->path);
+                return -1;
+        }
+
+        return 0;
+}
+
+int
gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *fix_layout, dict_t *migrate_data)
{
@@ -2421,6 +2461,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Child loc"
" build failed");
+ ret = -1;
goto out;
}
@@ -2486,9 +2527,16 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
"Fix layout failed for %s",
entry_loc.path);
defrag->total_failures++;
+ ret = -1;
goto out;
}
+ if (gf_defrag_settle_hash (this, defrag, &entry_loc,
+ fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
}
gf_dirent_free (&entries);
free_entries = _gf_false;
@@ -2572,6 +2620,36 @@ gf_defrag_start_crawl (void *data)
goto out;
}
+ /*
+ * Unfortunately, we can't do special xattrs (like fix.layout) and
+ * real ones in the same call currently, and changing it seems
+ * riskier than just doing two calls.
+ */
+
+ gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u",
+ __func__, conf->vol_commit_hash);
+
+ ret = dict_set_uint32 (fix_layout, conf->commithash_xattr_name,
+ conf->vol_commit_hash);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set %s", conf->commithash_xattr_name);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed",
+ loc.path);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ /* We now return to our regularly scheduled program. */
+
ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -2579,10 +2657,13 @@ gf_defrag_start_crawl (void *data)
"Failed to start rebalance:"
"Failed to set dictionary value: key = %s",
GF_XATTR_FIX_LAYOUT_KEY);
+ defrag->total_failures++;
ret = -1;
goto out;
}
+ defrag->new_commit_hash = conf->vol_commit_hash;
+
ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -2598,19 +2679,18 @@ gf_defrag_start_crawl (void *data)
(defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
migrate_data = dict_new ();
if (!migrate_data) {
+ defrag->total_failures++;
ret = -1;
goto out;
}
- if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)
- ret = dict_set_str (migrate_data,
- GF_XATTR_FILE_MIGRATE_KEY,
- "force");
- else
- ret = dict_set_str (migrate_data,
- GF_XATTR_FILE_MIGRATE_KEY,
- "non-force");
- if (ret)
+ ret = dict_set_str (migrate_data, GF_XATTR_FILE_MIGRATE_KEY,
+ (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)
+ ? "force" : "non-force");
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
goto out;
+ }
/* Find local subvolumes */
ret = syncop_getxattr (this, &loc, &dict,
@@ -2669,6 +2749,17 @@ gf_defrag_start_crawl (void *data)
ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout,
migrate_data);
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ if (gf_defrag_settle_hash (this, defrag, &loc, fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
methods = conf->methods;
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index cc093e1199f..c881a361804 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -23,11 +23,14 @@
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \
layout->list[i].start = srt; \
layout->list[i].stop = srt + chunk - 1; \
+ layout->list[i].commit_hash = layout->commit_hash; \
\
gf_msg_trace (this->name, 0, \
- "gave fix: %u - %u on %s for %s", \
+ "gave fix: %u - %u, with commit-hash %u" \
+ " on %s for %s", \
layout->list[i].start, \
layout->list[i].stop, \
+ layout->list[i].commit_hash, \
layout->list[i].xlator->name, path); \
} while (0)
@@ -448,6 +451,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
dht_layout_t **ondisk)
{
gf_boolean_t fixit = _gf_true;
+
dht_local_t *local = NULL;
int layout_span = 0;
int decommissioned_bricks = 0;
@@ -482,6 +486,10 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt)
goto out;
+ /* If commit hashes are being updated, let it through */
+ if ((*inmem)->commit_hash != (*ondisk)->commit_hash)
+ goto out;
+
layout_span = dht_layout_span (*ondisk);
decommissioned_bricks
@@ -497,6 +505,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
fixit = _gf_false;
out:
+
return fixit;
}
@@ -756,6 +765,7 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
dummy = dht_layout_new (this, 1);
if (!dummy)
goto out;
+ dummy->commit_hash = layout->commit_hash;
for (i = 0; i < conf->subvolume_cnt; i++) {
if (_gf_false ==
dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
@@ -1474,6 +1484,8 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
new_layout->list[i].xlator = layout->list[i].xlator;
}
+ new_layout->commit_hash = layout->commit_hash;
+
if (priv->du_stats) {
for (i = 0; i < priv->subvolume_cnt; ++i) {
gf_log (this->name, GF_LOG_INFO,
@@ -1653,6 +1665,11 @@ dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
overlaps = local->selfheal.overlaps_cnt;
if (holes || overlaps) {
+ /* If the layout has anomalies which would change the hash
+ * ranges, then we need to reset the commit_hash for this
+ * directory, as the layout would change and things may not
+ * be in place as expected */
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
dht_selfheal_layout_new_directory (frame, loc, layout);
ret = 0;
}
@@ -1934,3 +1951,300 @@ dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data)
DHT_STACK_DESTROY (sync_frame);
return 0;
}
+
+/* EXIT: dht_update_commit_hash_for_layout
+ *
+ * Last step of the commit-hash update: merge the incoming result into
+ * the status kept in frame->local and unwind the setxattr with it.
+ */
+int
+dht_update_commit_hash_for_layout_done (call_frame_t *frame, void *cookie,
+                                        xlator_t *this, int32_t op_ret,
+                                        int32_t op_errno, dict_t *xdata)
+{
+        dht_local_t     *local = frame->local;
+
+        /* an earlier recorded failure wins; only take this result if
+         * nothing has failed so far */
+        if (!local->op_ret && op_ret) {
+                local->op_errno = op_errno;
+                local->op_ret   = op_ret;
+        }
+
+        DHT_STACK_UNWIND (setxattr, frame, local->op_ret,
+                          local->op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * Release the inodelks recorded in local->lock and continue in
+ * dht_update_commit_hash_for_layout_done.  If the unlock cannot even
+ * be wound, log it and call the done handler directly.
+ * Always returns 0.
+ */
+int
+dht_update_commit_hash_for_layout_unlock (call_frame_t *frame, xlator_t *this)
+{
+        dht_local_t     *local = frame->local;
+        int              unlock_ret = 0;
+
+        unlock_ret = dht_unlock_inodelk (frame, local->lock.locks,
+                                         local->lock.lk_count,
+                                         dht_update_commit_hash_for_layout_done);
+        if (unlock_ret >= 0)
+                return 0;
+
+        /* keep any earlier failure; otherwise report this one */
+        if (local->op_ret == 0) {
+                local->op_errno = errno;
+                local->op_ret = -1;
+        }
+
+        gf_msg (this->name, GF_LOG_WARNING, errno,
+                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                "Winding unlock failed: stale locks left on brick"
+                " %s", local->loc.path);
+
+        dht_update_commit_hash_for_layout_done (frame, NULL, this,
+                                                0, 0, NULL);
+
+        return 0;
+}
+
+/*
+ * Callback for each setxattr wound by
+ * dht_update_commit_hash_for_layout_resume.  Records the first failure
+ * seen and, once the last outstanding call has returned, moves on to
+ * the unlock step.
+ */
+int
+dht_update_commit_hash_for_layout_cbk (call_frame_t *frame, void *cookie,
+                                       xlator_t *this, int op_ret,
+                                       int op_errno, dict_t *xdata)
+{
+        dht_local_t     *local = frame->local;
+        int              pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                /* only the first failure is remembered */
+                if (op_ret && !local->op_ret) {
+                        local->op_ret = op_ret;
+                        local->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        pending = dht_frame_return (frame);
+        if (!is_last_call (pending))
+                return 0;
+
+        dht_update_commit_hash_for_layout_unlock (frame, this);
+
+        return 0;
+}
+
+/*
+ * Continuation passed to dht_blocking_inodelk by
+ * dht_update_commit_hash_for_layout.
+ *
+ * For every subvolume in conf->local_subvols it stamps the layout
+ * entry with layout->commit_hash, serialises that entry into an
+ * on-disk layout and winds a setxattr carrying it under
+ * conf->xattr_name.  All dicts are built before the first wind.
+ *
+ * A negative op_ret finishes the operation via the done handler
+ * without an unlock; a failure while preparing the dicts goes through
+ * the unlock handler.  Always returns 0; the result reaches the caller
+ * through the setxattr unwind.
+ */
+int
+dht_update_commit_hash_for_layout_resume (call_frame_t *frame, void *cookie,
+                                          xlator_t *this, int32_t op_ret,
+                                          int32_t op_errno, dict_t *xdata)
+{
+        dht_local_t     *local = NULL;
+        int              count = 1, ret = -1, i = 0, j = 0;
+        dht_conf_t      *conf = NULL;
+        dht_layout_t    *layout = NULL;
+        int32_t         *disk_layout = NULL;
+        dict_t         **xattr = NULL;
+
+        local = frame->local;
+        conf = frame->this->private;
+        count = conf->local_subvols_cnt;
+        layout = local->layout;
+
+        if (op_ret < 0) {
+                goto err_done;
+        }
+
+        /* We precreate the xattr list as we cannot change call count post the
+         * first wind as we may never continue from there. So we finish prep
+         * work before winding the setxattrs */
+        xattr = GF_CALLOC (count, sizeof (*xattr), gf_common_mt_char);
+        if (!xattr) {
+                local->op_errno = errno;
+
+                gf_msg (this->name, GF_LOG_WARNING, errno,
+                        DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                        "Directory commit hash update failed:"
+                        " %s: Allocation failed", local->loc.path);
+
+                goto err;
+        }
+
+        for (i = 0; i < count; i++) {
+                /* find the layout index for the subvolume */
+                ret = dht_layout_index_for_subvol (layout,
+                                                   conf->local_subvols[i]);
+                if (ret < 0) {
+                        local->op_errno = ENOENT;
+
+                        gf_msg (this->name, GF_LOG_WARNING, 0,
+                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                                "Directory commit hash update failed:"
+                                " %s: (subvol %s) Failed to find disk layout",
+                                local->loc.path, conf->local_subvols[i]->name);
+
+                        goto err;
+                }
+                j = ret;
+
+                /* update the commit hash for the layout */
+                layout->list[j].commit_hash = layout->commit_hash;
+
+                /* extract the current layout */
+                ret = dht_disk_layout_extract (this, layout, j, &disk_layout);
+                if (ret == -1) {
+                        local->op_errno = errno;
+
+                        gf_msg (this->name, GF_LOG_WARNING, errno,
+                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                                "Directory commit hash update failed:"
+                                " %s: (subvol %s) Failed to extract disk"
+                                " layout", local->loc.path,
+                                conf->local_subvols[i]->name);
+
+                        goto err;
+                }
+
+                xattr[i] = get_new_dict ();
+                if (!xattr[i]) {
+                        local->op_errno = errno;
+
+                        gf_msg (this->name, GF_LOG_WARNING, errno,
+                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                                "Directory commit hash update failed:"
+                                " %s: Allocation failed", local->loc.path);
+
+                        goto err;
+                }
+
+                ret = dict_set_bin (xattr[i], conf->xattr_name,
+                                    disk_layout, 4 * 4);
+                if (ret != 0) {
+                        local->op_errno = ENOMEM;
+
+                        gf_msg (this->name, GF_LOG_WARNING, 0,
+                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                                "Directory self heal xattr failed:"
+                                "%s: (subvol %s) Failed to set xattr"
+                                " dictionary,", local->loc.path,
+                                conf->local_subvols[i]->name);
+
+                        goto err;
+                }
+                /* the buffer was handed to the dict; make sure the err
+                 * path below does not free it a second time */
+                disk_layout = NULL;
+
+                gf_msg_trace (this->name, 0,
+                              "setting commit hash %u on subvolume %s"
+                              " for %s", layout->list[j].commit_hash,
+                              conf->local_subvols[i]->name, local->loc.path);
+        }
+
+        /* wind the setting of the commit hash across the local subvols */
+        local->call_cnt = count;
+        local->op_ret = 0;
+        local->op_errno = 0;
+        for (i = 0; i < count; i++) {
+                dict_ref (xattr[i]);
+
+                STACK_WIND (frame, dht_update_commit_hash_for_layout_cbk,
+                            conf->local_subvols[i],
+                            conf->local_subvols[i]->fops->setxattr,
+                            &local->loc, xattr[i], 0, NULL);
+
+                dict_unref (xattr[i]);
+        }
+
+        /* Each dict reference was dropped in the loop above; the pointer
+         * array itself is no longer needed and was previously leaked on
+         * this path. */
+        GF_FREE (xattr);
+
+        return 0;
+err:
+        if (xattr) {
+                for (i = 0; i < count; i++) {
+                        if (xattr[i])
+                                dict_destroy (xattr[i]);
+                }
+
+                GF_FREE (xattr);
+        }
+
+        GF_FREE (disk_layout);
+
+        local->op_ret = -1;
+
+        dht_update_commit_hash_for_layout_unlock (frame, this);
+
+        return 0;
+err_done:
+        local->op_ret = -1;
+
+        dht_update_commit_hash_for_layout_done (frame, NULL, this, 0, 0, NULL);
+
+        return 0;
+}
+
+/* ENTER: dht_update_commit_hash_for_layout (see EXIT above)
+ * This function is invoked from rebalance only.
+ * As a result, the check here is simple enough to see if defrag is present
+ * in the conf, as other data would be populated appropriately if so.
+ * If ever this was to be used in other code paths, checks would need to
+ * change.
+ *
+ * Functional details:
+ *  - Take a write inodelk (DHT_LAYOUT_HEAL_DOMAIN) on local->loc on each
+ *    of conf->local_subvols.
+ *  - Once locked, dht_update_commit_hash_for_layout_resume writes the
+ *    layout with the new commit hash to each of those subvols.
+ *  - Unlock and unwind.
+ *
+ * Returns 0 when the lock request was wound, -1 otherwise (in which case
+ * any lock objects created here are released).
+ */
+int
+dht_update_commit_hash_for_layout (call_frame_t *frame)
+{
+        dht_local_t     *local = NULL;
+        dht_conf_t      *conf = NULL;
+        dht_lock_t     **lk_array = NULL;
+        int              count = 1, ret = -1, i = 0;
+        int              allocated = 0;
+
+        GF_VALIDATE_OR_GOTO ("dht", frame, err);
+        GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err);
+
+        local = frame->local;
+        conf = frame->this->private;
+
+        /* only meaningful inside the rebalance process */
+        if (!conf->defrag)
+                goto err;
+
+        count = conf->local_subvols_cnt;
+        lk_array = GF_CALLOC (count, sizeof (*lk_array),
+                              gf_common_mt_char);
+        if (lk_array == NULL)
+                goto err;
+
+        for (i = 0; i < count; i++) {
+                lk_array[i] = dht_lock_new (frame->this,
+                                            conf->local_subvols[i],
+                                            &local->loc, F_WRLCK,
+                                            DHT_LAYOUT_HEAL_DOMAIN);
+                if (lk_array[i] == NULL)
+                        goto err;
+        }
+
+        local->lock.locks = lk_array;
+        local->lock.lk_count = count;
+
+        ret = dht_blocking_inodelk (frame, lk_array, count,
+                                    dht_update_commit_hash_for_layout_resume);
+        if (ret < 0) {
+                /* detach the array from local before it is freed below */
+                local->lock.locks = NULL;
+                local->lock.lk_count = 0;
+                goto err;
+        }
+
+        return 0;
+err:
+        if (lk_array != NULL) {
+                /* count the leading entries that were actually created */
+                while ((allocated < count) && (lk_array[allocated]))
+                        allocated++;
+
+                dht_lock_array_free (lk_array, allocated);
+                GF_FREE (lk_array);
+        }
+
+        return -1;
+}
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 04e96127ec6..2fef13adbd1 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -569,6 +569,7 @@ dht_init (xlator_t *this)
int cmd = 0;
char *node_uuid = NULL;
int throttle_count = 0;
+ uint32_t commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", this, err);
@@ -590,6 +591,15 @@ dht_init (xlator_t *this)
goto err;
}
+ /* We get the commit-hash to set only for rebalance process */
+ if (dict_get_uint32 (this->options,
+ "commit-hash", &commit_hash) == 0) {
+ gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u",
+ __func__, commit_hash);
+ conf->vol_commit_hash = commit_hash;
+ conf->vch_forced = _gf_true;
+ }
+
ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
if (cmd) {
@@ -760,6 +770,8 @@ dht_init (xlator_t *this)
GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);
gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR,
conf->xattr_name);
+ gf_asprintf (&conf->commithash_xattr_name, "%s."DHT_COMMITHASH_STR,
+ conf->xattr_name);
gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);
if (!conf->link_xattr_name || !conf->wild_xattr_name) {
goto err;
@@ -871,6 +883,9 @@ struct volume_options options[] = {
{ .key = {"rebalance-cmd"},
.type = GF_OPTION_TYPE_INT,
},
+ { .key = {"commit-hash"},
+ .type = GF_OPTION_TYPE_INT,
+ },
{ .key = {"node-uuid"},
.type = GF_OPTION_TYPE_STR,
},