summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jeff Darcy <jdarcy@redhat.com> 2014-05-07 19:31:30 +0000
committer: Vijay Bellur <vbellur@redhat.com> 2015-05-09 21:55:09 -0700
commit: 243d61575c093c03b9beb014bf9d097646836e95 (patch)
tree: afaccb59310013c4f7c6bb867231c4d8988a697c
parent: 58ef6a233f43bc644be55d2b5510b12718a6835e (diff)
dht: make lookup-unhashed=auto do something actually useful
The key concept here is to determine whether a directory is "clean" by comparing its last-known-good topology to the current one for the volume. These are stored as "commit hashes" on the directory and the volume root respectively. The volume's commit hash changes whenever a brick is added or removed, and a fix-layout is done. A directory's commit hash changes only when a full rebalance (not just fix-layout) is done on it. If all bricks are present and have a directory commit hash that matches the volume commit hash, then we can assume that every file is in its "proper" place. Therefore, if we look for a file in that proper place and don't find it, we can assume it's not on any other subvolume and *safely* skip the global (broadcast to all) lookup.

Change-Id: Id6ce4593ba1f7daffa74cfab591cb45960629ae3
BUG: 1220064
Reviewed-on-master: http://review.gluster.org/#/c/7702/
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Signed-off-by: Shyam <srangana@redhat.com>
Reviewed-on: http://review.gluster.org/10729
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r-- libglusterfs/src/glusterfs.h | 1
-rwxr-xr-x tests/bugs/distribute/bug-907072.t | 18
-rwxr-xr-x tests/bugs/distribute/bug-921408.t | 4
-rwxr-xr-x tests/bugs/glusterd/bug-1070734.t | 7
-rwxr-xr-x tests/features/unhashed-auto.t | 99
-rw-r--r-- xlators/cluster/dht/src/dht-common.c | 87
-rw-r--r-- xlators/cluster/dht/src/dht-common.h | 29
-rw-r--r-- xlators/cluster/dht/src/dht-layout.c | 69
-rw-r--r-- xlators/cluster/dht/src/dht-rebalance.c | 109
-rw-r--r-- xlators/cluster/dht/src/dht-selfheal.c | 316
-rw-r--r-- xlators/cluster/dht/src/dht-shared.c | 15
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 5
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-op-sm.c | 105
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd-rebalance.c | 7
-rw-r--r-- xlators/mgmt/glusterd/src/glusterd.h | 1
15 files changed, 781 insertions, 91 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index b26580f1ec9..3843bb76ed9 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -227,6 +227,7 @@
(iabuf)->ia_type) & ~S_IFMT)\
== DHT_LINKFILE_MODE)
#define DHT_LINKFILE_STR "linkto"
+#define DHT_COMMITHASH_STR "commithash"
#define DHT_SKIP_NON_LINKTO_UNLINK "unlink-only-if-dht-linkto-file"
#define DHT_SKIP_OPEN_FD_UNLINK "dont-unlink-for-open-fd"
diff --git a/tests/bugs/distribute/bug-907072.t b/tests/bugs/distribute/bug-907072.t
index 1e8bd280f32..a4d98831380 100755
--- a/tests/bugs/distribute/bug-907072.t
+++ b/tests/bugs/distribute/bug-907072.t
@@ -17,10 +17,11 @@ TEST glusterfs -s $H0 --volfile-id $V0 $M0;
TEST mkdir $M0/test;
-OLD_LAYOUT0=`get_layout $B0/${V0}0/test`;
-OLD_LAYOUT1=`get_layout $B0/${V0}1/test`;
-OLD_LAYOUT2=`get_layout $B0/${V0}2/test`;
-OLD_LAYOUT3=`get_layout $B0/${V0}3/test`;
+# Extract the layout sans the commit hash
+OLD_LAYOUT0=`get_layout $B0/${V0}0/test | cut -c11-34`;
+OLD_LAYOUT1=`get_layout $B0/${V0}1/test | cut -c11-34`;
+OLD_LAYOUT2=`get_layout $B0/${V0}2/test | cut -c11-34`;
+OLD_LAYOUT3=`get_layout $B0/${V0}3/test | cut -c11-34`;
TEST killall glusterfsd;
@@ -36,10 +37,11 @@ EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
TEST glusterfs -s $H0 --volfile-id $V0 $M0;
TEST stat $M0/test;
-NEW_LAYOUT0=`get_layout $B0/${V0}0/test`;
-NEW_LAYOUT1=`get_layout $B0/${V0}1/test`;
-NEW_LAYOUT2=`get_layout $B0/${V0}2/test`;
-NEW_LAYOUT3=`get_layout $B0/${V0}3/test`;
+# Extract the layout sans the commit hash
+NEW_LAYOUT0=`get_layout $B0/${V0}0/test | cut -c11-34`;
+NEW_LAYOUT1=`get_layout $B0/${V0}1/test | cut -c11-34`;
+NEW_LAYOUT2=`get_layout $B0/${V0}2/test | cut -c11-34`;
+NEW_LAYOUT3=`get_layout $B0/${V0}3/test | cut -c11-34`;
EXPECT $OLD_LAYOUT0 echo $NEW_LAYOUT0;
EXPECT $OLD_LAYOUT1 echo $NEW_LAYOUT1;
diff --git a/tests/bugs/distribute/bug-921408.t b/tests/bugs/distribute/bug-921408.t
index b1887f8ae22..559114bb85a 100755
--- a/tests/bugs/distribute/bug-921408.t
+++ b/tests/bugs/distribute/bug-921408.t
@@ -37,7 +37,7 @@ addbr_rebal_till_layout_change()
then
break
fi
- NEW_LAYOUT=`get_layout $B0/${V0}0`
+ NEW_LAYOUT=`get_layout $B0/${V0}0 | cut -c11-34`
if [ $OLD_LAYOUT == $NEW_LAYOUT ]
then
i=`expr $i + 1`;
@@ -64,7 +64,7 @@ TEST touch $M0/test/test
fd=`fd_available`
TEST fd_open $fd "rw" $M0/test/test
-OLD_LAYOUT=`get_layout $B0/${V0}0`
+OLD_LAYOUT=`get_layout $B0/${V0}0 | cut -c11-34`
addbr_rebal_till_layout_change 1
diff --git a/tests/bugs/glusterd/bug-1070734.t b/tests/bugs/glusterd/bug-1070734.t
index b5a53c24cab..5db60e0cfe6 100755
--- a/tests/bugs/glusterd/bug-1070734.t
+++ b/tests/bugs/glusterd/bug-1070734.t
@@ -65,8 +65,11 @@ TEST [ -f ${OTHERBRICK}/DIR/file ]
#Check the DIR on HASHED should have got zeroed layout and the \
#OTHERBRICK should have got full layout
-EXPECT "0x00000001000000000000000000000000" dht_get_layout $HASHED/DIR ;
-EXPECT "0x000000010000000000000000ffffffff" dht_get_layout $OTHERBRICK/DIR;
+shorter_layout () {
+ dht_get_layout $1 | cut -c 19-
+}
+EXPECT "0000000000000000" shorter_layout $HASHED/DIR ;
+EXPECT "00000000ffffffff" shorter_layout $OTHERBRICK/DIR;
## Before killing daemon to avoid deadlocks
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" umount_nfs $N0
diff --git a/tests/features/unhashed-auto.t b/tests/features/unhashed-auto.t
new file mode 100755
index 00000000000..97663c20e10
--- /dev/null
+++ b/tests/features/unhashed-auto.t
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+. $(dirname $0)/../dht.rc
+
+NFILES=100
+
+touch_files () {
+ for i in $(seq 1 $NFILES); do
+ touch $(printf $M0/dir/file%02d $i)
+ done
+}
+
+count_files () {
+ found=0
+ for i in $(seq 1 $NFILES); do
+ if [ -f $(printf $M0/dir/file%02d $i) ]; then
+ found=$((found+1))
+ fi
+ done
+ echo "found $found files" > /dev/tty
+ echo $found
+}
+
+wait_for_rebalance () {
+ while true; do
+ tmp=$(rebalance_completed)
+ if [ $tmp -eq 1 ]; then
+ sleep 1
+ else
+ break
+ fi
+ done
+}
+
+get_xattr () {
+ cmd="getfattr --absolute-names --only-values -n trusted.glusterfs.dht"
+ $cmd $1 | od -tx1 -An | tr -d ' '
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume info
+
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}
+EXPECT "$V0" volinfo_field $V0 'Volume Name'
+EXPECT 'Created' volinfo_field $V0 'Status'
+
+TEST $CLI volume set $V0 cluster.lookup-unhashed auto
+
+TEST $CLI volume start $V0
+EXPECT 'Started' volinfo_field $V0 'Status'
+
+# Create some files for later tests.
+TEST $GFS -s $H0 --volfile-id $V0 $M0
+TEST mkdir $M0/dir
+TEST touch_files
+TEST umount $M0
+
+# Add a brick and do the fix-layout part of rebalance to update directory layouts
+# (including their directory commit hashes).
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3
+EXPECT '3' brick_count $V0
+TEST $CLI volume rebalance $V0 fix-layout start
+TEST wait_for_rebalance
+
+# Now for the sneaky part. *Undo* the part of rebalance that updated the volume
+# commit hash, forcing a false match between that and the directory commit hashes.
+TEST setfattr -x trusted.glusterfs.dht.commithash $B0/${V0}1
+TEST setfattr -x trusted.glusterfs.dht.commithash $B0/${V0}2
+TEST setfattr -x trusted.glusterfs.dht.commithash $B0/${V0}3
+
+# Mount and check that we do *not* see all of the files. This indicates that we
+# correctly skipped the broadcast lookup that would have found them.
+TEST $GFS -s $H0 --volfile-id $V0 $M0
+TEST [ $(count_files) -ne 100 ]
+TEST umount $M0
+
+# Do the fix-layout again to generate a new volume commit hash.
+TEST $CLI volume rebalance $V0 fix-layout start
+TEST wait_for_rebalance
+
+# Mount and check that we *do* see all of the files. This indicates that we saw
+# the mismatch and did the broadcast lookup this time.
+TEST $GFS -s $H0 --volfile-id $V0 $M0
+TEST [ $(count_files) -eq 100 ]
+TEST umount $M0
+
+# Do a *full* rebalance and verify that the directory commit hash changed.
+old_val=$(get_xattr $B0/${V0}1/dir)
+TEST $CLI volume rebalance $V0 start
+TEST wait_for_rebalance
+new_val=$(get_xattr $B0/${V0}1/dir)
+TEST [ ! x"$old_val" = x"$new_val" ]
+
+cleanup
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 6c0afdbec90..37e07ad77da 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -210,6 +210,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
int ret = -1;
dht_layout_t *layout = NULL;
dht_conf_t *conf = NULL;
+ uint32_t vol_commit_hash = 0;
local = discover_frame->local;
layout = local->layout;
@@ -279,6 +280,15 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
dht_layout_set (this, local->inode, layout);
}
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (local->xattr,
+ conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
local->inode, &local->stbuf, local->xattr,
&local->postparent);
@@ -459,6 +469,12 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc)
"%s: Failed to set dictionary value:key = %s",
loc->path, conf->link_xattr_name);
+ if (__is_root_gfid(local->loc.gfid)) {
+ ret = dict_set_uint32 (local->xattr_req,
+ conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
+
call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
@@ -655,6 +671,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *copy = NULL;
dht_local_t *copy_local = NULL;
char gfid[GF_UUID_BUF_SIZE] = {0};
+ uint32_t vol_commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, err);
@@ -667,6 +684,14 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!conf)
goto out;
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
gf_uuid_unparse (local->loc.gfid, gfid);
LOCK (&frame->lock);
@@ -1852,6 +1877,7 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *prev = NULL;
int ret = 0;
dht_layout_t *parent_layout = NULL;
+ uint32_t vol_commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -1875,6 +1901,14 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"fresh_lookup returned for %s with op_ret %d and "
"op_errno %d", loc->path, op_ret, op_errno);
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32 (xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+
if (ENTRY_MISSING (op_ret, op_errno)) {
gf_msg_debug (this->name, 0,
"Entry %s missing on subvol %s",
@@ -1891,7 +1925,10 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
&parent_layout);
if (ret || !parent_layout)
goto out;
- if (parent_layout->search_unhashed) {
+ if (parent_layout->commit_hash
+ != conf->vol_commit_hash) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "hashes don't match, do global lookup");
local->op_errno = ENOENT;
dht_lookup_everywhere (frame, this, loc);
return 0;
@@ -2078,6 +2115,12 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
return 0;
}
+ if (__is_root_gfid(loc->gfid)) {
+ ret = dict_set_uint32 (local->xattr_req,
+ conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
+
if (!hashed_subvol)
hashed_subvol = dht_subvol_get_hashed (this, loc);
local->hashed_subvol = hashed_subvol;
@@ -3238,8 +3281,9 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this,
conf = this->private;
- GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
- op_errno, err);
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
+ op_errno, err);
local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR);
if (!local) {
@@ -3338,6 +3382,7 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
char value[4096] = {0,};
gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;
int call_cnt = 0;
+ uint32_t new_hash = 0;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -3350,8 +3395,10 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
methods = conf->methods;
GF_VALIDATE_OR_GOTO (this->name, conf->methods, err);
- GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
- op_errno, err);
+ /* Rebalance daemon is allowed to set internal keys */
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
+ op_errno, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR);
if (!local) {
@@ -3489,6 +3536,22 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_INFO,
"fixing the layout of %s", loc->path);
+ ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "updating commit hash for %s from %u to %u",
+ uuid_utoa(loc->gfid),
+ layout->commit_hash, new_hash);
+ layout->commit_hash = new_hash;
+
+ ret = dht_update_commit_hash_for_layout (frame);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
+
ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk,
layout);
if (ret) {
@@ -5377,6 +5440,8 @@ dht_mkdir (call_frame_t *frame, xlator_t *this,
goto err;
}
+ local->layout->commit_hash = conf->vol_commit_hash;
+
STACK_WIND (frame, dht_mkdir_hashed_cbk,
hashed_subvol,
hashed_subvol->fops->mkdir,
@@ -6570,10 +6635,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
ret = snprintf (string, max_string_len,
"[Subvol_name: %s, Err: %d , Start: "
- "%"PRIu32 " , Stop: %"PRIu32 " ], ",
+ "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %"
+ PRIu32 " ], ",
layout->list[i].xlator->name,
layout->list[i].err, layout->list[i].start,
- layout->list[i].stop);
+ layout->list[i].stop,
+ layout->list[i].commit_hash);
if (ret < 0)
return;
@@ -6602,10 +6669,12 @@ dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
ret = snprintf (output_string + off, len - off,
"[Subvol_name: %s, Err: %d , Start: "
- "%"PRIu32 " , Stop: %"PRIu32 " ], ",
+ "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %"
+ PRIu32 " ], ",
layout->list[i].xlator->name,
layout->list[i].err, layout->list[i].start,
- layout->list[i].stop);
+ layout->list[i].stop,
+ layout->list[i].commit_hash);
if (ret < 0)
goto err;
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 9a6ed1a889a..45b6cc9e80b 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -31,6 +31,7 @@
#define DHT_PATHINFO_HEADER "DISTRIBUTE:"
#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate"
#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal"
+#define DHT_LAYOUT_HASH_INVALID 1
#include <fnmatch.h>
@@ -48,6 +49,20 @@ struct dht_layout {
special key */
int cnt;
int preset;
+ /*
+ * The last *configuration* state for which this directory was known
+ * to be in balance. The corresponding vol_commit_hash changes
+ * whenever bricks are added or removed. This value changes when a
+ * (full) rebalance is complete. If they match, it's safe to assume
+ * that every file is where it should be and there's no need to do
+ * lookups for files elsewhere. If they don't, then we have to do a
+ * global lookup to be sure.
+ */
+ uint32_t commit_hash;
+ /*
+ * The *runtime* state of the volume, changes when connections to
+ * bricks are made or lost.
+ */
int gen;
int type;
int ref; /* use with dht_conf_t->layout_lock */
@@ -59,6 +74,7 @@ struct dht_layout {
*/
uint32_t start;
uint32_t stop;
+ uint32_t commit_hash;
xlator_t *xlator;
} list[];
};
@@ -325,6 +341,7 @@ struct gf_defrag_info_ {
uuid_t node_uuid;
struct timeval start_time;
gf_boolean_t stats;
+ uint32_t new_commit_hash;
gf_defrag_pattern_list_t *defrag_pattern;
int tier_promote_frequency;
int tier_demote_frequency;
@@ -422,6 +439,7 @@ struct dht_conf {
/* Support variable xattr names. */
char *xattr_name;
char *link_xattr_name;
+ char *commithash_xattr_name;
char *wild_xattr_name;
/* Support size-weighted rebalancing (heterogeneous bricks). */
@@ -436,6 +454,13 @@ struct dht_conf {
/*local subvol storage for rebalance*/
xlator_t **local_subvols;
int32_t local_subvols_cnt;
+
+ /*
+ * "Commit hash" for this volume topology. Changed whenever bricks
+ * are added or removed.
+ */
+ uint32_t vol_commit_hash;
+ gf_boolean_t vch_forced;
};
typedef struct dht_conf dht_conf_t;
@@ -576,7 +601,7 @@ int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr);
-int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
+int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
int pos, int32_t **disk_layout_p);
int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
int pos, void *disk_layout_raw, int disk_layout_len);
@@ -631,6 +656,7 @@ xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
+int dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol);
int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);;
void dht_layout_unref (xlator_t *this, dht_layout_t *layout);
dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout);
@@ -649,6 +675,7 @@ int dht_rename_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preparent, struct iatt *postparent,
dict_t *xdata);
+int dht_update_commit_hash_for_layout (call_frame_t *frame);
int dht_fix_directory_layout (call_frame_t *frame,
dht_selfheal_dir_cbk_t dir_cbk,
dht_layout_t *layout);
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 2ed15c5e43c..f88c786a55b 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -267,7 +267,7 @@ dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
goto out;
}
- disk_layout[0] = hton32 (1);
+ disk_layout[0] = hton32 (layout->list[pos].commit_hash);
disk_layout[1] = hton32 (layout->type);
disk_layout[2] = hton32 (layout->list[pos].start);
disk_layout[3] = hton32 (layout->list[pos].stop);
@@ -288,10 +288,10 @@ int
dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
int pos, void *disk_layout_raw, int disk_layout_len)
{
- int cnt = 0;
int type = 0;
int start_off = 0;
int stop_off = 0;
+ int commit_hash = 0;
int disk_layout[4];
if (!disk_layout_raw) {
@@ -305,14 +305,6 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
memcpy (disk_layout, disk_layout_raw, disk_layout_len);
- cnt = ntoh32 (disk_layout[0]);
- if (cnt != 1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: Invalid count %d", cnt);
- return -1;
- }
-
type = ntoh32 (disk_layout[1]);
switch (type) {
case DHT_HASH_TYPE_DM_USER:
@@ -330,21 +322,22 @@ dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
return -1;
}
+ commit_hash = ntoh32 (disk_layout[0]);
start_off = ntoh32 (disk_layout[2]);
stop_off = ntoh32 (disk_layout[3]);
+ layout->list[pos].commit_hash = commit_hash;
layout->list[pos].start = start_off;
layout->list[pos].stop = stop_off;
gf_msg_trace (this->name, 0,
- "merged to layout: %u - %u (type %d) from %s",
- start_off, stop_off, type,
+ "merged to layout: %u - %u (type %d, hash %d) from %s",
+ start_off, stop_off, commit_hash, type,
layout->list[pos].xlator->name);
return 0;
}
-
int
dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr)
@@ -397,6 +390,13 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
subvol->name);
goto out;
}
+
+ if (layout->commit_hash == 0) {
+ layout->commit_hash = layout->list[i].commit_hash;
+ } else if (layout->commit_hash != layout->list[i].commit_hash) {
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+ }
+
layout->list[i].err = 0;
out:
@@ -409,6 +409,7 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
{
uint32_t start_swap = 0;
uint32_t stop_swap = 0;
+ uint32_t commit_hash_swap = 0;
xlator_t *xlator_swap = 0;
int err_swap = 0;
@@ -416,16 +417,19 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
stop_swap = layout->list[i].stop;
xlator_swap = layout->list[i].xlator;
err_swap = layout->list[i].err;
+ commit_hash_swap = layout->list[i].commit_hash;
layout->list[i].start = layout->list[j].start;
layout->list[i].stop = layout->list[j].stop;
layout->list[i].xlator = layout->list[j].xlator;
layout->list[i].err = layout->list[j].err;
+ layout->list[i].commit_hash = layout->list[j].commit_hash;
layout->list[j].start = start_swap;
layout->list[j].stop = stop_swap;
layout->list[j].xlator = xlator_swap;
layout->list[j].err = err_swap;
+ layout->list[j].commit_hash = commit_hash_swap;
}
void
@@ -728,9 +732,9 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int dict_ret = 0;
int32_t disk_layout[4];
void *disk_layout_raw = NULL;
- int32_t count = -1;
uint32_t start_off = -1;
uint32_t stop_off = -1;
+ uint32_t commit_hash = -1;
dht_conf_t *conf = this->private;
char gfid[GF_UUID_BUF_SIZE] = {0};
@@ -779,27 +783,21 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout));
- count = ntoh32 (disk_layout[0]);
- if (count != 1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: invalid count %d,"
- "path = %s, gfid = %s ", count, loc->path, gfid);
- ret = -1;
- goto out;
- }
-
start_off = ntoh32 (disk_layout[2]);
stop_off = ntoh32 (disk_layout[3]);
+ commit_hash = ntoh32 (disk_layout[0]);
if ((layout->list[pos].start != start_off)
- || (layout->list[pos].stop != stop_off)) {
+ || (layout->list[pos].stop != stop_off)
+ || (layout->list[pos].commit_hash != commit_hash)) {
gf_log (this->name, GF_LOG_INFO,
- "subvol: %s; inode layout - %"PRIu32" - %"PRIu32"; "
- "disk layout - %"PRIu32" - %"PRIu32,
+ "subvol: %s; inode layout - %"PRIu32" - %"PRIu32
+ " - %"PRIu32"; "
+ "disk layout - %"PRIu32" - %"PRIu32" - %"PRIu32,
layout->list[pos].xlator->name,
layout->list[pos].start, layout->list[pos].stop,
- start_off, stop_off);
+ layout->list[pos].commit_hash,
+ start_off, stop_off, commit_hash);
ret = 1;
} else {
ret = 0;
@@ -839,3 +837,18 @@ dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode)
out:
return ret;
}
+
+int
+dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol)
+{
+ int i = 0, ret = -1;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol) {
+ ret = i;
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 60f7314efe0..fae856d969f 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -2337,6 +2337,46 @@ out:
}
int
+gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag,
+ loc_t *loc, dict_t *fix_layout)
+{
+ int ret;
+
+ /*
+ * Now we're ready to update the directory commit hash for the volume
+ * root, so that hash miscompares and broadcast lookups can stop.
+ * However, we want to skip that if fix-layout is all we did. In
+ * that case, we want the miscompares etc. to continue until a real
+ * rebalance is complete.
+ */
+ if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX
+ || defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER
+ || defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
+ return 0;
+ }
+
+ ret = dict_set_uint32 (fix_layout, "new-commit-hash",
+ defrag->new_commit_hash);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set new-commit-hash");
+ return -1;
+ }
+
+ ret = syncop_setxattr (this, loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "fix layout on %s failed", loc->path);
+ return -1;
+ }
+
+ /* TBD: find more efficient solution than adding/deleting every time */
+ dict_del(fix_layout, "new-commit-hash");
+
+ return 0;
+}
+
+int
gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *fix_layout, dict_t *migrate_data)
{
@@ -2422,6 +2462,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Child loc"
" build failed");
+ ret = -1;
goto out;
}
@@ -2487,9 +2528,16 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
"Fix layout failed for %s",
entry_loc.path);
defrag->total_failures++;
+ ret = -1;
goto out;
}
+ if (gf_defrag_settle_hash (this, defrag, &entry_loc,
+ fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
}
gf_dirent_free (&entries);
free_entries = _gf_false;
@@ -2573,6 +2621,36 @@ gf_defrag_start_crawl (void *data)
goto out;
}
+ /*
+ * Unfortunately, we can't do special xattrs (like fix.layout) and
+ * real ones in the same call currently, and changing it seems
+ * riskier than just doing two calls.
+ */
+
+ gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u",
+ __func__, conf->vol_commit_hash);
+
+ ret = dict_set_uint32 (fix_layout, conf->commithash_xattr_name,
+ conf->vol_commit_hash);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set %s", conf->commithash_xattr_name);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed",
+ loc.path);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ /* We now return to our regularly scheduled program. */
+
ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -2580,10 +2658,13 @@ gf_defrag_start_crawl (void *data)
"Failed to start rebalance:"
"Failed to set dictionary value: key = %s",
GF_XATTR_FIX_LAYOUT_KEY);
+ defrag->total_failures++;
ret = -1;
goto out;
}
+ defrag->new_commit_hash = conf->vol_commit_hash;
+
ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
@@ -2599,19 +2680,18 @@ gf_defrag_start_crawl (void *data)
(defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
migrate_data = dict_new ();
if (!migrate_data) {
+ defrag->total_failures++;
ret = -1;
goto out;
}
- if (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)
- ret = dict_set_str (migrate_data,
- GF_XATTR_FILE_MIGRATE_KEY,
- "force");
- else
- ret = dict_set_str (migrate_data,
- GF_XATTR_FILE_MIGRATE_KEY,
- "non-force");
- if (ret)
+ ret = dict_set_str (migrate_data, GF_XATTR_FILE_MIGRATE_KEY,
+ (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)
+ ? "force" : "non-force");
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
goto out;
+ }
/* Find local subvolumes */
ret = syncop_getxattr (this, &loc, &dict,
@@ -2670,6 +2750,17 @@ gf_defrag_start_crawl (void *data)
ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout,
migrate_data);
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ if (gf_defrag_settle_hash (this, defrag, &loc, fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
methods = conf->methods;
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index cc093e1199f..c881a361804 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -23,11 +23,14 @@
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,path) do { \
layout->list[i].start = srt; \
layout->list[i].stop = srt + chunk - 1; \
+ layout->list[i].commit_hash = layout->commit_hash; \
\
gf_msg_trace (this->name, 0, \
- "gave fix: %u - %u on %s for %s", \
+ "gave fix: %u - %u, with commit-hash %u" \
+ " on %s for %s", \
layout->list[i].start, \
layout->list[i].stop, \
+ layout->list[i].commit_hash, \
layout->list[i].xlator->name, path); \
} while (0)
@@ -448,6 +451,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
dht_layout_t **ondisk)
{
gf_boolean_t fixit = _gf_true;
+
dht_local_t *local = NULL;
int layout_span = 0;
int decommissioned_bricks = 0;
@@ -482,6 +486,10 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
if (local->selfheal.hole_cnt || local->selfheal.overlaps_cnt)
goto out;
+ /* If commit hashes are being updated, let it through */
+ if ((*inmem)->commit_hash != (*ondisk)->commit_hash)
+ goto out;
+
layout_span = dht_layout_span (*ondisk);
decommissioned_bricks
@@ -497,6 +505,7 @@ dht_should_fix_layout (call_frame_t *frame, dht_layout_t **inmem,
fixit = _gf_false;
out:
+
return fixit;
}
@@ -756,6 +765,7 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
dummy = dht_layout_new (this, 1);
if (!dummy)
goto out;
+ dummy->commit_hash = layout->commit_hash;
for (i = 0; i < conf->subvolume_cnt; i++) {
if (_gf_false ==
dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
@@ -1474,6 +1484,8 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
new_layout->list[i].xlator = layout->list[i].xlator;
}
+ new_layout->commit_hash = layout->commit_hash;
+
if (priv->du_stats) {
for (i = 0; i < priv->subvolume_cnt; ++i) {
gf_log (this->name, GF_LOG_INFO,
@@ -1653,6 +1665,11 @@ dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
overlaps = local->selfheal.overlaps_cnt;
if (holes || overlaps) {
+ /* If the layout has anomolies which would change the hash
+ * ranges, then we need to reset the commit_hash for this
+ * directory, as the layout would change and things may not
+ * be in place as expected */
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
dht_selfheal_layout_new_directory (frame, loc, layout);
ret = 0;
}
@@ -1934,3 +1951,300 @@ dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data)
DHT_STACK_DESTROY (sync_frame);
return 0;
}
+
+/* EXIT: dht_update_commit_hash_for_layout */
+/*
+ * Last step of the commit-hash update: unwinds the frame as a setxattr
+ * reply carrying local->op_ret / local->op_errno.  The op_ret/op_errno
+ * handed in are copied into local only when local holds no earlier failure.
+ */
+int
+dht_update_commit_hash_for_layout_done (call_frame_t *frame, void *cookie,
+                                        xlator_t *this, int32_t op_ret,
+                                        int32_t op_errno, dict_t *xdata)
+{
+        dht_local_t *local = frame->local;
+
+        /* an error recorded earlier wins over the one reported now */
+        if ((local->op_ret == 0) && (op_ret != 0)) {
+                local->op_errno = op_errno;
+                local->op_ret   = op_ret;
+        }
+
+        DHT_STACK_UNWIND (setxattr, frame, local->op_ret,
+                          local->op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * Releases the inodelks tracked in local->lock, passing
+ * dht_update_commit_hash_for_layout_done as the completion callback.
+ * If the unlock cannot be wound, the failure is recorded (unless an earlier
+ * one is already stored), a warning is logged, and the done step is invoked
+ * directly so the frame is still unwound.
+ */
+int
+dht_update_commit_hash_for_layout_unlock (call_frame_t *frame, xlator_t *this)
+{
+        dht_local_t *local = frame->local;
+        int          ret   = 0;
+
+        ret = dht_unlock_inodelk (frame, local->lock.locks,
+                                  local->lock.lk_count,
+                                  dht_update_commit_hash_for_layout_done);
+        if (ret >= 0)
+                return 0;
+
+        /* keep the first failure that was seen */
+        if (local->op_ret == 0) {
+                local->op_errno = errno;
+                local->op_ret = -1;
+        }
+
+        gf_msg (this->name, GF_LOG_WARNING, errno,
+                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                "Winding unlock failed: stale locks left on brick"
+                " %s", local->loc.path);
+
+        dht_update_commit_hash_for_layout_done (frame, NULL, this,
+                                                0, 0, NULL);
+
+        return 0;
+}
+
+/*
+ * Per-subvolume setxattr callback.  Records the first failure under the
+ * frame lock; when the last outstanding call has returned, hands over to
+ * dht_update_commit_hash_for_layout_unlock.
+ */
+int
+dht_update_commit_hash_for_layout_cbk (call_frame_t *frame, void *cookie,
+                                       xlator_t *this, int op_ret,
+                                       int op_errno, dict_t *xdata)
+{
+        dht_local_t *local   = frame->local;
+        int          pending = 0;
+
+        LOCK (&frame->lock);
+        {
+                /* only the first failure is remembered */
+                if ((local->op_ret == 0) && (op_ret != 0)) {
+                        local->op_errno = op_errno;
+                        local->op_ret   = op_ret;
+                }
+        }
+        UNLOCK (&frame->lock);
+
+        pending = dht_frame_return (frame);
+        if (!is_last_call (pending))
+                return 0;
+
+        dht_update_commit_hash_for_layout_unlock (frame, this);
+
+        return 0;
+}
+
+/*
+ * Continuation passed to dht_blocking_inodelk by
+ * dht_update_commit_hash_for_layout.
+ *
+ * For every local subvolume it stamps layout->commit_hash into that
+ * subvolume's layout entry, extracts the on-disk layout, wraps it in a dict
+ * keyed by conf->xattr_name, and then winds one setxattr per subvolume.
+ *
+ * op_ret < 0 means the lock step failed: the frame is finished through
+ * dht_update_commit_hash_for_layout_done without attempting an unlock.
+ * Any failure while preparing the xattrs releases what was built so far and
+ * goes through dht_update_commit_hash_for_layout_unlock instead.
+ *
+ * Always returns 0; the outcome is reported through the frame's unwind.
+ */
+int
+dht_update_commit_hash_for_layout_resume (call_frame_t *frame, void *cookie,
+                                          xlator_t *this, int32_t op_ret,
+                                          int32_t op_errno, dict_t *xdata)
+{
+        dht_local_t   *local = NULL;
+        int            count = 1, ret = -1, i = 0, j = 0;
+        dht_conf_t    *conf = NULL;
+        dht_layout_t  *layout = NULL;
+        int32_t       *disk_layout = NULL;
+        dict_t       **xattr = NULL;
+
+        local = frame->local;
+        conf = frame->this->private;
+        count = conf->local_subvols_cnt;
+        layout = local->layout;
+
+        if (op_ret < 0) {
+                goto err_done;
+        }
+
+        /* We precreate the xattr list as we cannot change call count post the
+         * first wind as we may never continue from there. So we finish prep
+         * work before winding the setxattrs */
+        xattr = GF_CALLOC (count, sizeof (*xattr), gf_common_mt_char);
+        if (!xattr) {
+                local->op_errno = errno;
+
+                gf_msg (this->name, GF_LOG_WARNING, errno,
+                        DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                        "Directory commit hash update failed:"
+                        " %s: Allocation failed", local->loc.path);
+
+                goto err;
+        }
+
+        for (i = 0; i < count; i++) {
+                /* find the layout index for the subvolume */
+                ret = dht_layout_index_for_subvol (layout,
+                                                   conf->local_subvols[i]);
+                if (ret < 0) {
+                        local->op_errno = ENOENT;
+
+                        gf_msg (this->name, GF_LOG_WARNING, 0,
+                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                                "Directory commit hash update failed:"
+                                " %s: (subvol %s) Failed to find disk layout",
+                                local->loc.path, conf->local_subvols[i]->name);
+
+                        goto err;
+                }
+                j = ret;
+
+                /* update the commit hash for the layout */
+                layout->list[j].commit_hash = layout->commit_hash;
+
+                /* extract the current layout */
+                ret = dht_disk_layout_extract (this, layout, j, &disk_layout);
+                if (ret == -1) {
+                        local->op_errno = errno;
+
+                        gf_msg (this->name, GF_LOG_WARNING, errno,
+                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                                "Directory commit hash update failed:"
+                                " %s: (subvol %s) Failed to extract disk"
+                                " layout", local->loc.path,
+                                conf->local_subvols[i]->name);
+
+                        goto err;
+                }
+
+                xattr[i] = get_new_dict ();
+                if (!xattr[i]) {
+                        local->op_errno = errno;
+
+                        gf_msg (this->name, GF_LOG_WARNING, errno,
+                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                                "Directory commit hash update failed:"
+                                " %s: Allocation failed", local->loc.path);
+
+                        goto err;
+                }
+
+                /* NOTE(review): 4 * 4 is presumably four 32-bit fields of the
+                 * on-disk layout; confirm against dht_disk_layout_extract */
+                ret = dict_set_bin (xattr[i], conf->xattr_name,
+                                    disk_layout, 4 * 4);
+                if (ret != 0) {
+                        local->op_errno = ENOMEM;
+
+                        gf_msg (this->name, GF_LOG_WARNING, 0,
+                                DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+                                "Directory self heal xattr failed:"
+                                "%s: (subvol %s) Failed to set xattr"
+                                " dictionary,", local->loc.path,
+                                conf->local_subvols[i]->name);
+
+                        goto err;
+                }
+                /* the buffer was handed to the dict; the error path must
+                 * not free it again */
+                disk_layout = NULL;
+
+                gf_msg_trace (this->name, 0,
+                              "setting commit hash %u on subvolume %s"
+                              " for %s", layout->list[j].commit_hash,
+                              conf->local_subvols[i]->name, local->loc.path);
+        }
+
+        /* wind the setting of the commit hash across the local subvols */
+        local->call_cnt = count;
+        local->op_ret = 0;
+        local->op_errno = 0;
+        for (i = 0; i < count; i++) {
+                dict_ref (xattr[i]);
+
+                STACK_WIND (frame, dht_update_commit_hash_for_layout_cbk,
+                            conf->local_subvols[i],
+                            conf->local_subvols[i]->fops->setxattr,
+                            &local->loc, xattr[i], 0, NULL);
+
+                dict_unref (xattr[i]);
+        }
+
+        /* The pointer array was only needed to drive the winds above; it is
+         * a private allocation of this function (never reachable from the
+         * frame), so release it here instead of leaking it on every update. */
+        GF_FREE (xattr);
+
+        return 0;
+err:
+        if (xattr) {
+                for (i = 0; i < count; i++) {
+                        if (xattr[i])
+                                dict_destroy (xattr[i]);
+                }
+
+                GF_FREE (xattr);
+        }
+
+        GF_FREE (disk_layout);
+
+        local->op_ret = -1;
+
+        dht_update_commit_hash_for_layout_unlock (frame, this);
+
+        return 0;
+err_done:
+        local->op_ret = -1;
+
+        dht_update_commit_hash_for_layout_done (frame, NULL, this, 0, 0, NULL);
+
+        return 0;
+}
+
+/* ENTER: dht_update_commit_hash_for_layout (see EXIT above)
+ * Only rebalance calls this, which is why the sole precondition checked is
+ * that conf->defrag is set; the remaining data is expected to be populated
+ * in that case.  Using it from any other code path would need stronger
+ * checks.
+ *
+ * Sequence:
+ * - take a write inodelk (layout heal domain) on every local subvolume
+ * - rewrite each on-disk layout from the inode layout with the new
+ *   commit hash
+ * - unlock and unwind
+ *
+ * Returns 0 once the lock request has been issued, -1 if nothing was
+ * started (validation, allocation or lock-wind failure).
+ */
+int
+dht_update_commit_hash_for_layout (call_frame_t *frame)
+{
+        dht_local_t  *local = NULL;
+        int           count = 1, ret = -1, i = 0;
+        dht_lock_t  **lk_array = NULL;
+        dht_conf_t   *conf = NULL;
+
+        GF_VALIDATE_OR_GOTO ("dht", frame, err);
+        GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err);
+
+        local = frame->local;
+        conf = frame->this->private;
+
+        if (!conf->defrag)
+                goto err;
+
+        count = conf->local_subvols_cnt;
+        lk_array = GF_CALLOC (count, sizeof (*lk_array),
+                              gf_common_mt_char);
+        if (!lk_array)
+                goto err;
+
+        /* one write lock per local subvolume */
+        for (i = 0; i < count; i++) {
+                lk_array[i] = dht_lock_new (frame->this,
+                                            conf->local_subvols[i],
+                                            &local->loc, F_WRLCK,
+                                            DHT_LAYOUT_HEAL_DOMAIN);
+                if (!lk_array[i])
+                        goto err;
+        }
+
+        local->lock.locks = lk_array;
+        local->lock.lk_count = count;
+
+        ret = dht_blocking_inodelk (frame, lk_array, count,
+                                    dht_update_commit_hash_for_layout_resume);
+        if (ret < 0) {
+                /* detach the array from local before it is freed below */
+                local->lock.locks = NULL;
+                local->lock.lk_count = 0;
+                goto err;
+        }
+
+        return 0;
+err:
+        if (lk_array != NULL) {
+                int tmp_count = 0;
+
+                /* count the leading entries that were actually created */
+                while ((tmp_count < count) && (lk_array[tmp_count]))
+                        tmp_count++;
+
+                dht_lock_array_free (lk_array, tmp_count);
+                GF_FREE (lk_array);
+        }
+
+        return -1;
+}
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index ffabc820d70..a1f72a85112 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -569,6 +569,7 @@ dht_init (xlator_t *this)
int cmd = 0;
char *node_uuid = NULL;
int throttle_count = 0;
+ uint32_t commit_hash = 0;
GF_VALIDATE_OR_GOTO ("dht", this, err);
@@ -590,6 +591,15 @@ dht_init (xlator_t *this)
goto err;
}
+ /* We get the commit-hash to set only for rebalance process */
+ if (dict_get_uint32 (this->options,
+ "commit-hash", &commit_hash) == 0) {
+ gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u",
+ __func__, commit_hash);
+ conf->vol_commit_hash = commit_hash;
+ conf->vch_forced = _gf_true;
+ }
+
ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
if (cmd) {
@@ -760,6 +770,8 @@ dht_init (xlator_t *this)
GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);
gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR,
conf->xattr_name);
+ gf_asprintf (&conf->commithash_xattr_name, "%s."DHT_COMMITHASH_STR,
+ conf->xattr_name);
gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);
if (!conf->link_xattr_name || !conf->wild_xattr_name) {
goto err;
@@ -871,6 +883,9 @@ struct volume_options options[] = {
{ .key = {"rebalance-cmd"},
.type = GF_OPTION_TYPE_INT,
},
+ { .key = {"commit-hash"},
+ .type = GF_OPTION_TYPE_INT,
+ },
{ .key = {"node-uuid"},
.type = GF_OPTION_TYPE_STR,
},
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
index 92d15c615be..019766c5d83 100644
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -1997,6 +1997,8 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
dict_t *bricks_dict = NULL;
char *brick_tmpstr = NULL;
int start_remove = 0;
+ uint32_t commit_hash = 0;
+
this = THIS;
GF_ASSERT (this);
@@ -2262,6 +2264,9 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
break;
}
if (!force && need_rebalance) {
+ if (dict_get_uint32(dict, "commit-hash", &commit_hash) == 0) {
+ volinfo->rebal.commit_hash = commit_hash;
+ }
/* perform the rebalance operations */
ret = glusterd_handle_defrag_start
(volinfo, err_str, sizeof (err_str),
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index 858f0771ca6..bc0763483fd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -3404,6 +3404,36 @@ out:
}
int
+gd_set_commit_hash (dict_t *dict)
+{
+        struct timeval tv;
+        uint32_t       hash;
+
+        /*
+         * Stores a time-derived, non-zero 32-bit value under the key
+         * "commit-hash" in dict and returns dict_set_uint32's result.
+         *
+         * We need a commit hash that won't conflict with others we might have
+         * set, or zero which is the implicit value if we never have.  Using
+         * seconds<<3 like this ensures that we'll only get a collision if two
+         * consecutive rebalances are separated by exactly 2^29 seconds - about
+         * 17 years - and even then the low-order bits (one of 1, 2 or 4, see
+         * below) would also have to match.  It's far more likely that this
+         * code will have changed completely by then.  If not, call me in 2031.
+         *
+         * P.S. Time zone changes?  Yeah, right.
+         */
+        gettimeofday (&tv, NULL);
+
+        /*
+         * Convert to unsigned before shifting: tv_sec is a signed time_t, and
+         * left-shifting a signed value past its range is undefined behaviour
+         * where time_t is 32 bits wide.  Unsigned arithmetic wraps modulo
+         * 2^32, which is exactly the truncation wanted here.
+         */
+        hash = ((uint32_t) tv.tv_sec) << 3;
+
+        /*
+         * Make sure at least one of those low-order bits is set.  The extra
+         * shifting is because not all machines have sub-millisecond time
+         * resolution.
+         */
+        hash |= 1U << ((tv.tv_usec >> 10) % 3);
+
+        return dict_set_uint32 (dict, "commit-hash", hash);
+}
+
+int
glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
{
int ret = -1;
@@ -3415,6 +3445,7 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
uint32_t status_cmd = GF_CLI_STATUS_NONE;
char *errstr = NULL;
xlator_t *this = NULL;
+ gf_boolean_t do_common = _gf_false;
GF_ASSERT (req);
@@ -3503,12 +3534,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
}
break;
- case GD_OP_SYNC_VOLUME:
- {
- dict_copy (dict, req_dict);
- break;
- }
-
case GD_OP_REMOVE_BRICK:
{
dict_t *dict = ctx;
@@ -3525,6 +3550,10 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
if (ret)
goto out;
+ if (gd_set_commit_hash(dict) != 0) {
+ goto out;
+ }
+
dict_destroy (req_dict);
req_dict = dict_ref (dict);
}
@@ -3544,8 +3573,10 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
dict_copy (dict, req_dict);
break;
}
+ do_common = _gf_true;
}
- /*fall-through*/
+ break;
+
case GD_OP_DELETE_VOLUME:
case GD_OP_START_VOLUME:
case GD_OP_STOP_VOLUME:
@@ -3555,7 +3586,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
case GD_OP_LOG_ROTATE:
case GD_OP_QUOTA:
case GD_OP_PROFILE_VOLUME:
- case GD_OP_REBALANCE:
case GD_OP_HEAL_VOLUME:
case GD_OP_STATEDUMP_VOLUME:
case GD_OP_CLEARLOCKS_VOLUME:
@@ -3563,49 +3593,62 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
case GD_OP_BARRIER:
case GD_OP_BITROT:
{
- ret = dict_get_str (dict, "volname", &volname);
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "volname is not present in "
- "operation ctx");
- goto out;
- }
-
- if (strcasecmp (volname, "all")) {
- ret = glusterd_dict_set_volid (dict,
- volname,
- op_errstr);
- if (ret)
- goto out;
- }
- dict_copy (dict, req_dict);
+ do_common = _gf_true;
}
break;
- case GD_OP_COPY_FILE:
+ case GD_OP_REBALANCE:
{
- dict_copy (dict, req_dict);
- break;
+ if (gd_set_commit_hash(dict) != 0) {
+ goto out;
+ }
+ do_common = _gf_true;
}
+ break;
+ case GD_OP_SYNC_VOLUME:
+ case GD_OP_COPY_FILE:
case GD_OP_SYS_EXEC:
{
dict_copy (dict, req_dict);
- break;
}
+ break;
case GD_OP_GANESHA:
{
dict_copy (dict, req_dict);
- break;
}
+ break;
default:
break;
}
- *req = req_dict;
- ret = 0;
+ /*
+ * This has been moved out of the switch so that multiple ops with
+ * other special needs can all "fall through" to it.
+ */
+ if (do_common) {
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "volname is not present in "
+ "operation ctx");
+ goto out;
+ }
+
+ if (strcasecmp (volname, "all")) {
+ ret = glusterd_dict_set_volid (dict,
+ volname,
+ op_errstr);
+ if (ret)
+ goto out;
+ }
+ dict_copy (dict, req_dict);
+ }
+
+ *req = req_dict;
+ ret = 0;
out:
return ret;
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
index 48d9a706042..cf8ee3a79f7 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -284,6 +284,9 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd);
runner_add_arg (&runner, "--xlator-option");
runner_argprintf (&runner, "*dht.node-uuid=%s", uuid_utoa(MY_UUID));
+ runner_add_arg (&runner, "--xlator-option");
+ runner_argprintf (&runner, "*dht.commit-hash=%u",
+ volinfo->rebal.commit_hash);
runner_add_arg (&runner, "--socket-file");
runner_argprintf (&runner, "%s",sockfile);
runner_add_arg (&runner, "--pid-file");
@@ -716,6 +719,7 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
char *task_id_str = NULL;
dict_t *ctx = NULL;
xlator_t *this = NULL;
+ uint32_t commit_hash;
this = THIS;
GF_ASSERT (this);
@@ -804,6 +808,9 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
glusterd_store_perform_node_state_store (volinfo);
break;
}
+ if (dict_get_uint32 (dict, "commit-hash", &commit_hash) == 0) {
+ volinfo->rebal.commit_hash = commit_hash;
+ }
ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg),
cmd, NULL, GD_OP_REBALANCE);
break;
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index 3f2ff45f1a1..5341192e84a 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -286,6 +286,7 @@ struct glusterd_rebalance_ {
glusterd_op_t op;
dict_t *dict; /* Dict to store misc information
* like list of bricks being removed */
+ uint32_t commit_hash;
};
typedef struct glusterd_rebalance_ glusterd_rebalance_t;