summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pkarampu@redhat.com>2017-01-25 15:31:44 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2017-02-26 22:06:55 -0500
commitc1fc1fc9cb5a13e6ddf8c9270deb0c7609333540 (patch)
treea3876aa8a0c1b087429ba916c9380b90bcda6b72
parent4638dfc1fee80f9338f2941f3cccb17bec63989a (diff)
cluster/ec: Don't trigger data/metadata heal on Lookups
Problem-1 If Lookup which doesn't take any locks observes version mismatch it can't be trusted. If we launch a heal based on this information it will lead to self-heals which will affect I/O performance in the cases where Lookup is wrong. Considering self-heal-daemon and operations on the inode from client which take locks can still trigger heal we can choose to not attempt a heal on Lookup. Problem-2: Fixed spurious failure of tests/bitrot/bug-1373520.t For the issues above, what was happening was that ec_heal_inspect() is preventing 'name' heal to happen Problem-3: tests/basic/ec/ec-background-heals.t To be honest I don't know what the problem was, while fixing the 2 problems above, I made some changes to ec_heal_inspect() and ec_need_heal() after which when I tried to recreate the spurious failure it just didn't happen even after a long time. BUG: 1414287 Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Change-Id: Ife2535e1d0b267712973673f6d474e288f3c6834 Reviewed-on: https://review.gluster.org/16468 Smoke: Gluster Build System <jenkins@build.gluster.org> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> Reviewed-by: Xavier Hernandez <xhernandez@datalab.es> CentOS-regression: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Ashish Pandey <aspandey@redhat.com>
-rw-r--r--libglusterfs/src/cluster-syncop.c4
-rw-r--r--tests/basic/ec/ec-background-heals.t2
-rw-r--r--tests/basic/ec/heal-info.t73
-rw-r--r--tests/basic/ec/self-heal.t21
-rw-r--r--tests/bitrot/bug-1373520.t37
-rw-r--r--tests/volume.rc5
-rw-r--r--xlators/cluster/ec/src/ec-common.c66
-rw-r--r--xlators/cluster/ec/src/ec-common.h1
-rw-r--r--xlators/cluster/ec/src/ec-heal.c321
-rw-r--r--xlators/cluster/ec/src/ec-helpers.c18
-rw-r--r--xlators/cluster/ec/src/ec-helpers.h3
11 files changed, 408 insertions, 143 deletions
diff --git a/libglusterfs/src/cluster-syncop.c b/libglusterfs/src/cluster-syncop.c
index 10993e6088d..98a46c85e4b 100644
--- a/libglusterfs/src/cluster-syncop.c
+++ b/libglusterfs/src/cluster-syncop.c
@@ -102,6 +102,10 @@ void
cluster_replies_wipe (default_args_cbk_t *replies, int numsubvols)
{
int i = 0;
+
+ if (!replies)
+ return;
+
for (i = 0; i < numsubvols; i++)
args_cbk_wipe (&replies[i]);
memset (replies, 0, numsubvols * sizeof (*replies));
diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t
index e99178d224e..eb434908bad 100644
--- a/tests/basic/ec/ec-background-heals.t
+++ b/tests/basic/ec/ec-background-heals.t
@@ -101,5 +101,3 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
TEST chown root:root $M0/{a,b,c,d}
EXPECT "0" mount_get_option_value $M0 $V0-disperse-0 heal-waiters
cleanup
-#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1416689
-#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1416689
diff --git a/tests/basic/ec/heal-info.t b/tests/basic/ec/heal-info.t
new file mode 100644
index 00000000000..7393d22d222
--- /dev/null
+++ b/tests/basic/ec/heal-info.t
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+# This test checks if heal info works as expected or not
+
+function create_files {
+ for i in {21..1000};
+ do
+ dd if=/dev/zero of=$M0/$i bs=1M count=1 2>&1 > /dev/null;
+ done
+ rm -f $M0/lock
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5}
+TEST $CLI volume set $V0 client-log-level DEBUG
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume start $V0
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0;
+# Wait until all 6 childs have been recognized by the ec xlator
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+#heal info should give zero entries to be healed when I/O is going on
+dd if=/dev/zero of=$M0/a bs=1M count=2048 &
+dd_pid=$!
+sleep 3 #Wait for I/O to proceed for some time
+EXPECT "^0$" get_pending_heal_count $V0
+kill -9 $dd_pid
+touch $M0/lock
+create_files &
+
+total_heal_count=0
+while [ -f $M0/lock ];
+do
+ heal_count=$(get_pending_heal_count $V0)
+ total_heal_count=$((heal_count+total_heal_count))
+done
+EXPECT "^0$" echo $total_heal_count
+
+#When only data heal is required it should print it
+#There is no easy way to create this using commands so assigning xattrs directly
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}0/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}1/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}2/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}3/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}4/1000
+TEST setfattr -n trusted.ec.version -v 0x00000000000000010000000000000000 $B0/${V0}5/1000
+index_path=$B0/${V0}5/.glusterfs/indices/xattrop/$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}5/1000))
+while [ -f $index_path ]; do :; done
+TEST touch $index_path
+EXPECT "^1$" get_pending_heal_count $V0
+TEST rm -f $M0/1000
+
+#When files/directories need heal test that it prints them
+TEST touch $M0/{1..10}
+TEST kill_brick $V0 $H0 $B0/${V0}0
+for i in {11..20};
+do
+ echo abc > $M0/$i #Data + entry + metadata heal
+done
+for i in {1..10};
+do
+ chmod +x $M0/$i;
+done
+
+EXPECT "^105$" get_pending_heal_count $V0
+
+cleanup
diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t
index 3e3467535fb..7f3486fe27b 100644
--- a/tests/basic/ec/self-heal.t
+++ b/tests/basic/ec/self-heal.t
@@ -9,6 +9,7 @@ cleanup
function check_mount_dir
{
+ getfattr -d -m. -e hex $M0 2>&1 > /dev/null
for i in {1..20}; do
ls -l $M0/ | grep "dir1"
if [ $? -ne 0 ]; then
@@ -21,7 +22,7 @@ function check_mount_dir
function check_size
{
- stat $M0/$1
+ cat $M0/$1 2>&1 > /dev/null
for i in "${brick[@]}"; do
res=`stat -c "%s" $i/$1`
if [ "$res" != "$2" ]; then
@@ -35,7 +36,7 @@ function check_size
function check_mode
{
- stat $M0/$1
+ getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null
for i in "${brick[@]}"; do
res=`stat -c "%A" $i/$1`
if [ "$res" != "$2" ]; then
@@ -49,7 +50,7 @@ function check_mode
function check_date
{
- stat $M0/$1
+ getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null
for i in "${brick[@]}"; do
res=`stat -c "%Y" $i/$1`
if [ "$res" != "$2" ]; then
@@ -63,7 +64,7 @@ function check_date
function check_xattr
{
- stat $M0/$1
+ getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null
for i in "${brick[@]}"; do
getfattr -n $2 $i/$1 2>/dev/null
if [ $? -eq 0 ]; then
@@ -77,7 +78,7 @@ function check_xattr
function check_dir
{
- getfattr -m. -d $M0/dir1
+ getfattr -m. -d $M0/dir1 2>&1 > /dev/null
for i in "${brick[@]}"; do
if [ ! -d $i/dir1 ]; then
echo "N"
@@ -90,7 +91,7 @@ function check_dir
function check_soft_link
{
- stat $M0/test3
+ getfattr -d -m. -e hex $M0/test3 2>&1 > /dev/null
for i in "${brick[@]}"; do
if [ ! -h $i/test3 ]; then
echo "N"
@@ -103,7 +104,7 @@ function check_soft_link
function check_hard_link
{
- stat $M0/test4
+ getfattr -d -m. -e hex $M0/test4 2>&1 > /dev/null
for i in "${brick[@]}"; do
res=`stat -c "%h" $i/test4`
if [ "$res" != "3" ]; then
@@ -125,10 +126,14 @@ TESTS_EXPECTED_IN_LOOP=194
TEST glusterd
TEST pidof glusterd
TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5}
+TEST $CLI volume set $V0 client-log-level DEBUG
+#Write-behind has a bug where lookup can race over write which leads to size mismatch on the mount after a 'cp'
+TEST $CLI volume set $V0 performance.write-behind off
EXPECT "Created" volinfo_field $V0 'Status'
TEST $CLI volume start $V0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status'
-TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+#direct-io-mode is to make sure 'cat' leads to READ fop which triggers heal
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0;
# Wait until all 6 childs have been recognized by the ec xlator
EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
diff --git a/tests/bitrot/bug-1373520.t b/tests/bitrot/bug-1373520.t
index 271bb3de287..225d3b1a9bc 100644
--- a/tests/bitrot/bug-1373520.t
+++ b/tests/bitrot/bug-1373520.t
@@ -49,39 +49,10 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count
#Delete file and all links from backend
TEST rm -rf $(find $B0/${V0}5 -inum $(stat -c %i $B0/${V0}5/FILE1))
-# The test for each file below used to look like this:
-#
-# TEST stat $M0/FILE1
-# EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" stat $B0/${V0}5/FILE1
-#
-# That didn't really work, because EXPECT_WITHIN would bail immediately if
-# 'stat' returned an error - which it would if the file wasn't there yet.
-# Since changing this, I usually see at least a few retries, and sometimes more
-# than twenty, before the check for HL_FILE1 succeeds. The 'ls' is also
-# necessary, to force a name heal as well as data. With both that and the
-# 'stat' on $M0 being done here for every retry, there's no longer any need to
-# have them elsewhere.
-#
-# If we had EW_RETRIES support (https://review.gluster.org/#/c/16451/) we could
-# use it here to see how many retries are typical on the machines we use for
-# regression, and set an appropriate upper bound. As of right now, though,
-# that support does not exist yet.
-ugly_stat () {
- local client_dir=$1
- local brick_dir=$2
- local bare_file=$3
-
- ls $client_dir
- stat -c %s $client_dir/$bare_file
- stat -c %s $brick_dir/$bare_file 2> /dev/null || echo "UNKNOWN"
-}
-
#Access files
-EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 FILE1
-EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 HL_FILE1
+TEST cat $M0/FILE1
+EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/FILE1
+TEST cat $M0/HL_FILE1
+EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/HL_FILE1
cleanup;
-#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1417540
-#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1417540
-
-
diff --git a/tests/volume.rc b/tests/volume.rc
index f76f37a44fc..5419e399d8f 100644
--- a/tests/volume.rc
+++ b/tests/volume.rc
@@ -519,6 +519,11 @@ function path_exists {
if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi
}
+function path_size {
+ local size=$(stat -c %s $1)
+ if [ $? -eq 0 ]; then echo $size; else echo ""; fi
+}
+
function force_umount {
${UMOUNT_F} $*
if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 5e1efe381d4..823922542a0 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -82,10 +82,50 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this,
return 0;
}
+static uintptr_t
+ec_fop_needs_name_heal (ec_fop_data_t *fop)
+{
+ ec_t *ec = NULL;
+ ec_cbk_data_t *cbk = NULL;
+ ec_cbk_data_t *enoent_cbk = NULL;
+
+ ec = fop->xl->private;
+ if (fop->id != GF_FOP_LOOKUP)
+ return 0;
+
+ if (!fop->loc[0].name || strlen (fop->loc[0].name) == 0)
+ return 0;
+
+ list_for_each_entry(cbk, &fop->cbk_list, list)
+ {
+ if (cbk->op_ret < 0 && cbk->op_errno == ENOENT) {
+ enoent_cbk = cbk;
+ break;
+ }
+ }
+
+ if (!enoent_cbk)
+ return 0;
+
+ return ec->xl_up & ~enoent_cbk->mask;
+}
+
int32_t ec_fop_needs_heal(ec_fop_data_t *fop)
{
ec_t *ec = fop->xl->private;
+ if (fop->lock_count == 0) {
+ /*
+ * if fop->lock_count is zero that means it saw version mismatch
+ * without any locks so it can't be trusted. If we launch a heal
+ * based on this it will lead to INODELKs which will affect I/O
+ * performance. Considering self-heal-daemon and operations on
+ * the inode from client which take locks can still trigger the
+ * heal we can choose to not attempt a heal when fop->lock_count
+ * is zero.
+ */
+ return 0;
+ }
return (ec->xl_up & ~(fop->remaining | fop->good)) != 0;
}
@@ -95,7 +135,7 @@ void ec_check_status(ec_fop_data_t * fop)
int32_t partial = 0;
char str1[32], str2[32], str3[32], str4[32], str5[32];
- if (!ec_fop_needs_heal(fop)) {
+ if (!ec_fop_needs_name_heal (fop) && !ec_fop_needs_heal(fop)) {
return;
}
@@ -108,19 +148,17 @@ void ec_check_status(ec_fop_data_t * fop)
}
}
- if (fop->lock_count > 0) {
- gf_msg (fop->xl->name, GF_LOG_WARNING, 0,
- EC_MSG_OP_FAIL_ON_SUBVOLS,
- "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
- "remaining=%s, good=%s, bad=%s)",
- gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
- ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
- ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
- ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
- ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
- ec_bin(str5, sizeof(str5),
- ec->xl_up & ~(fop->remaining | fop->good), ec->nodes));
- }
+ gf_msg (fop->xl->name, GF_LOG_WARNING, 0,
+ EC_MSG_OP_FAIL_ON_SUBVOLS,
+ "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
+ "remaining=%s, good=%s, bad=%s)",
+ gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
+ ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+ ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+ ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
+ ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
+ ec_bin(str5, sizeof(str5),
+ ec->xl_up & ~(fop->remaining | fop->good), ec->nodes));
if (fop->use_fd)
{
if (fop->fd != NULL) {
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index c532e0e03b0..a03a590402a 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -121,6 +121,7 @@ void ec_handle_healers_done (ec_fop_data_t *fop);
int32_t
ec_heal_inspect (call_frame_t *frame, ec_t *ec,
inode_t *inode, unsigned char *locked_on,
+ gf_boolean_t self_locked, gf_boolean_t thorough,
gf_boolean_t *need_heal);
int32_t
ec_get_heal_info (xlator_t *this, loc_t *loc, dict_t **dict);
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 06a4dbb5841..19ba5787006 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -345,7 +345,7 @@ ec_heal_entry_find_direction (ec_t *ec, default_args_cbk_t *replies,
if (source == -1)
source = i;
- ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_VERSION,
+ ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_VERSION,
xattr, EC_VERSION_SIZE);
if (ret == 0) {
versions[i] = xattr[EC_DATA_TXN];
@@ -356,7 +356,7 @@ ec_heal_entry_find_direction (ec_t *ec, default_args_cbk_t *replies,
}
memset (xattr, 0, sizeof(xattr));
- ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_DIRTY,
+ ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_DIRTY,
xattr, EC_VERSION_SIZE);
if (ret == 0) {
dirty[i] = xattr[EC_DATA_TXN];
@@ -453,6 +453,7 @@ out:
loc_wipe (&loc);
return op_ret;
}
+
int
ec_heal_metadata_find_direction (ec_t *ec, default_args_cbk_t *replies,
uint64_t *versions, uint64_t *dirty,
@@ -479,14 +480,14 @@ ec_heal_metadata_find_direction (ec_t *ec, default_args_cbk_t *replies,
continue;
if (replies[i].op_ret < 0)
continue;
- ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_VERSION,
+ ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_VERSION,
xattr, EC_VERSION_SIZE);
if (ret == 0) {
versions[i] = xattr[EC_METADATA_TXN];
}
memset (xattr, 0, sizeof (xattr));
- ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_DIRTY,
+ ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_DIRTY,
xattr, EC_VERSION_SIZE);
if (ret == 0) {
dirty[i] = xattr[EC_METADATA_TXN];
@@ -1496,26 +1497,22 @@ unlock:
/*Find direction for data heal and heal info*/
int
ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
- uint64_t *data_versions, uint64_t *meta_versions,
+ uint64_t *data_versions,
uint64_t *dirty, uint64_t *size, unsigned char *sources,
- unsigned char *healed_sinks, int which)
+ unsigned char *healed_sinks,
+ gf_boolean_t check_ondisksize, int which)
{
uint64_t xattr[EC_VERSION_SIZE] = {0};
char version_size[128] = {0};
dict_t *version_size_db = NULL;
- uint64_t *m_versions = NULL;
unsigned char *same = NULL;
int max_same_count = 0;
int source = 0;
int i = 0;
int ret = 0;
dict_t *dict = NULL;
+ uint64_t source_size = 0;
- if (!meta_versions) {
- m_versions = alloca0 (ec->nodes * sizeof (*m_versions));
- } else {
- m_versions = meta_versions;
- }
version_size_db = dict_new ();
if (!version_size_db) {
ret = -ENOMEM;
@@ -1530,17 +1527,14 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
dict = (which == EC_COMBINE_XDATA) ? replies[i].xdata :
replies[i].xattr;
- ret = ec_dict_del_array (dict, EC_XATTR_VERSION,
+ ret = ec_dict_get_array (dict, EC_XATTR_VERSION,
xattr, EC_VERSION_SIZE);
if (ret == 0) {
data_versions[i] = xattr[EC_DATA_TXN];
- if (meta_versions) {
- m_versions[i] = xattr[EC_METADATA_TXN];
- }
}
memset (xattr, 0, sizeof (xattr));
- ret = ec_dict_del_array (dict, EC_XATTR_DIRTY,
+ ret = ec_dict_get_array (dict, EC_XATTR_DIRTY,
xattr, EC_VERSION_SIZE);
if (ret == 0) {
dirty[i] = xattr[EC_DATA_TXN];
@@ -1549,8 +1543,7 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
&size[i]);
/*Build a db of same metadata and data version and size*/
snprintf (version_size, sizeof (version_size),
- "%"PRIu64"-%"PRIu64"-%"PRIu64, data_versions[i],
- m_versions[i], size[i]);
+ "%"PRIu64"-%"PRIu64, data_versions[i], size[i]);
ret = dict_get_bin (version_size_db, version_size,
(void **)&same);
@@ -1581,9 +1574,7 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
goto out;
} else {
snprintf (version_size, sizeof (version_size),
- "%"PRIu64"-%"PRIu64"-%"PRIu64,
- data_versions[source],
- m_versions[source],
+ "%"PRIu64"-%"PRIu64, data_versions[source],
size[source]);
ret = dict_get_bin (version_size_db, version_size,
@@ -1598,6 +1589,30 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
}
}
+ /* There could be files with versions, size same but on disk ia_size
+ * could be different because of disk crashes, mark them as sinks as
+ * well*/
+
+ if (check_ondisksize) {
+ source_size = ec_adjust_size (ec, size[source], 1);
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (sources[i]) {
+ if (replies[i].stat.ia_size != source_size) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ max_same_count--;
+ } else {
+ source = i;
+ }
+ }
+ }
+ if (max_same_count < ec->fragments) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+
ret = source;
out:
if (version_size_db)
@@ -1613,17 +1628,20 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,
struct iatt *stbuf)
{
default_args_cbk_t *replies = NULL;
+ default_args_cbk_t *fstat_replies = NULL;
unsigned char *output = NULL;
+ unsigned char *fstat_output = NULL;
dict_t *xattrs = NULL;
uint64_t zero_array[2] = {0};
int source = 0;
int ret = 0;
uint64_t zero_value = 0;
- uint64_t source_size = 0;
int i = 0;
EC_REPLIES_ALLOC (replies, ec->nodes);
+ EC_REPLIES_ALLOC (fstat_replies, ec->nodes);
output = alloca0(ec->nodes);
+ fstat_output = alloca0(ec->nodes);
xattrs = dict_new ();
if (!xattrs ||
dict_set_static_bin (xattrs, EC_XATTR_VERSION, zero_array,
@@ -1639,43 +1657,34 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,
ret = cluster_fxattrop (ec->xl_list, locked_on, ec->nodes,
replies, output, frame, ec->xl, fd,
GF_XATTROP_ADD_ARRAY64, xattrs, NULL);
+
+ ret = cluster_fstat (ec->xl_list, locked_on, ec->nodes, fstat_replies,
+ fstat_output, frame, ec->xl, fd, NULL);
+
+ for (i = 0; i < ec->nodes; i++) {
+ output[i] = output[i] && fstat_output[i];
+ replies[i].valid = output[i];
+ if (output[i])
+ replies[i].stat = fstat_replies[i].stat;
+ }
+
if (EC_COUNT (output, ec->nodes) <= ec->fragments) {
ret = -ENOTCONN;
goto out;
}
- source = ec_heal_data_find_direction (ec, replies, versions, NULL,
+ source = ec_heal_data_find_direction (ec, replies, versions,
dirty, size, sources,
- healed_sinks, EC_COMBINE_DICT);
+ healed_sinks, _gf_true,
+ EC_COMBINE_DICT);
ret = source;
if (ret < 0)
goto out;
- /* There could be files with versions, size same but on disk ia_size
- * could be different because of disk crashes, mark them as sinks as
- * well*/
- ret = cluster_fstat (ec->xl_list, locked_on, ec->nodes, replies,
- output, frame, ec->xl, fd, NULL);
- EC_INTERSECT (sources, sources, output, ec->nodes);
- EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
- if (EC_COUNT (sources, ec->nodes) < ec->fragments) {
- ret = -ENOTCONN;
- goto out;
- }
-
- source_size = ec_adjust_size (ec, size[source], 1);
+ if (stbuf)
+ *stbuf = replies[source].stat;
for (i = 0; i < ec->nodes; i++) {
- if (sources[i]) {
- if (replies[i].stat.ia_size != source_size) {
- sources[i] = 0;
- healed_sinks[i] = 1;
- } else if (stbuf) {
- source = i;
- *stbuf = replies[i].stat;
- }
- }
-
if (healed_sinks[i]) {
if (replies[i].stat.ia_size)
trim[i] = 1;
@@ -1692,6 +1701,7 @@ out:
if (xattrs)
dict_unref (xattrs);
cluster_replies_wipe (replies, ec->nodes);
+ cluster_replies_wipe (fstat_replies, ec->nodes);
if (ret < 0) {
gf_msg_debug (ec->xl->name, 0, "%s: heal failed %s",
uuid_utoa (fd->inode->gfid), strerror (-ret));
@@ -2346,7 +2356,7 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial)
frame = create_frame (this, this->ctx->pool);
if (!frame)
- return;
+ goto out;
ec_owner_set(frame, frame->root);
/*Do heal as root*/
@@ -2360,15 +2370,6 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial)
up_subvols = alloca0(ec->nodes);
ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
- ec_heal_inspect (frame, ec, loc->inode, up_subvols,
- &need_heal);
- if (!need_heal) {
- gf_msg (ec->xl->name, GF_LOG_DEBUG, 0,
- EC_MSG_HEAL_FAIL, "Heal is not required for : %s ",
- uuid_utoa(loc->gfid));
- goto out;
- }
-
if (loc->name && strlen (loc->name)) {
ret = ec_heal_name (frame, ec, loc->parent, (char *)loc->name,
participants);
@@ -2386,6 +2387,17 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial)
}
}
+ /* Mount triggers heal only when it detects that it must need heal, shd
+ * triggers heals periodically which need not be thorough*/
+ ec_heal_inspect (frame, ec, loc->inode, up_subvols, _gf_false,
+ !ec->shd.iamshd, &need_heal);
+ if (!need_heal) {
+ gf_msg (ec->xl->name, GF_LOG_DEBUG, 0,
+ EC_MSG_HEAL_FAIL, "Heal is not required for : %s ",
+ uuid_utoa(loc->gfid));
+ goto out;
+ }
+
msources = alloca0(ec->nodes);
mhealed_sinks = alloca0(ec->nodes);
ret = ec_heal_metadata (frame, ec, loc->inode, msources, mhealed_sinks);
@@ -2425,7 +2437,8 @@ out:
ec->nodes),
mgood & good, mbad & bad, NULL);
}
- STACK_DESTROY (frame->root);
+ if (frame)
+ STACK_DESTROY (frame->root);
return;
}
@@ -2681,40 +2694,170 @@ out:
return ret;
}
-int32_t
-ec_need_heal (ec_t *ec, default_args_cbk_t *replies,
- gf_boolean_t *need_heal, int32_t lock_count)
+static int32_t
+_need_heal_calculate (ec_t *ec, uint64_t *dirty, unsigned char *sources,
+ gf_boolean_t self_locked, int32_t lock_count,
+ gf_boolean_t *need_heal)
+{
+ int i = 0;
+ int source_count = 0;
+
+ source_count = EC_COUNT (sources, ec->nodes);
+ if (source_count == ec->nodes) {
+ *need_heal = _gf_false;
+ if (self_locked || lock_count == 0) {
+ for (i = 0; i < ec->nodes; i++) {
+ if (dirty[i]) {
+ *need_heal = _gf_true;
+ goto out;
+ }
+ }
+ } else {
+ for (i = 0; i < ec->nodes; i++) {
+ /* Since each lock can only increment the dirty
+ * count once, if dirty is > 1 it means that
+ * another operation has left the dirty count
+ * set and this indicates a problem in the
+ * inode.*/
+ if (dirty[i] > 1) {
+ *need_heal = _gf_true;
+ goto out;
+ }
+ }
+ }
+ } else {
+ *need_heal = _gf_true;
+ }
+
+out:
+ return source_count;
+}
+
+static int32_t
+ec_need_metadata_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, gf_boolean_t *need_heal)
{
uint64_t *dirty = NULL;
unsigned char *sources = NULL;
unsigned char *healed_sinks = NULL;
- uint64_t *data_versions = NULL;
uint64_t *meta_versions = NULL;
+ int ret = 0;
+ int i = 0;
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ dirty = alloca0 (ec->nodes * sizeof (*dirty));
+ meta_versions = alloca0 (ec->nodes * sizeof (*meta_versions));
+ ret = ec_heal_metadata_find_direction (ec, replies, meta_versions,
+ dirty, sources, healed_sinks);
+ if (ret < 0 && ret != -EIO) {
+ goto out;
+ }
+
+ ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count,
+ need_heal);
+ if (ret == ec->nodes && !(*need_heal)) {
+ for (i = 1; i < ec->nodes; i++) {
+ if (meta_versions[i] != meta_versions[0]) {
+ *need_heal = _gf_true;
+ goto out;
+ }
+ }
+ }
+out:
+ return ret;
+}
+
+static int32_t
+ec_need_data_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, gf_boolean_t *need_heal)
+{
+ uint64_t *dirty = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ uint64_t *data_versions = NULL;
uint64_t *size = NULL;
int ret = 0;
- int source_count = 0;
sources = alloca0(ec->nodes);
healed_sinks = alloca0(ec->nodes);
dirty = alloca0 (ec->nodes * sizeof (*dirty));
- size = alloca0 (ec->nodes * sizeof (*size));
data_versions = alloca0 (ec->nodes * sizeof (*data_versions));
- meta_versions = alloca0 (ec->nodes * sizeof (*meta_versions));
+ size = alloca0 (ec->nodes * sizeof (*size));
+ /* When dd is going on and heal info is called there is a very good
+ * chance for on disk sizes to mismatch eventhough nothing is wrong
+ * we don't need ondisk size check there. But if the file is either
+ * self-locked or the caller wants a thorough check then make sure to
+ * perform on disk check also. */
ret = ec_heal_data_find_direction (ec, replies, data_versions,
- meta_versions, dirty, size,
- sources, healed_sinks,
+ dirty, size, sources, healed_sinks,
+ self_locked || thorough,
EC_COMBINE_XDATA);
if (ret < 0 && ret != -EIO) {
goto out;
}
- source_count = EC_COUNT (sources, ec->nodes);
- if (source_count == ec->nodes && lock_count > 0) {
- *need_heal = _gf_false;
- } else {
- *need_heal = _gf_true;
+
+ ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count,
+ need_heal);
+out:
+ return ret;
+}
+
+static int32_t
+ec_need_entry_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, gf_boolean_t *need_heal)
+{
+ uint64_t *dirty = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *healed_sinks = NULL;
+ uint64_t *data_versions = NULL;
+ int ret = 0;
+
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ dirty = alloca0 (ec->nodes * sizeof (*dirty));
+ data_versions = alloca0 (ec->nodes * sizeof (*data_versions));
+
+ ret = ec_heal_entry_find_direction (ec, replies, data_versions,
+ dirty, sources, healed_sinks);
+ if (ret < 0 && ret != -EIO) {
+ goto out;
+ }
+
+ ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count,
+ need_heal);
+out:
+ return ret;
+}
+
+static int32_t
+ec_need_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies,
+ int32_t lock_count, gf_boolean_t self_locked,
+ gf_boolean_t thorough, gf_boolean_t *need_heal)
+{
+ int ret = 0;
+
+
+ ret = ec_need_metadata_heal (ec, inode, replies, lock_count,
+ self_locked, thorough, need_heal);
+ if (ret < 0)
+ goto out;
+
+ if (*need_heal)
+ goto out;
+
+ if (inode->ia_type == IA_IFREG) {
+ ret = ec_need_data_heal (ec, inode, replies, lock_count,
+ self_locked, thorough, need_heal);
+ } else if (inode->ia_type == IA_IFDIR) {
+ ret = ec_need_entry_heal (ec, inode, replies, lock_count,
+ self_locked, thorough, need_heal);
}
- ret = source_count;
+
out:
return ret;
}
@@ -2722,6 +2865,7 @@ out:
int32_t
ec_heal_inspect (call_frame_t *frame, ec_t *ec,
inode_t *inode, unsigned char *locked_on,
+ gf_boolean_t self_locked, gf_boolean_t thorough,
gf_boolean_t *need_heal)
{
loc_t loc = {0};
@@ -2742,8 +2886,6 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,
xdata = dict_new ();
if (!xdata ||
- dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT,
- ec->xl->name) ||
dict_set_static_bin (xdata, EC_XATTR_VERSION, zero_array,
sizeof (zero_array)) ||
dict_set_static_bin (xdata, EC_XATTR_DIRTY, zero_array,
@@ -2753,6 +2895,16 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,
ret = -ENOMEM;
goto out;
}
+
+ if (!self_locked) {
+ ret = dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT,
+ ec->xl->name);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
ret = cluster_lookup (ec->xl_list, locked_on, ec->nodes, replies,
output, frame, ec->xl, &loc, xdata);
@@ -2762,6 +2914,9 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,
goto out;
}
+ if (self_locked)
+ goto need_heal;
+
for (i = 0; i < ec->nodes; i++) {
if (!output[i] || !replies[i].xdata) {
continue;
@@ -2771,7 +2926,9 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,
break;
}
}
- ret = ec_need_heal (ec, replies, need_heal, lock_count);
+need_heal:
+ ret = ec_need_heal (ec, inode, replies, lock_count,
+ self_locked, thorough, need_heal);
out:
cluster_replies_wipe (replies, ec->nodes);
@@ -2805,8 +2962,8 @@ ec_heal_locked_inspect (call_frame_t *frame, ec_t *ec, inode_t *inode,
*need_heal = _gf_true;
goto unlock;
}
- ret = ec_heal_inspect (frame, ec, inode,
- locked_on, need_heal);
+ ret = ec_heal_inspect (frame, ec, inode, locked_on, _gf_true, _gf_true,
+ need_heal);
unlock:
cluster_uninodelk (ec->xl_list, locked_on, ec->nodes,
replies, output, frame, ec->xl,
@@ -2854,9 +3011,9 @@ ec_get_heal_info (xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp)
goto out;
}
- ret = ec_heal_inspect (frame, ec, loc.inode, up_subvols,
- &need_heal);
- if (ret == ec->nodes) {
+ ret = ec_heal_inspect (frame, ec, loc.inode, up_subvols, _gf_false,
+ _gf_false, &need_heal);
+ if (ret == ec->nodes && !need_heal) {
goto set_heal;
}
need_heal = _gf_false;
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
index 195bb6377fa..751d45a63f6 100644
--- a/xlators/cluster/ec/src/ec-helpers.c
+++ b/xlators/cluster/ec/src/ec-helpers.c
@@ -211,8 +211,8 @@ int32_t ec_dict_set_array(dict_t *dict, char *key, uint64_t value[],
}
-int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t value[],
- int32_t size)
+int32_t
+ec_dict_get_array (dict_t *dict, char *key, uint64_t value[], int32_t size)
{
void *ptr;
int32_t len;
@@ -246,11 +246,21 @@ int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t value[],
}
}
- dict_del(dict, key);
-
return 0;
}
+int32_t
+ec_dict_del_array (dict_t *dict, char *key, uint64_t value[], int32_t size)
+{
+ int ret = 0;
+
+ ret = ec_dict_get_array (dict, key, value, size);
+ if (ret == 0)
+ dict_del(dict, key);
+
+ return ret;
+}
+
int32_t ec_dict_set_number(dict_t * dict, char * key, uint64_t value)
{
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
index 0b355bd440e..4d2145c8317 100644
--- a/xlators/cluster/ec/src/ec-helpers.h
+++ b/xlators/cluster/ec/src/ec-helpers.h
@@ -30,6 +30,9 @@ int32_t ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref,
void **ptr);
int32_t ec_dict_set_array(dict_t *dict, char *key,
uint64_t *value, int32_t size);
+int32_t ec_dict_get_array (dict_t *dict, char *key, uint64_t value[],
+ int32_t size);
+
int32_t ec_dict_del_array(dict_t *dict, char *key,
uint64_t *value, int32_t size);
int32_t ec_dict_set_number(dict_t * dict, char * key, uint64_t value);