diff options
author | Pranith Kumar K <pkarampu@redhat.com> | 2017-01-25 15:31:44 +0530 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2017-02-26 22:06:55 -0500 |
commit | c1fc1fc9cb5a13e6ddf8c9270deb0c7609333540 (patch) | |
tree | a3876aa8a0c1b087429ba916c9380b90bcda6b72 | |
parent | 4638dfc1fee80f9338f2941f3cccb17bec63989a (diff) |
cluster/ec: Don't trigger data/metadata heal on Lookups
Problem-1
If Lookup which doesn't take any locks observes version mismatch it can't be
trusted. If we launch a heal based on this information it will lead to
self-heals which will affect I/O performance in the cases where Lookup is
wrong. Considering self-heal-daemon and operations on the inode from client
which take locks can still trigger heal we can choose to not attempt a heal on
Lookup.
Problem-2:
Fixed spurious failure of
tests/bitrot/bug-1373520.t
For the issues above, what was happening was that ec_heal_inspect()
is preventing 'name' heal to happen
Problem-3:
tests/basic/ec/ec-background-heals.t
To be honest I don't know what the problem was, while fixing
the 2 problems above, I made some changes to ec_heal_inspect() and
ec_need_heal() after which when I tried to recreate the spurious
failure it just didn't happen even after a long time.
BUG: 1414287
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Change-Id: Ife2535e1d0b267712973673f6d474e288f3c6834
Reviewed-on: https://review.gluster.org/16468
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
-rw-r--r-- | libglusterfs/src/cluster-syncop.c | 4 | ||||
-rw-r--r-- | tests/basic/ec/ec-background-heals.t | 2 | ||||
-rw-r--r-- | tests/basic/ec/heal-info.t | 73 | ||||
-rw-r--r-- | tests/basic/ec/self-heal.t | 21 | ||||
-rw-r--r-- | tests/bitrot/bug-1373520.t | 37 | ||||
-rw-r--r-- | tests/volume.rc | 5 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 66 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 1 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-heal.c | 321 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 18 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-helpers.h | 3 |
11 files changed, 408 insertions, 143 deletions
diff --git a/libglusterfs/src/cluster-syncop.c b/libglusterfs/src/cluster-syncop.c index 10993e6088d..98a46c85e4b 100644 --- a/libglusterfs/src/cluster-syncop.c +++ b/libglusterfs/src/cluster-syncop.c @@ -102,6 +102,10 @@ void cluster_replies_wipe (default_args_cbk_t *replies, int numsubvols) { int i = 0; + + if (!replies) + return; + for (i = 0; i < numsubvols; i++) args_cbk_wipe (&replies[i]); memset (replies, 0, numsubvols * sizeof (*replies)); diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t index e99178d224e..eb434908bad 100644 --- a/tests/basic/ec/ec-background-heals.t +++ b/tests/basic/ec/ec-background-heals.t @@ -101,5 +101,3 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 TEST chown root:root $M0/{a,b,c,d} EXPECT "0" mount_get_option_value $M0 $V0-disperse-0 heal-waiters cleanup -#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1416689 -#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1416689 diff --git a/tests/basic/ec/heal-info.t b/tests/basic/ec/heal-info.t new file mode 100644 index 00000000000..7393d22d222 --- /dev/null +++ b/tests/basic/ec/heal-info.t @@ -0,0 +1,73 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks if heal info works as expected or not + +function create_files { + for i in {21..1000}; + do + dd if=/dev/zero of=$M0/$i bs=1M count=1 2>&1 > /dev/null; + done + rm -f $M0/lock +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +TEST $CLI volume set $V0 client-log-level DEBUG +TEST $CLI volume heal $V0 disable +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0; +# Wait until all 6 childs have been recognized by the ec xlator +EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 + +#heal info should give zero entries to be healed when I/O is going on +dd if=/dev/zero of=$M0/a bs=1M count=2048 & +dd_pid=$! +sleep 3 #Wait for I/O to proceed for some time +EXPECT "^0$" get_pending_heal_count $V0 +kill -9 $dd_pid +touch $M0/lock +create_files & + +total_heal_count=0 +while [ -f $M0/lock ]; +do + heal_count=$(get_pending_heal_count $V0) + total_heal_count=$((heal_count+total_heal_count)) +done +EXPECT "^0$" echo $total_heal_count + +#When only data heal is required it should print it +#There is no easy way to create this using commands so assigning xattrs directly +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}0/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}1/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}2/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}3/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}4/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000010000000000000000 $B0/${V0}5/1000 +index_path=$B0/${V0}5/.glusterfs/indices/xattrop/$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}5/1000)) +while [ -f $index_path ]; do :; done +TEST touch $index_path +EXPECT "^1$" get_pending_heal_count $V0 +TEST rm -f $M0/1000 + +#When files/directories need heal test that it prints them +TEST touch $M0/{1..10} +TEST kill_brick $V0 $H0 $B0/${V0}0 +for i in {11..20}; +do + echo abc > $M0/$i #Data + entry + metadata heal +done +for i in {1..10}; +do + chmod +x $M0/$i; +done + +EXPECT "^105$" get_pending_heal_count $V0 + +cleanup diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t index 3e3467535fb..7f3486fe27b 100644 --- a/tests/basic/ec/self-heal.t +++ b/tests/basic/ec/self-heal.t @@ -9,6 +9,7 @@ cleanup function check_mount_dir { + getfattr -d -m. -e hex $M0 2>&1 > /dev/null for i in {1..20}; do ls -l $M0/ | grep "dir1" if [ $? -ne 0 ]; then @@ -21,7 +22,7 @@ function check_mount_dir function check_size { - stat $M0/$1 + cat $M0/$1 2>&1 > /dev/null for i in "${brick[@]}"; do res=`stat -c "%s" $i/$1` if [ "$res" != "$2" ]; then @@ -35,7 +36,7 @@ function check_size function check_mode { - stat $M0/$1 + getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null for i in "${brick[@]}"; do res=`stat -c "%A" $i/$1` if [ "$res" != "$2" ]; then @@ -49,7 +50,7 @@ function check_mode function check_date { - stat $M0/$1 + getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null for i in "${brick[@]}"; do res=`stat -c "%Y" $i/$1` if [ "$res" != "$2" ]; then @@ -63,7 +64,7 @@ function check_date function check_xattr { - stat $M0/$1 + getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null for i in "${brick[@]}"; do getfattr -n $2 $i/$1 2>/dev/null if [ $? -eq 0 ]; then @@ -77,7 +78,7 @@ function check_xattr function check_dir { - getfattr -m. -d $M0/dir1 + getfattr -m. -d $M0/dir1 2>&1 > /dev/null for i in "${brick[@]}"; do if [ ! -d $i/dir1 ]; then echo "N" @@ -90,7 +91,7 @@ function check_dir function check_soft_link { - stat $M0/test3 + getfattr -d -m. -e hex $M0/test3 2>&1 > /dev/null for i in "${brick[@]}"; do if [ ! -h $i/test3 ]; then echo "N" @@ -103,7 +104,7 @@ function check_soft_link function check_hard_link { - stat $M0/test4 + getfattr -d -m. -e hex $M0/test4 2>&1 > /dev/null for i in "${brick[@]}"; do res=`stat -c "%h" $i/test4` if [ "$res" != "3" ]; then @@ -125,10 +126,14 @@ TESTS_EXPECTED_IN_LOOP=194 TEST glusterd TEST pidof glusterd TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +TEST $CLI volume set $V0 client-log-level DEBUG +#Write-behind has a bug where lookup can race over write which leads to size mismatch on the mount after a 'cp' +TEST $CLI volume set $V0 performance.write-behind off EXPECT "Created" volinfo_field $V0 'Status' TEST $CLI volume start $V0 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status' -TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +#direct-io-mode is to make sure 'cat' leads to READ fop which triggers heal +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0; # Wait until all 6 childs have been recognized by the ec xlator EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 diff --git a/tests/bitrot/bug-1373520.t b/tests/bitrot/bug-1373520.t index 271bb3de287..225d3b1a9bc 100644 --- a/tests/bitrot/bug-1373520.t +++ b/tests/bitrot/bug-1373520.t @@ -49,39 +49,10 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count #Delete file and all links from backend TEST rm -rf $(find $B0/${V0}5 -inum $(stat -c %i $B0/${V0}5/FILE1)) -# The test for each file below used to look like this: -# -# TEST stat $M0/FILE1 -# EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" stat $B0/${V0}5/FILE1 -# -# That didn't really work, because EXPECT_WITHIN would bail immediately if -# 'stat' returned an error - which it would if the file wasn't there yet. -# Since changing this, I usually see at least a few retries, and sometimes more -# than twenty, before the check for HL_FILE1 succeeds. The 'ls' is also -# necessary, to force a name heal as well as data. With both that and the -# 'stat' on $M0 being done here for every retry, there's no longer any need to -# have them elsewhere. -# -# If we had EW_RETRIES support (https://review.gluster.org/#/c/16451/) we could -# use it here to see how many retries are typical on the machines we use for -# regression, and set an appropriate upper bound. As of right now, though, -# that support does not exist yet. -ugly_stat () { - local client_dir=$1 - local brick_dir=$2 - local bare_file=$3 - - ls $client_dir - stat -c %s $client_dir/$bare_file - stat -c %s $brick_dir/$bare_file 2> /dev/null || echo "UNKNOWN" -} - #Access files -EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 FILE1 -EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 HL_FILE1 +TEST cat $M0/FILE1 +EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/FILE1 +TEST cat $M0/HL_FILE1 +EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/HL_FILE1 cleanup; -#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1417540 -#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1417540 - - diff --git a/tests/volume.rc b/tests/volume.rc index f76f37a44fc..5419e399d8f 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -519,6 +519,11 @@ function path_exists { if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi } +function path_size { + local size=$(stat -c %s $1) + if [ $? -eq 0 ]; then echo $size; else echo ""; fi +} + function force_umount { ${UMOUNT_F} $* if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 5e1efe381d4..823922542a0 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -82,10 +82,50 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this, return 0; } +static uintptr_t +ec_fop_needs_name_heal (ec_fop_data_t *fop) +{ + ec_t *ec = NULL; + ec_cbk_data_t *cbk = NULL; + ec_cbk_data_t *enoent_cbk = NULL; + + ec = fop->xl->private; + if (fop->id != GF_FOP_LOOKUP) + return 0; + + if (!fop->loc[0].name || strlen (fop->loc[0].name) == 0) + return 0; + + list_for_each_entry(cbk, &fop->cbk_list, list) + { + if (cbk->op_ret < 0 && cbk->op_errno == ENOENT) { + enoent_cbk = cbk; + break; + } + } + + if (!enoent_cbk) + return 0; + + return ec->xl_up & ~enoent_cbk->mask; +} + int32_t ec_fop_needs_heal(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; + if (fop->lock_count == 0) { + /* + * if fop->lock_count is zero that means it saw version mismatch + * without any locks so it can't be trusted. If we launch a heal + * based on this it will lead to INODELKs which will affect I/O + * performance. Considering self-heal-daemon and operations on + * the inode from client which take locks can still trigger the + * heal we can choose to not attempt a heal when fop->lock_count + * is zero. + */ + return 0; + } return (ec->xl_up & ~(fop->remaining | fop->good)) != 0; } @@ -95,7 +135,7 @@ void ec_check_status(ec_fop_data_t * fop) int32_t partial = 0; char str1[32], str2[32], str3[32], str4[32], str5[32]; - if (!ec_fop_needs_heal(fop)) { + if (!ec_fop_needs_name_heal (fop) && !ec_fop_needs_heal(fop)) { return; } @@ -108,19 +148,17 @@ void ec_check_status(ec_fop_data_t * fop) } } - if (fop->lock_count > 0) { - gf_msg (fop->xl->name, GF_LOG_WARNING, 0, - EC_MSG_OP_FAIL_ON_SUBVOLS, - "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " - "remaining=%s, good=%s, bad=%s)", - gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, - ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), - ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), - ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), - ec_bin(str4, sizeof(str4), fop->good, ec->nodes), - ec_bin(str5, sizeof(str5), - ec->xl_up & ~(fop->remaining | fop->good), ec->nodes)); - } + gf_msg (fop->xl->name, GF_LOG_WARNING, 0, + EC_MSG_OP_FAIL_ON_SUBVOLS, + "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " + "remaining=%s, good=%s, bad=%s)", + gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), + ec_bin(str4, sizeof(str4), fop->good, ec->nodes), + ec_bin(str5, sizeof(str5), + ec->xl_up & ~(fop->remaining | fop->good), ec->nodes)); if (fop->use_fd) { if (fop->fd != NULL) { diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index c532e0e03b0..a03a590402a 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -121,6 +121,7 @@ void ec_handle_healers_done (ec_fop_data_t *fop); int32_t ec_heal_inspect (call_frame_t *frame, ec_t *ec, inode_t *inode, unsigned char *locked_on, + gf_boolean_t self_locked, gf_boolean_t thorough, gf_boolean_t *need_heal); int32_t ec_get_heal_info (xlator_t *this, loc_t *loc, dict_t **dict); diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 06a4dbb5841..19ba5787006 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -345,7 +345,7 @@ ec_heal_entry_find_direction (ec_t *ec, default_args_cbk_t *replies, if (source == -1) source = i; - ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_VERSION, + ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_VERSION, xattr, EC_VERSION_SIZE); if (ret == 0) { versions[i] = xattr[EC_DATA_TXN]; @@ -356,7 +356,7 @@ ec_heal_entry_find_direction (ec_t *ec, default_args_cbk_t *replies, } memset (xattr, 0, sizeof(xattr)); - ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_DIRTY, + ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_DIRTY, xattr, EC_VERSION_SIZE); if (ret == 0) { dirty[i] = xattr[EC_DATA_TXN]; @@ -453,6 +453,7 @@ out: loc_wipe (&loc); return op_ret; } + int ec_heal_metadata_find_direction (ec_t *ec, default_args_cbk_t *replies, uint64_t *versions, uint64_t *dirty, @@ -479,14 +480,14 @@ ec_heal_metadata_find_direction (ec_t *ec, default_args_cbk_t *replies, continue; if (replies[i].op_ret < 0) continue; - ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_VERSION, + ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_VERSION, xattr, EC_VERSION_SIZE); if (ret == 0) { versions[i] = xattr[EC_METADATA_TXN]; } memset (xattr, 0, sizeof (xattr)); - ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_DIRTY, + ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_DIRTY, xattr, EC_VERSION_SIZE); if (ret == 0) { dirty[i] = xattr[EC_METADATA_TXN]; @@ -1496,26 +1497,22 @@ unlock: /*Find direction for data heal and heal info*/ int ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies, - uint64_t *data_versions, uint64_t *meta_versions, + uint64_t *data_versions, uint64_t *dirty, uint64_t *size, unsigned char *sources, - unsigned char *healed_sinks, int which) + unsigned char *healed_sinks, + gf_boolean_t check_ondisksize, int which) { uint64_t xattr[EC_VERSION_SIZE] = {0}; char version_size[128] = {0}; dict_t *version_size_db = NULL; - uint64_t *m_versions = NULL; unsigned char *same = NULL; int max_same_count = 0; int source = 0; int i = 0; int ret = 0; dict_t *dict = NULL; + uint64_t source_size = 0; - if (!meta_versions) { - m_versions = alloca0 (ec->nodes * sizeof (*m_versions)); - } else { - m_versions = meta_versions; - } version_size_db = dict_new (); if (!version_size_db) { ret = -ENOMEM; @@ -1530,17 +1527,14 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies, dict = (which == EC_COMBINE_XDATA) ? replies[i].xdata : replies[i].xattr; - ret = ec_dict_del_array (dict, EC_XATTR_VERSION, + ret = ec_dict_get_array (dict, EC_XATTR_VERSION, xattr, EC_VERSION_SIZE); if (ret == 0) { data_versions[i] = xattr[EC_DATA_TXN]; - if (meta_versions) { - m_versions[i] = xattr[EC_METADATA_TXN]; - } } memset (xattr, 0, sizeof (xattr)); - ret = ec_dict_del_array (dict, EC_XATTR_DIRTY, + ret = ec_dict_get_array (dict, EC_XATTR_DIRTY, xattr, EC_VERSION_SIZE); if (ret == 0) { dirty[i] = xattr[EC_DATA_TXN]; @@ -1549,8 +1543,7 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies, &size[i]); /*Build a db of same metadata and data version and size*/ snprintf (version_size, sizeof (version_size), - "%"PRIu64"-%"PRIu64"-%"PRIu64, data_versions[i], - m_versions[i], size[i]); + "%"PRIu64"-%"PRIu64, data_versions[i], size[i]); ret = dict_get_bin (version_size_db, version_size, (void **)&same); @@ -1581,9 +1574,7 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies, goto out; } else { snprintf (version_size, sizeof (version_size), - "%"PRIu64"-%"PRIu64"-%"PRIu64, - data_versions[source], - m_versions[source], + "%"PRIu64"-%"PRIu64, data_versions[source], size[source]); ret = dict_get_bin (version_size_db, version_size, @@ -1598,6 +1589,30 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies, } } + /* There could be files with versions, size same but on disk ia_size + * could be different because of disk crashes, mark them as sinks as + * well*/ + + if (check_ondisksize) { + source_size = ec_adjust_size (ec, size[source], 1); + + for (i = 0; i < ec->nodes; i++) { + if (sources[i]) { + if (replies[i].stat.ia_size != source_size) { + sources[i] = 0; + healed_sinks[i] = 1; + max_same_count--; + } else { + source = i; + } + } + } + if (max_same_count < ec->fragments) { + ret = -EIO; + goto out; + } + } + ret = source; out: if (version_size_db) @@ -1613,17 +1628,20 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd, struct iatt *stbuf) { default_args_cbk_t *replies = NULL; + default_args_cbk_t *fstat_replies = NULL; unsigned char *output = NULL; + unsigned char *fstat_output = NULL; dict_t *xattrs = NULL; uint64_t zero_array[2] = {0}; int source = 0; int ret = 0; uint64_t zero_value = 0; - uint64_t source_size = 0; int i = 0; EC_REPLIES_ALLOC (replies, ec->nodes); + EC_REPLIES_ALLOC (fstat_replies, ec->nodes); output = alloca0(ec->nodes); + fstat_output = alloca0(ec->nodes); xattrs = dict_new (); if (!xattrs || dict_set_static_bin (xattrs, EC_XATTR_VERSION, zero_array, @@ -1639,43 +1657,34 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd, ret = cluster_fxattrop (ec->xl_list, locked_on, ec->nodes, replies, output, frame, ec->xl, fd, GF_XATTROP_ADD_ARRAY64, xattrs, NULL); + + ret = cluster_fstat (ec->xl_list, locked_on, ec->nodes, fstat_replies, + fstat_output, frame, ec->xl, fd, NULL); + + for (i = 0; i < ec->nodes; i++) { + output[i] = output[i] && fstat_output[i]; + replies[i].valid = output[i]; + if (output[i]) + replies[i].stat = fstat_replies[i].stat; + } + if (EC_COUNT (output, ec->nodes) <= ec->fragments) { ret = -ENOTCONN; goto out; } - source = ec_heal_data_find_direction (ec, replies, versions, NULL, + source = ec_heal_data_find_direction (ec, replies, versions, dirty, size, sources, - healed_sinks, EC_COMBINE_DICT); + healed_sinks, _gf_true, + EC_COMBINE_DICT); ret = source; if (ret < 0) goto out; - /* There could be files with versions, size same but on disk ia_size - * could be different because of disk crashes, mark them as sinks as - * well*/ - ret = cluster_fstat (ec->xl_list, locked_on, ec->nodes, replies, - output, frame, ec->xl, fd, NULL); - EC_INTERSECT (sources, sources, output, ec->nodes); - EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes); - if (EC_COUNT (sources, ec->nodes) < ec->fragments) { - ret = -ENOTCONN; - goto out; - } - - source_size = ec_adjust_size (ec, size[source], 1); + if (stbuf) + *stbuf = replies[source].stat; for (i = 0; i < ec->nodes; i++) { - if (sources[i]) { - if (replies[i].stat.ia_size != source_size) { - sources[i] = 0; - healed_sinks[i] = 1; - } else if (stbuf) { - source = i; - *stbuf = replies[i].stat; - } - } - if (healed_sinks[i]) { if (replies[i].stat.ia_size) trim[i] = 1; @@ -1692,6 +1701,7 @@ out: if (xattrs) dict_unref (xattrs); cluster_replies_wipe (replies, ec->nodes); + cluster_replies_wipe (fstat_replies, ec->nodes); if (ret < 0) { gf_msg_debug (ec->xl->name, 0, "%s: heal failed %s", uuid_utoa (fd->inode->gfid), strerror (-ret)); @@ -2346,7 +2356,7 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial) frame = create_frame (this, this->ctx->pool); if (!frame) - return; + goto out; ec_owner_set(frame, frame->root); /*Do heal as root*/ @@ -2360,15 +2370,6 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial) up_subvols = alloca0(ec->nodes); ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes); - ec_heal_inspect (frame, ec, loc->inode, up_subvols, - &need_heal); - if (!need_heal) { - gf_msg (ec->xl->name, GF_LOG_DEBUG, 0, - EC_MSG_HEAL_FAIL, "Heal is not required for : %s ", - uuid_utoa(loc->gfid)); - goto out; - } - if (loc->name && strlen (loc->name)) { ret = ec_heal_name (frame, ec, loc->parent, (char *)loc->name, participants); @@ -2386,6 +2387,17 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial) } } + /* Mount triggers heal only when it detects that it must need heal, shd + * triggers heals periodically which need not be thorough*/ + ec_heal_inspect (frame, ec, loc->inode, up_subvols, _gf_false, + !ec->shd.iamshd, &need_heal); + if (!need_heal) { + gf_msg (ec->xl->name, GF_LOG_DEBUG, 0, + EC_MSG_HEAL_FAIL, "Heal is not required for : %s ", + uuid_utoa(loc->gfid)); + goto out; + } + msources = alloca0(ec->nodes); mhealed_sinks = alloca0(ec->nodes); ret = ec_heal_metadata (frame, ec, loc->inode, msources, mhealed_sinks); @@ -2425,7 +2437,8 @@ out: ec->nodes), mgood & good, mbad & bad, NULL); } - STACK_DESTROY (frame->root); + if (frame) + STACK_DESTROY (frame->root); return; } @@ -2681,40 +2694,170 @@ out: return ret; } -int32_t -ec_need_heal (ec_t *ec, default_args_cbk_t *replies, - gf_boolean_t *need_heal, int32_t lock_count) +static int32_t +_need_heal_calculate (ec_t *ec, uint64_t *dirty, unsigned char *sources, + gf_boolean_t self_locked, int32_t lock_count, + gf_boolean_t *need_heal) +{ + int i = 0; + int source_count = 0; + + source_count = EC_COUNT (sources, ec->nodes); + if (source_count == ec->nodes) { + *need_heal = _gf_false; + if (self_locked || lock_count == 0) { + for (i = 0; i < ec->nodes; i++) { + if (dirty[i]) { + *need_heal = _gf_true; + goto out; + } + } + } else { + for (i = 0; i < ec->nodes; i++) { + /* Since each lock can only increment the dirty + * count once, if dirty is > 1 it means that + * another operation has left the dirty count + * set and this indicates a problem in the + * inode.*/ + if (dirty[i] > 1) { + *need_heal = _gf_true; + goto out; + } + } + } + } else { + *need_heal = _gf_true; + } + +out: + return source_count; +} + +static int32_t +ec_need_metadata_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies, + int32_t lock_count, gf_boolean_t self_locked, + gf_boolean_t thorough, gf_boolean_t *need_heal) { uint64_t *dirty = NULL; unsigned char *sources = NULL; unsigned char *healed_sinks = NULL; - uint64_t *data_versions = NULL; uint64_t *meta_versions = NULL; + int ret = 0; + int i = 0; + + sources = alloca0(ec->nodes); + healed_sinks = alloca0(ec->nodes); + dirty = alloca0 (ec->nodes * sizeof (*dirty)); + meta_versions = alloca0 (ec->nodes * sizeof (*meta_versions)); + ret = ec_heal_metadata_find_direction (ec, replies, meta_versions, + dirty, sources, healed_sinks); + if (ret < 0 && ret != -EIO) { + goto out; + } + + ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count, + need_heal); + if (ret == ec->nodes && !(*need_heal)) { + for (i = 1; i < ec->nodes; i++) { + if (meta_versions[i] != meta_versions[0]) { + *need_heal = _gf_true; + goto out; + } + } + } +out: + return ret; +} + +static int32_t +ec_need_data_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies, + int32_t lock_count, gf_boolean_t self_locked, + gf_boolean_t thorough, gf_boolean_t *need_heal) +{ + uint64_t *dirty = NULL; + unsigned char *sources = NULL; + unsigned char *healed_sinks = NULL; + uint64_t *data_versions = NULL; uint64_t *size = NULL; int ret = 0; - int source_count = 0; sources = alloca0(ec->nodes); healed_sinks = alloca0(ec->nodes); dirty = alloca0 (ec->nodes * sizeof (*dirty)); - size = alloca0 (ec->nodes * sizeof (*size)); data_versions = alloca0 (ec->nodes * sizeof (*data_versions)); - meta_versions = alloca0 (ec->nodes * sizeof (*meta_versions)); + size = alloca0 (ec->nodes * sizeof (*size)); + /* When dd is going on and heal info is called there is a very good + * chance for on disk sizes to mismatch eventhough nothing is wrong + * we don't need ondisk size check there. But if the file is either + * self-locked or the caller wants a thorough check then make sure to + * perform on disk check also. */ ret = ec_heal_data_find_direction (ec, replies, data_versions, - meta_versions, dirty, size, - sources, healed_sinks, + dirty, size, sources, healed_sinks, + self_locked || thorough, EC_COMBINE_XDATA); if (ret < 0 && ret != -EIO) { goto out; } - source_count = EC_COUNT (sources, ec->nodes); - if (source_count == ec->nodes && lock_count > 0) { - *need_heal = _gf_false; - } else { - *need_heal = _gf_true; + + ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count, + need_heal); +out: + return ret; +} + +static int32_t +ec_need_entry_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies, + int32_t lock_count, gf_boolean_t self_locked, + gf_boolean_t thorough, gf_boolean_t *need_heal) +{ + uint64_t *dirty = NULL; + unsigned char *sources = NULL; + unsigned char *healed_sinks = NULL; + uint64_t *data_versions = NULL; + int ret = 0; + + sources = alloca0(ec->nodes); + healed_sinks = alloca0(ec->nodes); + dirty = alloca0 (ec->nodes * sizeof (*dirty)); + data_versions = alloca0 (ec->nodes * sizeof (*data_versions)); + + ret = ec_heal_entry_find_direction (ec, replies, data_versions, + dirty, sources, healed_sinks); + if (ret < 0 && ret != -EIO) { + goto out; + } + + ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count, + need_heal); +out: + return ret; +} + +static int32_t +ec_need_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies, + int32_t lock_count, gf_boolean_t self_locked, + gf_boolean_t thorough, gf_boolean_t *need_heal) +{ + int ret = 0; + + + ret = ec_need_metadata_heal (ec, inode, replies, lock_count, + self_locked, thorough, need_heal); + if (ret < 0) + goto out; + + if (*need_heal) + goto out; + + if (inode->ia_type == IA_IFREG) { + ret = ec_need_data_heal (ec, inode, replies, lock_count, + self_locked, thorough, need_heal); + } else if (inode->ia_type == IA_IFDIR) { + ret = ec_need_entry_heal (ec, inode, replies, lock_count, + self_locked, thorough, need_heal); } - ret = source_count; + out: return ret; } @@ -2722,6 +2865,7 @@ out: int32_t ec_heal_inspect (call_frame_t *frame, ec_t *ec, inode_t *inode, unsigned char *locked_on, + gf_boolean_t self_locked, gf_boolean_t thorough, gf_boolean_t *need_heal) { loc_t loc = {0}; @@ -2742,8 +2886,6 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec, xdata = dict_new (); if (!xdata || - dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, - ec->xl->name) || dict_set_static_bin (xdata, EC_XATTR_VERSION, zero_array, sizeof (zero_array)) || dict_set_static_bin (xdata, EC_XATTR_DIRTY, zero_array, @@ -2753,6 +2895,16 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec, ret = -ENOMEM; goto out; } + + if (!self_locked) { + ret = dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, + ec->xl->name); + if (ret) { + ret = -ENOMEM; + goto out; + } + } + ret = cluster_lookup (ec->xl_list, locked_on, ec->nodes, replies, output, frame, ec->xl, &loc, xdata); @@ -2762,6 +2914,9 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec, goto out; } + if (self_locked) + goto need_heal; + for (i = 0; i < ec->nodes; i++) { if (!output[i] || !replies[i].xdata) { continue; @@ -2771,7 +2926,9 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec, break; } } - ret = ec_need_heal (ec, replies, need_heal, lock_count); +need_heal: + ret = ec_need_heal (ec, inode, replies, lock_count, + self_locked, thorough, need_heal); out: cluster_replies_wipe (replies, ec->nodes); @@ -2805,8 +2962,8 @@ ec_heal_locked_inspect (call_frame_t *frame, ec_t *ec, inode_t *inode, *need_heal = _gf_true; goto unlock; } - ret = ec_heal_inspect (frame, ec, inode, - locked_on, need_heal); + ret = ec_heal_inspect (frame, ec, inode, locked_on, _gf_true, _gf_true, + need_heal); unlock: cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output, frame, ec->xl, @@ -2854,9 +3011,9 @@ ec_get_heal_info (xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp) goto out; } - ret = ec_heal_inspect (frame, ec, loc.inode, up_subvols, - &need_heal); - if (ret == ec->nodes) { + ret = ec_heal_inspect (frame, ec, loc.inode, up_subvols, _gf_false, + _gf_false, &need_heal); + if (ret == ec->nodes && !need_heal) { goto set_heal; } need_heal = _gf_false; diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 195bb6377fa..751d45a63f6 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -211,8 +211,8 @@ int32_t ec_dict_set_array(dict_t *dict, char *key, uint64_t value[], } -int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t value[], - int32_t size) +int32_t +ec_dict_get_array (dict_t *dict, char *key, uint64_t value[], int32_t size) { void *ptr; int32_t len; @@ -246,11 +246,21 @@ int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t value[], } } - dict_del(dict, key); - return 0; } +int32_t +ec_dict_del_array (dict_t *dict, char *key, uint64_t value[], int32_t size) +{ + int ret = 0; + + ret = ec_dict_get_array (dict, key, value, size); + if (ret == 0) + dict_del(dict, key); + + return ret; +} + int32_t ec_dict_set_number(dict_t * dict, char * key, uint64_t value) { diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index 0b355bd440e..4d2145c8317 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -30,6 +30,9 @@ int32_t ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref, void **ptr); int32_t ec_dict_set_array(dict_t *dict, char *key, uint64_t *value, int32_t size); +int32_t ec_dict_get_array (dict_t *dict, char *key, uint64_t value[], + int32_t size); + int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t *value, int32_t size); int32_t ec_dict_set_number(dict_t * dict, char * key, uint64_t value); |