diff options
| -rw-r--r-- | libglusterfs/src/cluster-syncop.c | 4 | ||||
| -rw-r--r-- | tests/basic/ec/ec-background-heals.t | 2 | ||||
| -rw-r--r-- | tests/basic/ec/heal-info.t | 73 | ||||
| -rw-r--r-- | tests/basic/ec/self-heal.t | 21 | ||||
| -rw-r--r-- | tests/bitrot/bug-1373520.t | 35 | ||||
| -rw-r--r-- | tests/volume.rc | 5 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 66 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-heal.c | 321 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 18 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-helpers.h | 3 | 
11 files changed, 408 insertions, 141 deletions
diff --git a/libglusterfs/src/cluster-syncop.c b/libglusterfs/src/cluster-syncop.c index 10993e6088d..98a46c85e4b 100644 --- a/libglusterfs/src/cluster-syncop.c +++ b/libglusterfs/src/cluster-syncop.c @@ -102,6 +102,10 @@ void  cluster_replies_wipe (default_args_cbk_t *replies, int numsubvols)  {          int i = 0; + +        if (!replies) +                return; +          for (i = 0; i < numsubvols; i++)                  args_cbk_wipe (&replies[i]);          memset (replies, 0, numsubvols * sizeof (*replies)); diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t index f0cabeab61c..eb434908bad 100644 --- a/tests/basic/ec/ec-background-heals.t +++ b/tests/basic/ec/ec-background-heals.t @@ -101,5 +101,3 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0  TEST chown root:root $M0/{a,b,c,d}  EXPECT "0" mount_get_option_value $M0 $V0-disperse-0 heal-waiters  cleanup -#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1419696 -#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1419696 diff --git a/tests/basic/ec/heal-info.t b/tests/basic/ec/heal-info.t new file mode 100644 index 00000000000..7393d22d222 --- /dev/null +++ b/tests/basic/ec/heal-info.t @@ -0,0 +1,73 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks if heal info works as expected or not + +function create_files { +        for i in {21..1000}; +        do +                dd if=/dev/zero of=$M0/$i bs=1M count=1 2>&1 > /dev/null; +        done +        rm -f $M0/lock +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +TEST $CLI volume set $V0 client-log-level DEBUG +TEST $CLI volume heal $V0 disable +TEST $CLI volume start $V0 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0; +# Wait until all 6 childs have been recognized by the ec xlator +EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 + +#heal info should give zero entries to be healed when I/O is going on +dd if=/dev/zero of=$M0/a bs=1M count=2048 & +dd_pid=$! +sleep 3 #Wait for I/O to proceed for some time +EXPECT "^0$" get_pending_heal_count $V0 +kill -9 $dd_pid +touch $M0/lock +create_files & + +total_heal_count=0 +while [ -f $M0/lock ]; +do +        heal_count=$(get_pending_heal_count $V0) +        total_heal_count=$((heal_count+total_heal_count)) +done +EXPECT "^0$" echo $total_heal_count + +#When only data heal is required it should print it +#There is no easy way to create this using commands so assigning xattrs directly +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}0/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}1/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}2/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}3/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000020000000000000000 $B0/${V0}4/1000 +TEST setfattr -n trusted.ec.version -v 0x00000000000000010000000000000000 $B0/${V0}5/1000 +index_path=$B0/${V0}5/.glusterfs/indices/xattrop/$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/${V0}5/1000)) +while [ -f $index_path ]; do :; done +TEST touch $index_path +EXPECT "^1$" get_pending_heal_count $V0 +TEST rm -f $M0/1000 + +#When files/directories need heal test that it prints them +TEST touch $M0/{1..10} +TEST kill_brick $V0 $H0 $B0/${V0}0 +for i in {11..20}; +do +        echo abc > $M0/$i #Data + entry + metadata heal +done +for i in {1..10}; +do +        chmod +x $M0/$i; +done + +EXPECT "^105$" get_pending_heal_count $V0 + +cleanup diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t index 3e3467535fb..7f3486fe27b 100644 --- a/tests/basic/ec/self-heal.t +++ b/tests/basic/ec/self-heal.t @@ -9,6 +9,7 @@ cleanup  function check_mount_dir  { +    getfattr -d -m. -e hex $M0 2>&1 > /dev/null      for i in {1..20}; do          ls -l $M0/ | grep "dir1"          if [ $? -ne 0 ]; then @@ -21,7 +22,7 @@ function check_mount_dir  function check_size  { -    stat $M0/$1 +    cat $M0/$1 2>&1 > /dev/null      for i in "${brick[@]}"; do          res=`stat -c "%s" $i/$1`          if [ "$res" != "$2" ]; then @@ -35,7 +36,7 @@ function check_size  function check_mode  { -    stat $M0/$1 +    getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null      for i in "${brick[@]}"; do          res=`stat -c "%A" $i/$1`          if [ "$res" != "$2" ]; then @@ -49,7 +50,7 @@ function check_mode  function check_date  { -    stat $M0/$1 +    getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null      for i in "${brick[@]}"; do          res=`stat -c "%Y" $i/$1`          if [ "$res" != "$2" ]; then @@ -63,7 +64,7 @@ function check_date  function check_xattr  { -    stat $M0/$1 +    getfattr -d -m. -e hex $M0/$1 2>&1 > /dev/null      for i in "${brick[@]}"; do          getfattr -n $2 $i/$1 2>/dev/null          if [ $? -eq 0 ]; then @@ -77,7 +78,7 @@ function check_xattr  function check_dir  { -    getfattr -m. -d $M0/dir1 +    getfattr -m. -d $M0/dir1 2>&1 > /dev/null      for i in "${brick[@]}"; do          if [ ! -d $i/dir1 ]; then              echo "N" @@ -90,7 +91,7 @@ function check_dir  function check_soft_link  { -    stat $M0/test3 +    getfattr -d -m. -e hex $M0/test3 2>&1 > /dev/null      for i in "${brick[@]}"; do          if [ ! -h $i/test3 ]; then              echo "N" @@ -103,7 +104,7 @@ function check_soft_link  function check_hard_link  { -    stat $M0/test4 +    getfattr -d -m. -e hex $M0/test4 2>&1 > /dev/null      for i in "${brick[@]}"; do          res=`stat -c "%h" $i/test4`          if [ "$res" != "3" ]; then @@ -125,10 +126,14 @@ TESTS_EXPECTED_IN_LOOP=194  TEST glusterd  TEST pidof glusterd  TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +TEST $CLI volume set $V0 client-log-level DEBUG +#Write-behind has a bug where lookup can race over write which leads to size mismatch on the mount after a 'cp' +TEST $CLI volume set $V0 performance.write-behind off  EXPECT "Created" volinfo_field $V0 'Status'  TEST $CLI volume start $V0  EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status' -TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +#direct-io-mode is to make sure 'cat' leads to READ fop which triggers heal +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 --direct-io-mode=yes $M0;  # Wait until all 6 childs have been recognized by the ec xlator  EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 diff --git a/tests/bitrot/bug-1373520.t b/tests/bitrot/bug-1373520.t index 9362b9c516b..96e2891439f 100644 --- a/tests/bitrot/bug-1373520.t +++ b/tests/bitrot/bug-1373520.t @@ -49,37 +49,10 @@ EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" get_bitd_count  #Delete file and all links from backend  TEST rm -rf $(find $B0/${V0}5 -inum $(stat -c %i $B0/${V0}5/FILE1)) -# The test for each file below used to look like this: -#  -#   TEST stat $M0/FILE1 -#   EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" stat $B0/${V0}5/FILE1 -# -# That didn't really work, because EXPECT_WITHIN would bail immediately if -# 'stat' returned an error - which it would if the file wasn't there yet. -# Since changing this, I usually see at least a few retries, and sometimes more -# than twenty, before the check for HL_FILE1 succeeds.  The 'ls' is also -# necessary, to force a name heal as well as data.  With both that and the -# 'stat' on $M0 being done here for every retry, there's no longer any need to -# have them elsewhere. -# -# If we had EW_RETRIES support (https://review.gluster.org/#/c/16451/) we could -# use it here to see how many retries are typical on the machines we use for -# regression, and set an appropriate upper bound.  As of right now, though, -# that support does not exist yet. -ugly_stat () { -	local client_dir=$1 -	local brick_dir=$2 -	local bare_file=$3 - -	ls $client_dir -	stat -c %s $client_dir/$bare_file -	stat -c %s $brick_dir/$bare_file 2> /dev/null || echo "UNKNOWN" -} -  #Access files -EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 FILE1 -EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" ugly_stat $M0 $B0/${V0}5 HL_FILE1 +TEST cat $M0/FILE1 +EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/FILE1 +TEST cat $M0/HL_FILE1 +EXPECT_WITHIN $HEAL_TIMEOUT "$SIZE" path_size $B0/${V0}5/HL_FILE1  cleanup; -#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=1419696 -#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=1419696 diff --git a/tests/volume.rc b/tests/volume.rc index e55a98253b8..9d85c035b15 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -519,6 +519,11 @@ function path_exists {          if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi  } +function path_size { +        local size=$(stat -c %s $1) +        if [ $? -eq 0 ]; then echo $size; else echo ""; fi +} +  function force_umount {          ${UMOUNT_F} $*          if [ $? -eq 0 ]; then echo "Y"; else echo "N"; fi diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index 5e1efe381d4..823922542a0 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -82,10 +82,50 @@ int32_t ec_heal_report(call_frame_t * frame, void * cookie, xlator_t * this,      return 0;  } +static uintptr_t +ec_fop_needs_name_heal (ec_fop_data_t *fop) +{ +        ec_t          *ec         = NULL; +        ec_cbk_data_t *cbk        = NULL; +        ec_cbk_data_t *enoent_cbk = NULL; + +        ec = fop->xl->private; +        if (fop->id != GF_FOP_LOOKUP) +                return 0; + +        if (!fop->loc[0].name || strlen (fop->loc[0].name) == 0) +                return 0; + +        list_for_each_entry(cbk, &fop->cbk_list, list) +        { +                if (cbk->op_ret < 0 && cbk->op_errno == ENOENT) { +                        enoent_cbk = cbk; +                        break; +                } +        } + +        if (!enoent_cbk) +                return 0; + +        return ec->xl_up & ~enoent_cbk->mask; +} +  int32_t ec_fop_needs_heal(ec_fop_data_t *fop)  {      ec_t *ec = fop->xl->private; +    if (fop->lock_count == 0) { +    /* +     * if fop->lock_count is zero that means it saw version mismatch +     * without any locks so it can't be trusted. If we launch a heal +     * based on this it will lead to INODELKs which will affect I/O +     * performance. Considering self-heal-daemon and operations on +     * the inode from client which take locks can still trigger the +     * heal we can choose to not attempt a heal when fop->lock_count +     * is zero. +     */ +            return 0; +    }      return (ec->xl_up & ~(fop->remaining | fop->good)) != 0;  } @@ -95,7 +135,7 @@ void ec_check_status(ec_fop_data_t * fop)      int32_t partial = 0;      char str1[32], str2[32], str3[32], str4[32], str5[32]; -    if (!ec_fop_needs_heal(fop)) { +    if (!ec_fop_needs_name_heal (fop) && !ec_fop_needs_heal(fop)) {          return;      } @@ -108,19 +148,17 @@ void ec_check_status(ec_fop_data_t * fop)          }      } -    if (fop->lock_count > 0) { -            gf_msg (fop->xl->name, GF_LOG_WARNING, 0, -                    EC_MSG_OP_FAIL_ON_SUBVOLS, -                    "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " -                    "remaining=%s, good=%s, bad=%s)", -                    gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, -                    ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), -                    ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), -                    ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), -                    ec_bin(str4, sizeof(str4), fop->good, ec->nodes), -                    ec_bin(str5, sizeof(str5), -                    ec->xl_up & ~(fop->remaining | fop->good), ec->nodes)); -    } +    gf_msg (fop->xl->name, GF_LOG_WARNING, 0, +            EC_MSG_OP_FAIL_ON_SUBVOLS, +            "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " +            "remaining=%s, good=%s, bad=%s)", +            gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, +            ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), +            ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), +            ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), +            ec_bin(str4, sizeof(str4), fop->good, ec->nodes), +            ec_bin(str5, sizeof(str5), +            ec->xl_up & ~(fop->remaining | fop->good), ec->nodes));      if (fop->use_fd)      {          if (fop->fd != NULL) { diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index c532e0e03b0..a03a590402a 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -121,6 +121,7 @@ void ec_handle_healers_done (ec_fop_data_t *fop);  int32_t  ec_heal_inspect (call_frame_t *frame, ec_t *ec,                   inode_t *inode, unsigned char *locked_on, +                 gf_boolean_t self_locked, gf_boolean_t thorough,                   gf_boolean_t *need_heal);  int32_t  ec_get_heal_info (xlator_t *this, loc_t *loc, dict_t **dict); diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index 06a4dbb5841..19ba5787006 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -345,7 +345,7 @@ ec_heal_entry_find_direction (ec_t *ec, default_args_cbk_t *replies,                  if (source == -1)                          source = i; -                ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_VERSION, +                ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_VERSION,                                           xattr, EC_VERSION_SIZE);                  if (ret == 0) {                          versions[i] = xattr[EC_DATA_TXN]; @@ -356,7 +356,7 @@ ec_heal_entry_find_direction (ec_t *ec, default_args_cbk_t *replies,                  }                  memset (xattr, 0, sizeof(xattr)); -                ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_DIRTY, +                ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_DIRTY,                                           xattr, EC_VERSION_SIZE);                  if (ret == 0) {                          dirty[i] = xattr[EC_DATA_TXN]; @@ -453,6 +453,7 @@ out:          loc_wipe (&loc);          return op_ret;  } +  int  ec_heal_metadata_find_direction (ec_t *ec, default_args_cbk_t *replies,                                   uint64_t *versions, uint64_t *dirty, @@ -479,14 +480,14 @@ ec_heal_metadata_find_direction (ec_t *ec, default_args_cbk_t *replies,                          continue;                  if (replies[i].op_ret < 0)                          continue; -                ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_VERSION, +                ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_VERSION,                                           xattr, EC_VERSION_SIZE);                  if (ret == 0) {                          versions[i] = xattr[EC_METADATA_TXN];                  }                  memset (xattr, 0, sizeof (xattr)); -                ret = ec_dict_del_array (replies[i].xdata, EC_XATTR_DIRTY, +                ret = ec_dict_get_array (replies[i].xdata, EC_XATTR_DIRTY,                                           xattr, EC_VERSION_SIZE);                  if (ret == 0) {                          dirty[i] = xattr[EC_METADATA_TXN]; @@ -1496,26 +1497,22 @@ unlock:  /*Find direction for data heal and heal info*/  int  ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies, -                       uint64_t *data_versions,  uint64_t *meta_versions, +                       uint64_t *data_versions,                         uint64_t *dirty, uint64_t *size, unsigned char *sources, -                       unsigned char *healed_sinks, int which) +                       unsigned char *healed_sinks, +                       gf_boolean_t check_ondisksize, int which)  {          uint64_t        xattr[EC_VERSION_SIZE] = {0};          char            version_size[128] = {0};          dict_t          *version_size_db = NULL; -        uint64_t        *m_versions      = NULL;          unsigned char   *same            = NULL;          int             max_same_count   = 0;          int             source           = 0;          int             i                = 0;          int             ret              = 0;          dict_t          *dict            = NULL; +        uint64_t        source_size      = 0; -        if (!meta_versions) { -                m_versions = alloca0 (ec->nodes * sizeof (*m_versions)); -        } else { -                m_versions = meta_versions; -        }          version_size_db = dict_new ();          if (!version_size_db) {                  ret = -ENOMEM; @@ -1530,17 +1527,14 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,                  dict = (which == EC_COMBINE_XDATA) ? replies[i].xdata :                                                       replies[i].xattr; -                ret = ec_dict_del_array (dict, EC_XATTR_VERSION, +                ret = ec_dict_get_array (dict, EC_XATTR_VERSION,                                           xattr, EC_VERSION_SIZE);                  if (ret == 0) {                          data_versions[i] = xattr[EC_DATA_TXN]; -                        if (meta_versions) { -                                m_versions[i] = xattr[EC_METADATA_TXN]; -                        }                  }                  memset (xattr, 0, sizeof (xattr)); -                ret = ec_dict_del_array (dict, EC_XATTR_DIRTY, +                ret = ec_dict_get_array (dict, EC_XATTR_DIRTY,                                           xattr, EC_VERSION_SIZE);                  if (ret == 0) {                          dirty[i] = xattr[EC_DATA_TXN]; @@ -1549,8 +1543,7 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,                                            &size[i]);                  /*Build a db of same metadata and data version and size*/                  snprintf (version_size, sizeof (version_size), -                          "%"PRIu64"-%"PRIu64"-%"PRIu64, data_versions[i], -                          m_versions[i], size[i]); +                          "%"PRIu64"-%"PRIu64, data_versions[i], size[i]);                  ret = dict_get_bin (version_size_db, version_size,                                      (void **)&same); @@ -1581,9 +1574,7 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,                  goto out;          } else {                  snprintf (version_size, sizeof (version_size), -                          "%"PRIu64"-%"PRIu64"-%"PRIu64, -                          data_versions[source], -                          m_versions[source], +                          "%"PRIu64"-%"PRIu64, data_versions[source],                            size[source]);                  ret = dict_get_bin (version_size_db, version_size, @@ -1598,6 +1589,30 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,                  }          } +        /* There could be files with versions, size same but on disk ia_size +         * could be different because of disk crashes, mark them as sinks as +         * well*/ + +        if (check_ondisksize) { +                source_size = ec_adjust_size (ec, size[source], 1); + +                for (i = 0; i < ec->nodes; i++) { +                        if (sources[i]) { +                                if (replies[i].stat.ia_size != source_size) { +                                        sources[i] = 0; +                                        healed_sinks[i] = 1; +                                        max_same_count--; +                                } else { +                                        source = i; +                                } +                        } +                } +                if (max_same_count < ec->fragments) { +                        ret = -EIO; +                        goto out; +                } +        } +          ret = source;  out:          if (version_size_db) @@ -1613,17 +1628,20 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,                          struct iatt *stbuf)  {          default_args_cbk_t *replies = NULL; +        default_args_cbk_t *fstat_replies = NULL;          unsigned char      *output  = NULL; +        unsigned char      *fstat_output  = NULL;          dict_t             *xattrs  = NULL;          uint64_t           zero_array[2] = {0};          int                source   = 0;          int                ret      = 0;          uint64_t           zero_value = 0; -        uint64_t           source_size = 0;          int                i = 0;          EC_REPLIES_ALLOC (replies, ec->nodes); +        EC_REPLIES_ALLOC (fstat_replies, ec->nodes);          output = alloca0(ec->nodes); +        fstat_output = alloca0(ec->nodes);          xattrs = dict_new ();          if (!xattrs ||              dict_set_static_bin (xattrs, EC_XATTR_VERSION, zero_array, @@ -1639,43 +1657,34 @@ __ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,          ret = cluster_fxattrop (ec->xl_list, locked_on, ec->nodes,                                  replies, output, frame, ec->xl, fd,                                  GF_XATTROP_ADD_ARRAY64, xattrs, NULL); + +        ret = cluster_fstat (ec->xl_list, locked_on, ec->nodes, fstat_replies, +                             fstat_output, frame, ec->xl, fd, NULL); + +        for (i = 0; i < ec->nodes; i++) { +                output[i] = output[i] && fstat_output[i]; +                replies[i].valid = output[i]; +                if (output[i]) +                        replies[i].stat = fstat_replies[i].stat; +        } +          if (EC_COUNT (output, ec->nodes) <= ec->fragments) {                  ret = -ENOTCONN;                  goto out;          } -        source = ec_heal_data_find_direction (ec, replies, versions, NULL, +        source = ec_heal_data_find_direction (ec, replies, versions,                                                dirty, size, sources, -                                              healed_sinks, EC_COMBINE_DICT); +                                              healed_sinks, _gf_true, +                                              EC_COMBINE_DICT);          ret = source;          if (ret < 0)                  goto out; -        /* There could be files with versions, size same but on disk ia_size -         * could be different because of disk crashes, mark them as sinks as -         * well*/ -        ret = cluster_fstat (ec->xl_list, locked_on, ec->nodes, replies, -                             output, frame, ec->xl, fd, NULL); -        EC_INTERSECT (sources, sources, output, ec->nodes); -        EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes); -        if (EC_COUNT (sources, ec->nodes) < ec->fragments) { -                ret = -ENOTCONN; -                goto out; -        } - -        source_size = ec_adjust_size (ec, size[source], 1); +        if (stbuf) +                *stbuf = replies[source].stat;          for (i = 0; i < ec->nodes; i++) { -                if (sources[i]) { -                        if (replies[i].stat.ia_size != source_size) { -                                sources[i] = 0; -                                healed_sinks[i] = 1; -                        } else if (stbuf) { -                                source = i; -                                *stbuf = replies[i].stat; -                        } -                } -                  if (healed_sinks[i]) {                          if (replies[i].stat.ia_size)                                  trim[i] = 1; @@ -1692,6 +1701,7 @@ out:          if (xattrs)                  dict_unref (xattrs);          cluster_replies_wipe (replies, ec->nodes); +        cluster_replies_wipe (fstat_replies, ec->nodes);          if (ret < 0) {                  gf_msg_debug (ec->xl->name, 0, "%s: heal failed %s",                          uuid_utoa (fd->inode->gfid), strerror (-ret)); @@ -2346,7 +2356,7 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial)          frame = create_frame (this, this->ctx->pool);          if (!frame) -                return; +                goto out;          ec_owner_set(frame, frame->root);          /*Do heal as root*/ @@ -2360,15 +2370,6 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial)          up_subvols = alloca0(ec->nodes);          ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes); -        ec_heal_inspect (frame, ec, loc->inode, up_subvols, -                         &need_heal); -        if (!need_heal) { -                gf_msg (ec->xl->name, GF_LOG_DEBUG, 0, -                        EC_MSG_HEAL_FAIL, "Heal is not required for : %s ", -                        uuid_utoa(loc->gfid)); -                goto out; -        } -          if (loc->name && strlen (loc->name)) {                  ret = ec_heal_name (frame, ec, loc->parent, (char *)loc->name,                                      participants); @@ -2386,6 +2387,17 @@ ec_heal_do (xlator_t *this, void *data, loc_t *loc, int32_t partial)                  }          } +        /* Mount triggers heal only when it detects that it must need heal, shd +         * triggers heals periodically which need not be thorough*/ +        ec_heal_inspect (frame, ec, loc->inode, up_subvols, _gf_false, +                         !ec->shd.iamshd, &need_heal); +        if (!need_heal) { +                gf_msg (ec->xl->name, GF_LOG_DEBUG, 0, +                        EC_MSG_HEAL_FAIL, "Heal is not required for : %s ", +                        uuid_utoa(loc->gfid)); +                goto out; +        } +          msources = alloca0(ec->nodes);          mhealed_sinks = alloca0(ec->nodes);          ret = ec_heal_metadata (frame, ec, loc->inode, msources, mhealed_sinks); @@ -2425,7 +2437,8 @@ out:                                                                   ec->nodes),                                  mgood & good, mbad & bad, NULL);          } -        STACK_DESTROY (frame->root); +        if (frame) +                STACK_DESTROY (frame->root);          return;  } @@ -2681,40 +2694,170 @@ out:          return ret;  } -int32_t -ec_need_heal (ec_t *ec, default_args_cbk_t *replies, -              gf_boolean_t *need_heal, int32_t lock_count) +static int32_t +_need_heal_calculate (ec_t *ec, uint64_t *dirty, unsigned char *sources, +                      gf_boolean_t self_locked, int32_t lock_count, +                      gf_boolean_t *need_heal) +{ +        int i = 0; +        int source_count = 0; + +        source_count = EC_COUNT (sources, ec->nodes); +        if (source_count == ec->nodes) { +                *need_heal = _gf_false; +                if (self_locked || lock_count == 0) { +                        for (i = 0; i < ec->nodes; i++) { +                                if (dirty[i]) { +                                        *need_heal = _gf_true; +                                        goto out; +                                } +                        } +                } else { +                        for (i = 0; i < ec->nodes; i++) { +                                /* Since each lock can only increment the dirty +                                 * count once, if dirty is > 1 it means that +                                 * another operation has left the dirty count +                                 * set and this indicates a problem in the +                                 * inode.*/ +                                if (dirty[i] > 1) { +                                        *need_heal = _gf_true; +                                        goto out; +                                } +                        } +                } +        } else { +                *need_heal = _gf_true; +        } + +out: +        return source_count; +} + +static int32_t +ec_need_metadata_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies, +                       int32_t lock_count, gf_boolean_t self_locked, +                       gf_boolean_t thorough, gf_boolean_t *need_heal)  {          uint64_t           *dirty         = NULL;          unsigned char      *sources       = NULL;          unsigned char      *healed_sinks  = NULL; -        uint64_t           *data_versions = NULL;          uint64_t           *meta_versions = NULL; +        int                ret            = 0; +        int                i              = 0; + +        sources = alloca0(ec->nodes); +        healed_sinks = alloca0(ec->nodes); +        dirty = alloca0 (ec->nodes * sizeof (*dirty)); +        meta_versions = alloca0 (ec->nodes * sizeof (*meta_versions)); +        ret = ec_heal_metadata_find_direction (ec, replies, meta_versions, +                                               dirty, sources, healed_sinks); +        if (ret < 0 && ret != -EIO) { +                goto out; +        } + +        ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count, +                                    need_heal); +        if (ret == ec->nodes && !(*need_heal)) { +                for (i = 1; i < ec->nodes; i++) { +                        if (meta_versions[i] != meta_versions[0]) { +                                *need_heal = _gf_true; +                                goto out; +                        } +                } +        } +out: +        return ret; +} + +static int32_t +ec_need_data_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies, +                   int32_t lock_count, gf_boolean_t self_locked, +                   gf_boolean_t thorough, gf_boolean_t *need_heal) +{ +        uint64_t           *dirty         = NULL; +        unsigned char      *sources       = NULL; +        unsigned char      *healed_sinks  = NULL; +        uint64_t           *data_versions = NULL;          uint64_t           *size          = NULL;          int                ret            = 0; -        int                source_count   = 0;          sources = alloca0(ec->nodes);          healed_sinks = alloca0(ec->nodes);          dirty = alloca0 (ec->nodes * sizeof (*dirty)); -        size = alloca0 (ec->nodes * sizeof (*size));          data_versions = alloca0 (ec->nodes * sizeof (*data_versions)); -        meta_versions = alloca0 (ec->nodes * sizeof (*meta_versions)); +        size = alloca0 (ec->nodes * sizeof (*size)); +        /* When dd is going on and heal info is called there is a very good +         * chance for on disk sizes to mismatch eventhough nothing is wrong +         * we don't need ondisk size check there. But if the file is either +         * self-locked or the caller wants a thorough check then make sure to +         * perform on disk check also. */          ret = ec_heal_data_find_direction (ec, replies, data_versions, -                                           meta_versions, dirty, size, -                                           sources, healed_sinks, +                                           dirty, size, sources, healed_sinks, +                                           self_locked || thorough,                                             EC_COMBINE_XDATA);          if (ret < 0 && ret != -EIO) {                  goto out;          } -        source_count = EC_COUNT (sources, ec->nodes); -        if (source_count == ec->nodes && lock_count > 0) { -                *need_heal = _gf_false; -        } else { -                *need_heal = _gf_true; + +        ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count, +                                    need_heal); +out: +        return ret; +} + +static int32_t +ec_need_entry_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies, +                    int32_t lock_count, gf_boolean_t self_locked, +                    gf_boolean_t thorough, gf_boolean_t *need_heal) +{ +        uint64_t           *dirty         = NULL; +        unsigned char      *sources       = NULL; +        unsigned char      *healed_sinks  = NULL; +        uint64_t           *data_versions = NULL; +        int                ret            = 0; + +        sources = alloca0(ec->nodes); +        healed_sinks = alloca0(ec->nodes); +        dirty = alloca0 (ec->nodes * sizeof (*dirty)); +        data_versions = alloca0 (ec->nodes * sizeof (*data_versions)); + +        ret = ec_heal_entry_find_direction (ec, replies, data_versions, +                                           dirty, sources, healed_sinks); +        if (ret < 0 && ret != -EIO) { +                goto out; +        } + +        ret = _need_heal_calculate (ec, dirty, sources, self_locked, lock_count, +                                    need_heal); +out: +        return ret; +} + +static int32_t +ec_need_heal (ec_t *ec, inode_t *inode, default_args_cbk_t *replies, +              int32_t lock_count, gf_boolean_t self_locked, +              gf_boolean_t thorough, gf_boolean_t *need_heal) +{ +        int                ret            = 0; + + +        ret = ec_need_metadata_heal (ec, inode, replies, lock_count, +                                     self_locked, thorough, need_heal); +        if (ret < 0) +                goto out; + +        if (*need_heal) +                goto out; + +        if (inode->ia_type == IA_IFREG) { +                ret = ec_need_data_heal (ec, inode, replies, lock_count, +                                         self_locked, thorough, need_heal); +        } else if (inode->ia_type == IA_IFDIR) { +                ret = ec_need_entry_heal (ec, inode, replies, lock_count, +                                          self_locked, thorough, need_heal);          } -        ret = source_count; +  out:          return ret;  } @@ -2722,6 +2865,7 @@ out:  int32_t  ec_heal_inspect (call_frame_t *frame, ec_t *ec,                   inode_t *inode, unsigned char *locked_on, +                 gf_boolean_t self_locked, gf_boolean_t thorough,                   gf_boolean_t *need_heal)  {          loc_t              loc           = {0}; @@ -2742,8 +2886,6 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,          xdata = dict_new ();          if (!xdata || -            dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, -                         ec->xl->name) ||              dict_set_static_bin (xdata, EC_XATTR_VERSION, zero_array,                                   sizeof (zero_array)) ||              dict_set_static_bin (xdata, EC_XATTR_DIRTY, zero_array, @@ -2753,6 +2895,16 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,                  ret = -ENOMEM;                  goto out;          } + +        if (!self_locked) { +                ret = dict_set_str(xdata, GLUSTERFS_INODELK_DOM_COUNT, +                             ec->xl->name); +                if (ret) { +                        ret = -ENOMEM; +                        goto out; +                } +        } +          ret = cluster_lookup (ec->xl_list, locked_on, ec->nodes, replies,                                output, frame, ec->xl, &loc, xdata); @@ -2762,6 +2914,9 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,                  goto out;          } +        if (self_locked) +                goto need_heal; +          for (i = 0; i < ec->nodes; i++) {                  if (!output[i] || !replies[i].xdata) {                          continue; @@ -2771,7 +2926,9 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,                          break;                  }          } -        ret = ec_need_heal (ec, replies, need_heal, lock_count); +need_heal: +        ret = ec_need_heal (ec, inode, replies, lock_count, +                            self_locked, thorough, need_heal);  out:          cluster_replies_wipe (replies, ec->nodes); @@ -2805,8 +2962,8 @@ ec_heal_locked_inspect (call_frame_t *frame, ec_t *ec, inode_t *inode,                  *need_heal = _gf_true;                  goto unlock;          } -        ret = ec_heal_inspect (frame, ec, inode, -                               locked_on, need_heal); +        ret = ec_heal_inspect (frame, ec, inode, locked_on, _gf_true, _gf_true, +                               need_heal);  unlock:          cluster_uninodelk (ec->xl_list, locked_on, ec->nodes,                             replies, output, frame, ec->xl, @@ -2854,9 +3011,9 @@ ec_get_heal_info (xlator_t *this, loc_t *entry_loc, dict_t **dict_rsp)                          goto out;          } -        ret = ec_heal_inspect (frame, ec, loc.inode, up_subvols, -                               &need_heal); -        if (ret == ec->nodes) { +        ret = ec_heal_inspect (frame, ec, loc.inode, up_subvols, _gf_false, +                               _gf_false, &need_heal); +        if (ret == ec->nodes && !need_heal) {                  goto set_heal;          }          need_heal = _gf_false; diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 195bb6377fa..751d45a63f6 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -211,8 +211,8 @@ int32_t ec_dict_set_array(dict_t *dict, char *key, uint64_t value[],  } -int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t value[], -                          int32_t size) +int32_t +ec_dict_get_array (dict_t *dict, char *key, uint64_t value[], int32_t size)  {      void    *ptr;      int32_t len; @@ -246,11 +246,21 @@ int32_t ec_dict_del_array(dict_t *dict, char *key, uint64_t value[],              }      } -    dict_del(dict, key); -      return 0;  } +int32_t +ec_dict_del_array (dict_t *dict, char *key, uint64_t value[], int32_t size) +{ +    int ret = 0; + +    ret = ec_dict_get_array (dict, key, value, size); +    if (ret == 0) +            dict_del(dict, key); + +    return ret; +} +  int32_t ec_dict_set_number(dict_t * dict, char * key, uint64_t value)  { diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h index 0b355bd440e..4d2145c8317 100644 --- a/xlators/cluster/ec/src/ec-helpers.h +++ b/xlators/cluster/ec/src/ec-helpers.h @@ -30,6 +30,9 @@ int32_t ec_buffer_alloc(xlator_t *xl, size_t size, struct iobref **piobref,                          void **ptr);  int32_t ec_dict_set_array(dict_t *dict, char *key,                            uint64_t *value, int32_t size); +int32_t ec_dict_get_array (dict_t *dict, char *key, uint64_t value[], +                           int32_t size); +  int32_t ec_dict_del_array(dict_t *dict, char *key,                            uint64_t *value, int32_t size);  int32_t ec_dict_set_number(dict_t * dict, char * key, uint64_t value);  | 
