-rw-r--r--  libglusterfs/src/cluster-syncop.h   |  35
-rw-r--r--  xlators/cluster/ec/src/ec-common.h  |   2
-rw-r--r--  xlators/cluster/ec/src/ec-generic.c |   1
-rw-r--r--  xlators/cluster/ec/src/ec-heal.c    | 793
4 files changed, 823 insertions, 8 deletions
diff --git a/libglusterfs/src/cluster-syncop.h b/libglusterfs/src/cluster-syncop.h
index 2c94246ff1f..a681951c27d 100644
--- a/libglusterfs/src/cluster-syncop.h
+++ b/libglusterfs/src/cluster-syncop.h
@@ -121,6 +121,41 @@ cluster_tryentrylk (xlator_t **subvols, unsigned char *on, int numsubvols,
                     call_frame_t *frame, xlator_t *this, char *dom,
                     inode_t *inode, const char *name);
 
+int32_t
+cluster_fxattrop (xlator_t **subvols, unsigned char *on, int numsubvols,
+                  default_args_cbk_t *replies, unsigned char *output,
+                  call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+cluster_fstat (xlator_t **subvols, unsigned char *on, int numsubvols,
+               default_args_cbk_t *replies, unsigned char *output,
+               call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int32_t
+cluster_ftruncate (xlator_t **subvols, unsigned char *on, int numsubvols,
+                   default_args_cbk_t *replies, unsigned char *output,
+                   call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                   dict_t *xdata);
+
+int32_t
+cluster_open (xlator_t **subvols, unsigned char *on, int numsubvols,
+              default_args_cbk_t *replies, unsigned char *output,
+              call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+              fd_t *fd, dict_t *xdata);
+
+int
+cluster_tryinodelk (xlator_t **subvols, unsigned char *on, int numsubvols,
+                    default_args_cbk_t *replies, unsigned char *locked_on,
+                    call_frame_t *frame, xlator_t *this, char *dom,
+                    inode_t *inode, off_t off, size_t size);
+
+int32_t
+cluster_fsetattr (xlator_t **subvols, unsigned char *on, int numsubvols,
+                  default_args_cbk_t *replies, unsigned char *output,
+                  call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  struct iatt *stbuf, int32_t valid, dict_t *xdata);
+
 void
 cluster_replies_wipe (default_args_cbk_t *replies, int num_subvols);
 #endif /* !_CLUSTER_SYNCOP_H */
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index ba009040b71..04f85a43f16 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -31,6 +31,8 @@ typedef enum {
 
 #define EC_FLAG_WAITING_WINDS     0x0010
 
+#define EC_SELFHEAL_BIT 62
+
 #define EC_MINIMUM_ONE   -1
 #define EC_MINIMUM_MIN   -2
 #define EC_MINIMUM_ALL   -3
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
index 4cf5a50ecbd..50169771476 100644
--- a/xlators/cluster/ec/src/ec-generic.c
+++ b/xlators/cluster/ec/src/ec-generic.c
@@ -18,7 +18,6 @@
 #include "ec-fops.h"
 #include "byte-order.h"
 
-#define EC_SELFHEAL_BIT 62
 
 /* FOP: flush */
 int32_t ec_flush_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 1e19cf57e1b..b7b910502f8 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -1052,10 +1052,11 @@ void ec_heal_reopen_fd(ec_heal_t * heal)
     UNLOCK(&inode->lock);
 }
 
-int32_t ec_heal_writev_cbk(call_frame_t * frame, void * cookie,
-                           xlator_t * this, int32_t op_ret, int32_t op_errno,
-                           struct iatt * prebuf, struct iatt * postbuf,
-                           dict_t * xdata)
+int32_t
+ec_heal_writev_cbk (call_frame_t *frame, void *cookie,
+                    xlator_t *this, int32_t op_ret, int32_t op_errno,
+                    struct iatt *prebuf, struct iatt *postbuf,
+                    dict_t *xdata)
 {
     ec_trace("WRITE_CBK", cookie, "ret=%d, errno=%d", op_ret, op_errno);
@@ -1091,7 +1092,7 @@ int32_t ec_heal_readv_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
     return 0;
 }
 
-void ec_heal_data(ec_heal_t * heal)
+void ec_heal_data_block(ec_heal_t *heal)
 {
     ec_trace("DATA", heal->fop, "good=%lX, bad=%lX", heal->good, heal->bad);
@@ -1393,7 +1394,7 @@ ec_manager_heal (ec_fop_data_t * fop, int32_t state)
             return EC_STATE_HEAL_DATA_COPY;
 
         case EC_STATE_HEAL_DATA_COPY:
-            ec_heal_data(heal);
+            ec_heal_data_block(heal);
 
             return EC_STATE_HEAL_DATA_UNLOCK;
@@ -1633,6 +1634,18 @@ ec_mask_to_char_array (uintptr_t mask, unsigned char *array, int numsubvols)
                 array[i] = ((mask >> i) & 1);
 }
 
+uintptr_t
+ec_char_array_to_mask (unsigned char *array, int numsubvols)
+{
+        int       i    = 0;
+        uintptr_t mask = 0;
+
+        for (i = 0; i < numsubvols; i++)
+                if (array[i])
+                        mask |= (1ULL<<i);
+        return mask;
+}
+
 int
 ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
                         uint64_t *versions, uint64_t *dirty,
@@ -1640,8 +1653,8 @@ ec_heal_find_direction (ec_t *ec, ec_txn_t type, default_args_cbk_t *replies,
 {
         void        *ptr        = NULL;
         uint64_t    *value      = NULL;
-        uint64_t    max_version = 0;
         int         source      = -1;
+        uint64_t    max_version = 0;
         int32_t     len         = 0;
         int         ret         = 0;
         int         i           = 0;
@@ -2643,3 +2656,769 @@ out:
                 STACK_DESTROY (frame->root);
         return ret;
 }
+
+/*Data heal*/
+int
+ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
+                             uint64_t *versions, uint64_t *dirty,
+                             uint64_t *size, unsigned char *sources,
+                             unsigned char *healed_sinks)
+{
+        char            version_size[64] = {0};
+        uint64_t        *value           = NULL;
+        dict_t          *version_size_db = NULL;
+        unsigned char   *same            = NULL;
+        void            *ptr             = NULL;
+        int             len              = 0;
+        int             max_same_count   = 0;
+        int             source           = 0;
+        int             i                = 0;
+        int             ret              = 0;
+
+        version_size_db = dict_new ();
+        if (!version_size_db) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (!replies[i].valid)
+                        continue;
+                if (replies[i].op_ret < 0)
+                        continue;
+                ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_VERSION,
+                                            &ptr, &len);
+                if (ret == 0) {
+                        value = ptr;
+                        versions[i] = ntoh64(value[EC_DATA_TXN]);
+                }
+
+                ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_DIRTY,
+                                            &ptr, &len);
+                if (ret == 0) {
+                        value = ptr;
+                        dirty[i] = ntoh64(value[EC_DATA_TXN]);
+                }
+                ret = dict_get_ptr_and_len (replies[i].xattr, EC_XATTR_SIZE,
+                                            &ptr, &len);
+                if (ret == 0) {
+                        value = ptr;
+                        size[i] = ntoh64(*value);
+                }
+                /*Build a db of same version, size*/
+                snprintf (version_size, sizeof (version_size),
+                          "%"PRIu64"-%"PRIu64, versions[i], size[i]);
+                ret = dict_get_bin (version_size_db, version_size,
+                                    (void **)&same);
+                if (ret < 0) {
+                        same = alloca0 (ec->nodes);
+                }
+
+                same[i] = 1;
+                if (max_same_count < EC_COUNT (same, ec->nodes)) {
+                        max_same_count = EC_COUNT (same, ec->nodes);
+                        source = i;
+                }
+
+                if (ret < 0) {
+                        ret = dict_set_static_bin (version_size_db,
+                                                version_size, same, ec->nodes);
+                }
+
+                if (ret < 0) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+        }
+        /* If we don't have ec->fragments number of same version,size it is not
+         * recoverable*/
+        if (max_same_count < ec->fragments) {
+                ret = -EIO;
+                goto out;
+        } else {
+                snprintf (version_size, sizeof (version_size),
+                          "%"PRIu64"-%"PRIu64, versions[source], size[source]);
+                ret = dict_get_bin (version_size_db, version_size,
+                                    (void **)&same);
+                if (ret < 0)
+                        goto out;
+                memcpy (sources, same, ec->nodes);
+                for (i = 0; i < ec->nodes; i++) {
+                        if (replies[i].valid && (replies[i].op_ret == 0) &&
+                            !sources[i])
+                                healed_sinks[i] = 1;
+                }
+        }
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+        ret = source;
+out:
+        if (version_size_db)
+                dict_unref (version_size_db);
+        return ret;
+}
+
+int
+__ec_heal_data_prepare (call_frame_t *frame, ec_t *ec, fd_t *fd,
+                        unsigned char *locked_on, uint64_t *versions,
+                        uint64_t *dirty, uint64_t *size, unsigned char *sources,
+                        unsigned char *healed_sinks, unsigned char *trim,
+                        struct iatt *stbuf)
+{
+        default_args_cbk_t *replies = NULL;
+        unsigned char      *output  = NULL;
+        dict_t             *xattrs  = NULL;
+        uint64_t           zero_array[2] = {0};
+        int                source   = 0;
+        int                ret      = 0;
+        uint64_t           zero_value = 0;
+        uint64_t           source_size = 0;
+        int                i = 0;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        output = alloca0(ec->nodes);
+        xattrs = dict_new ();
+        if (!xattrs ||
+            dict_set_static_bin (xattrs, EC_XATTR_VERSION, zero_array,
+                                 sizeof (zero_array)) ||
+            dict_set_static_bin (xattrs, EC_XATTR_DIRTY, zero_array,
+                                 sizeof (zero_array)) ||
+            dict_set_static_bin (xattrs, EC_XATTR_SIZE, &zero_value,
+                                 sizeof (zero_value))) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ret = cluster_fxattrop (ec->xl_list, locked_on, ec->nodes,
+                                replies, output, frame, ec->xl, fd,
+                                GF_XATTROP_ADD_ARRAY64, xattrs, NULL);
+        if (EC_COUNT (output, ec->nodes) <= ec->fragments) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        source = ec_heal_data_find_direction (ec, replies, versions, dirty,
+                                              size, sources, healed_sinks);
+        ret = source;
+        if (ret < 0)
+                goto out;
+
+        /* There could be files with versions, size same but on disk ia_size
+         * could be different because of disk crashes, mark them as sinks as
+         * well*/
+        ret = cluster_fstat (ec->xl_list, locked_on, ec->nodes, replies,
+                             output, frame, ec->xl, fd, NULL);
+        EC_INTERSECT (sources, sources, output, ec->nodes);
+        EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
+        if ((EC_COUNT (sources, ec->nodes) < ec->fragments) ||
+            (EC_COUNT (healed_sinks, ec->nodes) == 0)) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        source_size = ec_adjust_size (ec, size[source], 1);
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (sources[i]) {
+                        if (replies[i].stat.ia_size != source_size) {
+                                sources[i] = 0;
+                                healed_sinks[i] = 1;
+                        } else if (stbuf) {
+                                *stbuf = replies[i].stat;
+                        }
+                }
+
+                if (healed_sinks[i]) {
+                        if (replies[i].stat.ia_size)
+                                trim[i] = 1;
+                }
+        }
+
+        if (EC_COUNT(sources, ec->nodes) < ec->fragments) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        ret = source;
+out:
+        if (xattrs)
+                dict_unref (xattrs);
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+int
+__ec_heal_mark_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
+                      uint64_t *versions, unsigned char *healed_sinks)
+{
+        int                i        = 0;
+        int                ret      = 0;
+        unsigned char      *mark    = NULL;
+        dict_t             *xattrs  = NULL;
+        default_args_cbk_t *replies = NULL;
+        unsigned char      *output  = NULL;
+        uint64_t versions_xattr[2]  = {0};
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        xattrs = dict_new ();
+        if (!xattrs) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        mark = alloca0 (ec->nodes);
+        for (i = 0; i < ec->nodes; i++) {
+                if (!healed_sinks[i])
+                        continue;
+                if ((versions[i] >> EC_SELFHEAL_BIT) & 1)
+                        continue;
+                mark[i] = 1;
+        }
+
+        if (EC_COUNT (mark, ec->nodes) == 0)
+                return 0;
+
+        versions_xattr[EC_DATA_TXN] = hton64(1ULL<<EC_SELFHEAL_BIT);
+        if (dict_set_static_bin (xattrs, EC_XATTR_VERSION, versions_xattr,
+                                 sizeof (versions_xattr))) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        output = alloca0 (ec->nodes);
+        ret = cluster_fxattrop (ec->xl_list, mark, ec->nodes,
+                                replies, output, frame, ec->xl, fd,
+                                GF_XATTROP_ADD_ARRAY64, xattrs, NULL);
+        for (i = 0; i < ec->nodes; i++) {
+                if (!output[i]) {
+                        if (mark[i])
+                                healed_sinks[i] = 0;
+                        continue;
+                }
+                versions[i] |= (1ULL<<EC_SELFHEAL_BIT);
+        }
+
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+        ret = 0;
+
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        if (xattrs)
+                dict_unref (xattrs);
+        return ret;
+}
+
+int32_t
+ec_manager_heal_block (ec_fop_data_t *fop, int32_t state)
+{
+    ec_heal_t *heal = fop->data;
+    heal->fop = fop;
+
+    switch (state) {
+    case EC_STATE_INIT:
+        ec_owner_set(fop->frame, fop->frame->root);
+
+        ec_heal_inodelk(heal, F_WRLCK, 1, 0, 0);
+
+        return EC_STATE_HEAL_DATA_COPY;
+
+    case EC_STATE_HEAL_DATA_COPY:
+        ec_heal_data_block (heal);
+
+        return EC_STATE_HEAL_DATA_UNLOCK;
+
+    case -EC_STATE_HEAL_DATA_COPY:
+    case -EC_STATE_HEAL_DATA_UNLOCK:
+    case EC_STATE_HEAL_DATA_UNLOCK:
+        ec_heal_inodelk(heal, F_UNLCK, 1, 0, 0);
+
+        if (state < 0)
+                return -EC_STATE_REPORT;
+        else
+                return EC_STATE_REPORT;
+
+    case EC_STATE_REPORT:
+        if (fop->cbks.heal) {
+            fop->cbks.heal (fop->req_frame, fop, fop->xl, 0,
+                            0, (heal->good | heal->bad),
+                            heal->good, heal->bad, NULL);
+        }
+
+        return EC_STATE_END;
+    case -EC_STATE_REPORT:
+        if (fop->cbks.heal) {
+            fop->cbks.heal (fop->req_frame, fop, fop->xl, -1,
+                            EIO, 0, 0, 0, NULL);
+        }
+
+        return -EC_STATE_END;
+    default:
+        gf_log(fop->xl->name, GF_LOG_ERROR, "Unhandled state %d for %s",
+               state, ec_fop_name(fop->id));
+
+        return EC_STATE_END;
+    }
+}
+
+/*Takes lock */
+void
+ec_heal_block (call_frame_t *frame, xlator_t *this, uintptr_t target,
+              int32_t minimum, fop_heal_cbk_t func, ec_heal_t *heal)
+{
+    ec_cbk_t callback = { .heal = func };
+    ec_fop_data_t *fop = NULL;
+    int32_t error = EIO;
+
+    gf_log("ec", GF_LOG_TRACE, "EC(HEAL) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate (frame, this, EC_FOP_HEAL,
+                                EC_FLAG_UPDATE_LOC_INODE, target, minimum,
+                                ec_wind_heal, ec_manager_heal_block, callback,
+                                heal);
+    if (fop == NULL)
+        goto out;
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else {
+        func(frame, NULL, this, -1, EIO, 0, 0, 0, NULL);
+    }
+}
+
+int32_t
+ec_heal_block_done (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
+                    uintptr_t good, uintptr_t bad, dict_t *xdata)
+{
+        ec_fop_data_t *fop = cookie;
+        ec_heal_t *heal = fop->data;
+
+        fop->heal = NULL;
+        heal->fop = NULL;
+        syncbarrier_wake (heal->data);
+        return 0;
+}
+
+int
+ec_sync_heal_block (call_frame_t *frame, xlator_t *this, ec_heal_t *heal)
+{
+        ec_heal_block (frame, this, heal->bad|heal->good, EC_MINIMUM_ONE,
+                       ec_heal_block_done, heal);
+        syncbarrier_wait (heal->data, 1);
+        if (heal->bad == 0)
+                return -ENOTCONN;
+        return 0;
+}
+
+int
+ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
+                 unsigned char *sources, unsigned char *healed_sinks)
+{
+        ec_heal_t        *heal = NULL;
+        int              ret = 0;
+        syncbarrier_t    barrier;
+        struct iobuf_pool *pool = NULL;
+
+        if (syncbarrier_init (&barrier))
+                return -ENOMEM;
+
+        heal = alloca0(sizeof (*heal));
+        heal->fd = fd_ref (fd);
+        heal->xl = ec->xl;
+        heal->data = &barrier;
+        syncbarrier_init (heal->data);
+        pool = ec->xl->ctx->iobuf_pool;
+        heal->size = iobpool_default_pagesize (pool);
+        heal->bad       = ec_char_array_to_mask (healed_sinks, ec->nodes);
+        heal->good      = ec_char_array_to_mask (sources, ec->nodes);
+        heal->iatt.ia_type = IA_IFREG;
+        LOCK_INIT(&heal->lock);
+
+        for (heal->offset = 0; (heal->offset < size) && !heal->done;
+                                                   heal->offset += heal->size) {
+                ret = ec_sync_heal_block (frame, ec->xl, heal);
+                if (ret < 0)
+                        break;
+
+        }
+        fd_unref (heal->fd);
+        LOCK_DESTROY (&heal->lock);
+        syncbarrier_destroy (heal->data);
+        return ret;
+}
+
+int
+__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
+                      unsigned char *healed_sinks, unsigned char *trim)
+{
+        default_args_cbk_t *replies = NULL;
+        unsigned char      *output  = NULL;
+        int                ret      = 0;
+        int                i        = 0;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        output       = alloca0 (ec->nodes);
+
+        if (EC_COUNT (trim, ec->nodes) == 0) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
+                                 frame, ec->xl, fd, 0, NULL);
+        for (i = 0; i < ec->nodes; i++) {
+                if (!output[i] && trim[i])
+                        healed_sinks[i] = 0;
+        }
+
+        if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+int
+ec_data_undo_pending (call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
+                      uint64_t *versions, uint64_t *dirty, uint64_t *size,
+                      int source, gf_boolean_t erase_dirty, int idx)
+{
+        uint64_t versions_xattr[2] = {0};
+        uint64_t dirty_xattr[2]    = {0};
+        uint64_t allzero[2]        = {0};
+        uint64_t size_xattr        = 0;
+        int      ret               = 0;
+
+        versions_xattr[EC_DATA_TXN] = hton64(versions[source] - versions[idx]);
+        ret = dict_set_static_bin (xattr, EC_XATTR_VERSION,
+                                   versions_xattr,
+                                   sizeof (versions_xattr));
+        if (ret < 0)
+                goto out;
+
+        size_xattr = hton64(size[source] - size[idx]);
+        ret = dict_set_static_bin (xattr, EC_XATTR_SIZE,
+                                   &size_xattr, sizeof (size_xattr));
+        if (ret < 0)
+                goto out;
+
+        if (erase_dirty) {
+                dirty_xattr[EC_DATA_TXN] = hton64(-dirty[idx]);
+                ret = dict_set_static_bin (xattr, EC_XATTR_DIRTY,
+                                           dirty_xattr,
+                                           sizeof (dirty_xattr));
+                if (ret < 0)
+                        goto out;
+        }
+
+        if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&
+            (memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0) &&
+             (size == 0)) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = syncop_fxattrop (ec->xl_list[idx], fd,
+                               GF_XATTROP_ADD_ARRAY64, xattr, NULL, NULL);
+out:
+        return ret;
+}
+
+int
+__ec_fd_data_adjust_versions (call_frame_t *frame, ec_t *ec, fd_t *fd,
+                            unsigned char *sources, unsigned char *healed_sinks,
+                            uint64_t *versions, uint64_t *dirty, uint64_t *size)
+{
+        dict_t                     *xattr            = NULL;
+        int                        i                 = 0;
+        int                        ret               = 0;
+        int                        op_ret            = 0;
+        int                        source            = -1;
+        gf_boolean_t               erase_dirty       = _gf_false;
+
+        xattr = dict_new ();
+        if (!xattr) {
+                op_ret = -ENOMEM;
+                goto out;
+        }
+
+        /* dirty xattr represents if the file needs heal. Unless all the
+         * copies are healed, don't erase it */
+        if (EC_COUNT (sources, ec->nodes) +
+            EC_COUNT (healed_sinks, ec->nodes) == ec->nodes)
+                erase_dirty = _gf_true;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (sources[i]) {
+                        source = i;
+                        break;
+                }
+        }
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (healed_sinks[i]) {
+                        ret = ec_data_undo_pending (frame, ec, fd, xattr,
+                                                    versions, dirty, size,
+                                                    source, erase_dirty, i);
+                        if (ret < 0)
+                                goto out;
+                }
+
+        }
+
+        if (!erase_dirty)
+                goto out;
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (sources[i]) {
+                        ret = ec_data_undo_pending (frame, ec, fd, xattr,
+                                                    versions, dirty, size,
+                                                    source, erase_dirty, i);
+                        if (ret < 0)
+                                continue;
+                }
+
+        }
+out:
+        if (xattr)
+                dict_unref (xattr);
+        return op_ret;
+}
+
+int
+ec_restore_time_and_adjust_versions (call_frame_t *frame, ec_t *ec, fd_t *fd,
+                                     unsigned char *sources,
+                                     unsigned char *healed_sinks,
+                                     uint64_t *versions, uint64_t *dirty,
+                                     uint64_t *size)
+{
+        unsigned char      *locked_on           = NULL;
+        unsigned char      *participants        = NULL;
+        unsigned char      *output              = NULL;
+        default_args_cbk_t *replies             = NULL;
+        unsigned char      *postsh_sources      = NULL;
+        unsigned char      *postsh_healed_sinks = NULL;
+        unsigned char      *postsh_trim         = NULL;
+        uint64_t           *postsh_versions     = NULL;
+        uint64_t           *postsh_dirty        = NULL;
+        uint64_t           *postsh_size         = NULL;
+        int                ret                  = 0;
+        int                i                    = 0;
+        struct iatt        source_buf           = {0};
+        loc_t              loc                  = {0};
+
+        locked_on           = alloca0(ec->nodes);
+        output              = alloca0(ec->nodes);
+        participants        = alloca0(ec->nodes);
+        postsh_sources      = alloca0(ec->nodes);
+        postsh_healed_sinks = alloca0(ec->nodes);
+        postsh_trim         = alloca0(ec->nodes);
+        postsh_versions     = alloca0(ec->nodes * sizeof (*postsh_versions));
+        postsh_dirty        = alloca0(ec->nodes * sizeof (*postsh_dirty));
+        postsh_size         = alloca0(ec->nodes * sizeof (*postsh_size));
+
+        for (i = 0; i < ec->nodes; i++) {
+                if (healed_sinks[i] || sources[i])
+                        participants[i] = 1;
+        }
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        ret = cluster_inodelk (ec->xl_list, participants, ec->nodes, replies,
+                               locked_on, frame, ec->xl, ec->xl->name,
+                               fd->inode, 0, 0);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (fd->inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+
+                ret = __ec_heal_data_prepare (frame, ec, fd, locked_on,
+                                              postsh_versions, postsh_dirty,
+                                              postsh_size, postsh_sources,
+                                              postsh_healed_sinks, postsh_trim,
+                                              &source_buf);
+                if (ret < 0)
+                        goto unlock;
+
+                loc.inode = inode_ref (fd->inode);
+                gf_uuid_copy (loc.gfid, fd->inode->gfid);
+                ret = cluster_setattr (ec->xl_list, healed_sinks, ec->nodes,
+                                       replies, output, frame, ec->xl, &loc,
+                                       &source_buf,
+                                       GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME,
+                                       NULL);
+                EC_INTERSECT (healed_sinks, healed_sinks, output, ec->nodes);
+                if (EC_COUNT (healed_sinks, ec->nodes) == 0) {
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                ret = __ec_fd_data_adjust_versions (frame, ec, fd, sources,
+                                           healed_sinks, versions, dirty, size);
+        }
+unlock:
+        cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, ec->xl->name, fd->inode, 0, 0);
+        cluster_replies_wipe (replies, ec->nodes);
+        loc_wipe (&loc);
+        return ret;
+}
+
+int
+__ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on)
+{
+        unsigned char      *locked_on    = NULL;
+        unsigned char      *output       = NULL;
+        uint64_t           *versions     = NULL;
+        uint64_t           *dirty        = NULL;
+        uint64_t           *size         = NULL;
+        unsigned char      *sources      = NULL;
+        unsigned char      *healed_sinks = NULL;
+        unsigned char      *trim         = NULL;
+        default_args_cbk_t *replies      = NULL;
+        int                ret           = 0;
+        int                source        = 0;
+
+        locked_on    = alloca0(ec->nodes);
+        output       = alloca0(ec->nodes);
+        sources      = alloca0 (ec->nodes);
+        healed_sinks = alloca0 (ec->nodes);
+        trim         = alloca0 (ec->nodes);
+        versions     = alloca0 (ec->nodes * sizeof (*versions));
+        dirty        = alloca0 (ec->nodes * sizeof (*dirty));
+        size         = alloca0 (ec->nodes * sizeof (*size));
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+        ret = cluster_inodelk (ec->xl_list, heal_on, ec->nodes, replies,
+                               locked_on, frame, ec->xl, ec->xl->name,
+                               fd->inode, 0, 0);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (fd->inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+
+                ret = __ec_heal_data_prepare (frame, ec, fd, locked_on,
+                                              versions, dirty, size, sources,
+                                              healed_sinks, trim, NULL);
+                if (ret < 0)
+                        goto unlock;
+
+                source = ret;
+                ret = __ec_heal_mark_sinks (frame, ec, fd, versions,
+                                            healed_sinks);
+                if (ret < 0)
+                        goto unlock;
+
+                ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks, trim);
+        }
+unlock:
+        cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, ec->xl->name, fd->inode, 0, 0);
+        if (ret < 0)
+                goto out;
+
+        ret = ec_rebuild_data (frame, ec, fd, size[source], sources,
+                               healed_sinks);
+        if (ret < 0)
+                goto out;
+
+        ret = ec_restore_time_and_adjust_versions (frame, ec, fd, sources,
+                                                   healed_sinks, versions,
+                                                   dirty, size);
+out:
+        cluster_replies_wipe (replies, ec->nodes);
+        return ret;
+}
+
+int
+ec_heal_data2 (call_frame_t *req_frame, ec_t *ec, inode_t *inode)
+{
+        unsigned char      *locked_on            = NULL;
+        unsigned char      *up_subvols           = NULL;
+        unsigned char      *output               = NULL;
+        default_args_cbk_t *replies              = NULL;
+        call_frame_t       *frame                = NULL;
+        fd_t               *fd                   = NULL;
+        loc_t               loc                  = {0};
+        char               selfheal_domain[1024] = {0};
+        int                ret                   = 0;
+
+        EC_REPLIES_ALLOC (replies, ec->nodes);
+
+        locked_on  = alloca0(ec->nodes);
+        output     = alloca0(ec->nodes);
+        up_subvols = alloca0(ec->nodes);
+        loc.inode = inode_ref (inode);
+        gf_uuid_copy (loc.gfid, inode->gfid);
+
+        fd = fd_create (inode, 0);
+        if (!fd) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ec_mask_to_char_array (ec->xl_up, up_subvols, ec->nodes);
+        frame = copy_frame (req_frame);
+        if (!frame) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        /*Do heal as root*/
+        frame->root->uid = 0;
+        frame->root->gid = 0;
+
+        ret = cluster_open (ec->xl_list, up_subvols, ec->nodes, replies, output,
+                            frame, ec->xl, &loc, O_RDWR|O_LARGEFILE, fd, NULL);
+        if (ret <= ec->fragments) {
+                ret = -ENOTCONN;
+                goto out;
+        }
+
+        fd_bind (fd);
+        sprintf (selfheal_domain, "%s:self-heal", ec->xl->name);
+        /*If other processes are already doing the heal, don't block*/
+        ret = cluster_tryinodelk (ec->xl_list, output, ec->nodes, replies,
+                               locked_on, frame, ec->xl, selfheal_domain, inode,
+                               0, 0);
+        {
+                if (ret <= ec->fragments) {
+                        gf_log (ec->xl->name, GF_LOG_DEBUG, "%s: Skipping heal "
+                                "as only %d number of subvolumes could "
+                                "be locked", uuid_utoa (inode->gfid), ret);
+                        ret = -ENOTCONN;
+                        goto unlock;
+                }
+                ret = __ec_heal_data (frame, ec, fd, locked_on);
+        }
+unlock:
+        cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
+                           frame, ec->xl, selfheal_domain, inode, 0, 0);
+out:
+        if (fd)
+                fd_unref (fd);
+        loc_wipe (&loc);
+        cluster_replies_wipe (replies, ec->nodes);
+        if (frame)
+                STACK_DESTROY (frame->root);
+        return ret;
+}
