diff options
| author | Xavier Hernandez <xhernandez@datalab.es> | 2014-07-14 17:34:04 +0200 | 
|---|---|---|
| committer | Vijay Bellur <vbellur@redhat.com> | 2014-09-16 10:14:28 -0700 | 
| commit | b224dd14b75fb993eec4f44ecf11edce8a6fc42f (patch) | |
| tree | 2a67a96fa981428adaef85d57c408265db50c8f2 | |
| parent | 7fe574039815ad1339851eb0dc9f2366b02ceddf (diff) | |
ec: Optimize read/write performance
This patch significantly improves performance of read/write
operations on a dispersed volume by reusing previous inodelk/
entrylk operations on the same inode/entry. This reduces the
latency of each individual operation considerably.
Inode version and size are also updated when needed instead
of on each request. This gives an additional boost.
This is a backport of http://review.gluster.org/8369/
Change-Id: I4b98d5508c86b53032e16e295f72a3f83fd8fcac
BUG: 1140844
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/8746
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-by: Dan Lambright <dlambrig@redhat.com>
| -rw-r--r-- | xlators/cluster/ec/src/ec-combine.c | 8 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 545 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 17 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.c | 1 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 31 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-dir-write.c | 96 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-generic.c | 96 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 3 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-inode-read.c | 47 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-inode-write.c | 117 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-mem-types.h | 3 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.c | 9 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.h | 1 | 
13 files changed, 706 insertions, 268 deletions
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c index 3d088d9be4a..02b7e6024fd 100644 --- a/xlators/cluster/ec/src/ec-combine.c +++ b/xlators/cluster/ec/src/ec-combine.c @@ -735,7 +735,7 @@ void ec_combine(ec_cbk_data_t * cbk, ec_combine_f combine)      ec_fop_data_t * fop = cbk->fop;      ec_cbk_data_t * ans = NULL, * tmp = NULL;      struct list_head * item = NULL; -    int32_t needed = 0, report = 0; +    int32_t needed = 0, resume = 0;      char str[32];      LOCK(&fop->lock); @@ -776,7 +776,7 @@ void ec_combine(ec_cbk_data_t * cbk, ec_combine_f combine)          ec_update_bad(fop, cbk->mask); -        report = 1; +        resume = 1;      }      ans = list_entry(fop->cbk_list.next, ec_cbk_data_t, list); @@ -788,8 +788,8 @@ void ec_combine(ec_cbk_data_t * cbk, ec_combine_f combine)      {          ec_dispatch_next(fop, cbk->idx);      } -    else if (report) +    else if (resume)      { -        ec_report(fop, 0); +        ec_resume(fop, 0);      }  } diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index a4423d94aa9..ad04e646d68 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -316,20 +316,10 @@ void ec_resume_parent(ec_fop_data_t * fop, int32_t error)      }  } -void ec_report(ec_fop_data_t * fop, int32_t error) -{ -    if (!list_empty(&fop->lock_list)) -    { -        ec_owner_set(fop->frame, fop->frame->root); -    } - -    ec_resume(fop, error); -} -  void ec_complete(ec_fop_data_t * fop)  {      ec_cbk_data_t * cbk = NULL; -    int32_t ready = 0, report = 0; +    int32_t resume = 0;      LOCK(&fop->lock); @@ -351,21 +341,17 @@ void ec_complete(ec_fop_data_t * fop)                  }              } -            report = 1; +            resume = 1;          }          else if ((fop->flags & EC_FLAG_WAITING_WINDS) != 0)          { -            ready = 1; +            resume = 1;          }      }      UNLOCK(&fop->lock); -    if (report) -    { -        ec_report(fop, 0); -    } -    if (ready) +    if (resume)      {          ec_resume(fop, 0);      } @@ -518,7 +504,7 @@ void ec_dispatch_start(ec_fop_data_t * fop)      INIT_LIST_HEAD(&fop->cbk_list); -    if (!list_empty(&fop->lock_list)) +    if (fop->lock_count > 0)      {          ec_owner_copy(fop->frame, &fop->req_frame->root->lk_owner);      } @@ -602,6 +588,7 @@ void ec_dispatch_min(ec_fop_data_t * fop)  ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)  { +    ec_t * ec = xl->private;      ec_lock_t * lock;      if ((loc->inode == NULL) || @@ -613,15 +600,15 @@ ec_lock_t * ec_lock_allocate(xlator_t * xl, int32_t kind, loc_t * loc)          return NULL;      } -    lock = GF_MALLOC(sizeof(*lock), ec_mt_ec_lock_t); +    lock = mem_get0(ec->lock_pool);      if (lock != NULL)      { -        memset(lock, 0, sizeof(*lock)); -          lock->kind = kind; +        lock->good_mask = -1ULL; +        INIT_LIST_HEAD(&lock->waiting);          if (!ec_loc_from_loc(xl, &lock->loc, loc))          { -            GF_FREE(lock); +            mem_put(lock);              lock = NULL;          }      } @@ -634,34 +621,55 @@ void ec_lock_destroy(ec_lock_t * lock)      GF_FREE(lock->basename);      loc_wipe(&lock->loc); -    GF_FREE(lock); +    mem_put(lock);  } -int32_t ec_locked(call_frame_t * frame, void * cookie, xlator_t * this, -                  int32_t op_ret, int32_t op_errno, dict_t * xdata) +int32_t ec_lock_compare(ec_lock_t * lock1, ec_lock_t * lock2)  { -    ec_fop_data_t * fop = cookie; -    ec_lock_t * lock = NULL; +    int32_t res; -    if (op_ret >= 0) +    res = uuid_compare(lock1->loc.gfid, lock2->loc.gfid); +    if (res != 0)      { -        lock = fop->data; -        lock->mask = fop->good; -        fop->parent->mask &= fop->good; - -        ec_trace("LOCKED", fop->parent, "lock=%p", lock); +        return res;      } -    else +    if (lock1->basename == NULL)      { -        gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock"); +        if (lock2->basename == NULL) +        { +            return 0; +        } +        return 1; +    } +    if (lock2->basename == NULL) +    { +        return -1;      } +    return strcmp(lock1->basename, lock2->basename); +} -    return 0; +void ec_lock_insert(ec_fop_data_t * fop, ec_lock_t * lock) +{ +    ec_lock_t * tmp; + +    if ((fop->lock_count > 0) && +        (ec_lock_compare(fop->locks[0].lock, lock) > 0)) +    { +        tmp = fop->locks[0].lock; +        fop->locks[0].lock = lock; +        lock = tmp; +    } +    fop->locks[fop->lock_count].lock = lock; +    fop->locks[fop->lock_count].fop = fop; +    fop->lock_count++; + +    lock->refs++;  } -void ec_lock_entry(ec_fop_data_t * fop, loc_t * loc) +void ec_lock_prepare_entry(ec_fop_data_t * fop, loc_t * loc)  {      ec_lock_t * lock = NULL; +    ec_inode_t * ctx = NULL;      char * name = NULL;      loc_t tmp;      int32_t error; @@ -679,116 +687,106 @@ void ec_lock_entry(ec_fop_data_t * fop, loc_t * loc)          return;      } -    LOCK(&fop->lock); +    LOCK(&tmp.inode->lock); -    list_for_each_entry(lock, &fop->lock_list, list) +    ctx = __ec_inode_get(tmp.inode, fop->xl); +    if (ctx == NULL)      { -        if ((lock->kind == EC_LOCK_ENTRY) && -            (lock->loc.inode == tmp.inode) && -            (strcmp(lock->basename, name) == 0)) -        { -            ec_trace("LOCK_ENTRYLK", fop, "lock=%p, parent=%p, path=%s, " -                                          "name=%s. Lock already acquired", -                     lock, loc->parent, loc->path, name); - -            lock = NULL; +        __ec_fop_set_error(fop, EIO); -            goto unlock; -        } +        goto unlock;      } -    lock = ec_lock_allocate(fop->xl, EC_LOCK_ENTRY, &tmp); -    if (lock != NULL) +    list_for_each_entry(lock, &ctx->entry_locks, list)      { -        lock->type = ENTRYLK_WRLCK; -        lock->basename = name; - -        if (list_empty(&fop->lock_list)) +        if (strcmp(lock->basename, name) == 0)          { -            ec_owner_set(fop->frame, fop->frame->root); +            ec_trace("LOCK_ENTRYLK", fop, "lock=%p, inode=%p, path=%s, " +                                          "name=%s. Lock already acquired", +                     lock, tmp.inode, tmp.path, name); + +            goto insert;          } -        list_add_tail(&lock->list, &fop->lock_list);      } -    else + +    lock = ec_lock_allocate(fop->xl, EC_LOCK_ENTRY, &tmp); +    if (lock == NULL)      {          __ec_fop_set_error(fop, EIO); + +        goto unlock;      } -unlock: -    UNLOCK(&fop->lock); +    ec_trace("LOCK_CREATE", fop, "lock=%p", lock); -    loc_wipe(&tmp); +    lock->type = ENTRYLK_WRLCK; +    lock->basename = name; +    name = NULL; -    if (lock != NULL) -    { -        ec_trace("LOCK_ENTRYLK", fop, "lock=%p, parent=%p, path=%s, " -                                      "basename=%s", lock, lock->loc.inode, -                 lock->loc.path, lock->basename); +    list_add_tail(&lock->list, &ctx->entry_locks); -        ec_entrylk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, lock, -                   fop->xl->name, &lock->loc, lock->basename, ENTRYLK_LOCK, -                   lock->type, NULL); -    } -    else -    { -        GF_FREE(name); -    } +insert: +    ec_lock_insert(fop, lock); + +unlock: +    UNLOCK(&tmp.inode->lock); + +    loc_wipe(&tmp); +    GF_FREE(name);  } -void ec_lock_inode(ec_fop_data_t * fop, loc_t * loc) +void ec_lock_prepare_inode(ec_fop_data_t * fop, loc_t * loc)  {      ec_lock_t * lock; +    ec_inode_t * ctx;      if ((fop->parent != NULL) || (fop->error != 0) || (loc->inode == NULL))      {          return;      } -    LOCK(&fop->lock); +    LOCK(&loc->inode->lock); -    list_for_each_entry(lock, &fop->lock_list, list) +    ctx = __ec_inode_get(loc->inode, fop->xl); +    if (ctx == NULL)      { -        if ((lock->kind == EC_LOCK_INODE) && (lock->loc.inode == loc->inode)) -        { -            UNLOCK(&fop->lock); - -            ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already " -                                          "acquired", lock, loc->inode); +        __ec_fop_set_error(fop, EIO); -            return; -        } +        goto unlock;      } -    lock = ec_lock_allocate(fop->xl, EC_LOCK_INODE, loc); -    if (lock != NULL) +    if (!list_empty(&ctx->inode_locks))      { -        lock->flock.l_type = F_WRLCK; -        lock->flock.l_whence = SEEK_SET; +        lock = list_entry(ctx->inode_locks.next, ec_lock_t, list); +        ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p. Lock already " +                                      "acquired", lock, loc->inode); -        if (list_empty(&fop->lock_list)) -        { -            ec_owner_set(fop->frame, fop->frame->root); -        } -        list_add_tail(&lock->list, &fop->lock_list); +        goto insert;      } -    else + +    lock = ec_lock_allocate(fop->xl, EC_LOCK_INODE, loc); +    if (lock == NULL)      {          __ec_fop_set_error(fop, EIO); + +        goto unlock;      } -    UNLOCK(&fop->lock); +    ec_trace("LOCK_CREATE", fop, "lock=%p", lock); -    if (lock != NULL) -    { -        ec_trace("LOCK_INODELK", fop, "lock=%p, inode=%p, owner=%p", lock, -                 lock->loc.inode, fop->frame->root); +    lock->flock.l_type = F_WRLCK; +    lock->flock.l_whence = SEEK_SET; -        ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, lock, -                   fop->xl->name, &lock->loc, F_SETLKW, &lock->flock, NULL); -    } +    list_add_tail(&lock->list, &ctx->inode_locks); + +insert: +    ec_lock_insert(fop, lock); + +unlock: +    UNLOCK(&loc->inode->lock);  } -void ec_lock_fd(ec_fop_data_t * fop, fd_t * fd) +void ec_lock_prepare_fd(ec_fop_data_t * fop, fd_t * fd)  {      loc_t loc; @@ -799,7 +797,7 @@ void ec_lock_fd(ec_fop_data_t * fop, fd_t * fd)      if (ec_loc_from_fd(fop->xl, &loc, fd))      { -        ec_lock_inode(fop, &loc); +        ec_lock_prepare_inode(fop, &loc);          loc_wipe(&loc);      } @@ -809,6 +807,100 @@ void ec_lock_fd(ec_fop_data_t * fop, fd_t * fd)      }  } +int32_t ec_locked(call_frame_t * frame, void * cookie, xlator_t * this, +                  int32_t op_ret, int32_t op_errno, dict_t * xdata) +{ +    ec_fop_data_t * fop = cookie; +    ec_lock_t * lock = NULL; + +    if (op_ret >= 0) +    { +        lock = fop->data; +        lock->mask = fop->good; +        lock->acquired = 1; + +        fop->parent->mask &= fop->good; +        fop->parent->locked++; + +        ec_trace("LOCKED", fop->parent, "lock=%p", lock); + +        ec_lock(fop->parent); +    } +    else +    { +        gf_log(this->name, GF_LOG_WARNING, "Failed to complete preop lock"); +    } + +    return 0; +} + +void ec_lock(ec_fop_data_t * fop) +{ +    ec_lock_t * lock; + +    while (fop->locked < fop->lock_count) +    { +        lock = fop->locks[fop->locked].lock; + +        LOCK(&lock->loc.inode->lock); + +        if (lock->owner != NULL) +        { +            ec_trace("LOCK_WAIT", fop, "lock=%p", lock); + +            list_add_tail(&fop->locks[fop->locked].wait_list, &lock->waiting); + +            fop->jobs++; +            fop->refs++; + +            UNLOCK(&lock->loc.inode->lock); + +            break; +        } +        lock->owner = fop; + +        UNLOCK(&lock->loc.inode->lock); + +        if (!lock->acquired) +        { +            ec_owner_set(fop->frame, lock); + +            if (lock->kind == EC_LOCK_ENTRY) +            { +                ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p, path=%s, " +                         "name=%s", lock, lock->loc.inode, lock->loc.path, +                         lock->basename); + +                ec_entrylk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, +                           lock, fop->xl->name, &lock->loc, lock->basename, +                           ENTRYLK_LOCK, lock->type, NULL); +            } +            else +            { +                ec_trace("LOCK_ACQUIRE", fop, "lock=%p, inode=%p", lock, +                         lock->loc.inode); + +                ec_inodelk(fop->frame, fop->xl, -1, EC_MINIMUM_ALL, ec_locked, +                           lock, fop->xl->name, &lock->loc, F_SETLKW, +                           &lock->flock, NULL); +            } + +            break; +        } + +        ec_trace("LOCK_REUSE", fop, "lock=%p", lock); + +        if (lock->have_size) +        { +            fop->pre_size = fop->post_size = lock->size; +            fop->have_size = 1; +        } +        fop->mask &= lock->good_mask; + +        fop->locked++; +    } +} +  int32_t ec_unlocked(call_frame_t * frame, void * cookie, xlator_t * this,                      int32_t op_ret, int32_t op_errno, dict_t * xdata)  { @@ -829,50 +921,68 @@ int32_t ec_unlocked(call_frame_t * frame, void * cookie, xlator_t * this,  void ec_unlock(ec_fop_data_t * fop)  { -    ec_lock_t * lock, * item; - -    ec_trace("UNLOCK", fop, ""); +    ec_lock_t * lock; +    int32_t i, refs; -    list_for_each_entry_safe(lock, item, &fop->lock_list, list) +    for (i = 0; i < fop->lock_count; i++)      { -        list_del(&lock->list); +        lock = fop->locks[i].lock; -        if (lock->mask != 0) -        { -            switch (lock->kind) -            { -                case EC_LOCK_ENTRY: -                    ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, parent=%p, " -                                                    "path=%s, basename=%s", -                             lock, lock->loc.inode, lock->loc.path, -                             lock->basename); +        LOCK(&lock->loc.inode->lock); -                    ec_entrylk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, -                               ec_unlocked, lock, fop->xl->name, &lock->loc, -                               lock->basename, ENTRYLK_UNLOCK, lock->type, -                               NULL); +        ec_trace("UNLOCK", fop, "lock=%p", lock); -                    break; - -                case EC_LOCK_INODE: -                    lock->flock.l_type = F_UNLCK; -                    ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock, -                             lock->loc.inode); +        refs = --lock->refs; +        if (refs == 0) +        { +            list_del_init(&lock->list); +        } -                    ec_inodelk(fop->frame, fop->xl, lock->mask, EC_MINIMUM_ALL, -                               ec_unlocked, lock, fop->xl->name, &lock->loc, -                               F_SETLK, &lock->flock, NULL); +        UNLOCK(&lock->loc.inode->lock); -                    break; +        if (refs == 0) +        { +            if (lock->mask != 0) +            { +                ec_owner_set(fop->frame, lock); -                default: -                    gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock type"); +                switch (lock->kind) +                { +                    case EC_LOCK_ENTRY: +                        ec_trace("UNLOCK_ENTRYLK", fop, "lock=%p, inode=%p, " +                                                        "path=%s, basename=%s", +                                 lock, lock->loc.inode, lock->loc.path, +                                 lock->basename); + +                        ec_entrylk(fop->frame, fop->xl, lock->mask, +                                   EC_MINIMUM_ALL, ec_unlocked, lock, +                                   fop->xl->name, &lock->loc, lock->basename, +                                   ENTRYLK_UNLOCK, lock->type, NULL); + +                        break; + +                    case EC_LOCK_INODE: +                        lock->flock.l_type = F_UNLCK; +                        ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", +                                 lock, lock->loc.inode); + +                        ec_inodelk(fop->frame, fop->xl, lock->mask, +                                   EC_MINIMUM_ALL, ec_unlocked, lock, +                                   fop->xl->name, &lock->loc, F_SETLK, +                                   &lock->flock, NULL); + +                        break; + +                    default: +                        gf_log(fop->xl->name, GF_LOG_ERROR, "Invalid lock " +                                                            "type"); +                }              } -        } -        loc_wipe(&lock->loc); +            ec_trace("LOCK_DESTROY", fop, "lock=%p", lock); -        GF_FREE(lock); +            ec_lock_destroy(lock); +        }      }  } @@ -883,11 +993,36 @@ int32_t ec_get_size_version_set(call_frame_t * frame, void * cookie,                                  struct iatt * postparent)  {      ec_fop_data_t * fop = cookie; +    ec_inode_t * ctx; +    ec_lock_t * lock;      if (op_ret >= 0)      { -        fop->parent->mask &= fop->good; +        LOCK(&inode->lock); + +        ctx = __ec_inode_get(inode, this); +        if ((ctx != NULL) && !list_empty(&ctx->inode_locks)) +        { +            lock = list_entry(ctx->inode_locks.next, ec_lock_t, list); + +            lock->have_size = 1; +            lock->size = buf->ia_size; +            lock->version = fop->answer->version; +        } + +        UNLOCK(&inode->lock); + +        if (lock != NULL) +        { +            // Only update parent mask if the lookup has been made with +            // inode locked. +            fop->parent->mask &= fop->good; +        } +          fop->parent->pre_size = fop->parent->post_size = buf->ia_size; + +        fop->parent->have_size = 1; +      }      else      { @@ -907,11 +1042,18 @@ void ec_get_size_version(ec_fop_data_t * fop)      gid_t gid;      int32_t error = ENOMEM; -    if (fop->parent != NULL) +    if (fop->have_size) +    { +        return; +    } + +    if ((fop->parent != NULL) && fop->parent->have_size)      {          fop->pre_size = fop->parent->pre_size;          fop->post_size = fop->parent->post_size; +        fop->have_size = 1; +          return;      } @@ -998,10 +1140,10 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,      return 0;  } -void ec_update_size_version(ec_fop_data_t * fop) +void ec_update_size_version(ec_fop_data_t * fop, uint64_t version, +                            size_t size)  {      dict_t * dict; -    size_t size;      uid_t uid;      gid_t gid; @@ -1012,20 +1154,20 @@ void ec_update_size_version(ec_fop_data_t * fop)          return;      } +    ec_trace("UPDATE", fop, "version=%ld, size=%ld", version, size); +      dict = dict_new();      if (dict == NULL)      {          goto out;      } -    if (ec_dict_set_number(dict, EC_XATTR_VERSION, 1) != 0) +    if (ec_dict_set_number(dict, EC_XATTR_VERSION, version) != 0)      {          goto out;      } -    size = fop->post_size; -    if (fop->pre_size != size) +    if (size != 0)      { -        size -= fop->pre_size;          if (ec_dict_set_number(dict, EC_XATTR_SIZE, size) != 0)          {              goto out; @@ -1069,6 +1211,113 @@ out:      gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to update version and size");  } +void ec_flush_size_version(ec_fop_data_t * fop) +{ +    ec_lock_t * lock; +    uint64_t version; +    size_t delta; + +    GF_ASSERT(fop->lock_count == 1); + +    lock = fop->locks[0].lock; + +    GF_ASSERT(lock->kind == EC_LOCK_INODE); + +    LOCK(&lock->loc.inode->lock); + +    GF_ASSERT(lock->owner == fop); + +    version = lock->version_delta; +    delta = lock->size_delta; +    lock->version_delta = 0; +    lock->size_delta = 0; + +    UNLOCK(&lock->loc.inode->lock); + +    if (version > 0) +    { +        ec_update_size_version(fop, version, delta); +    } +} + +void ec_lock_reuse(ec_fop_data_t * fop, int32_t update) +{ +    ec_fop_data_t * wait_fop; +    ec_lock_t * lock; +    ec_lock_link_t * link; +    size_t delta = 0; +    uint64_t version = 0; +    int32_t refs = 0; +    int32_t i; + +    for (i = 0; i < fop->lock_count; i++) +    { +        wait_fop = NULL; + +        lock = fop->locks[i].lock; + +        LOCK(&lock->loc.inode->lock); + +        ec_trace("LOCK_DONE", fop, "lock=%p", lock); + +        GF_ASSERT(lock->owner == fop); +        lock->owner = NULL; + +        if (lock->kind == EC_LOCK_INODE) +        { +            if (update && (fop->error == 0)) +            { +                lock->version_delta++; +                lock->size_delta += fop->post_size - fop->pre_size; +            } +            version = lock->version_delta; +            delta = lock->size_delta; +            refs = lock->refs; +            if (refs == 1) +            { +                lock->version_delta = 0; +                lock->size_delta = 0; +            } + +            if (fop->have_size) +            { +                lock->size = fop->post_size; +                lock->have_size = 1; +            } +        } +        lock->good_mask &= fop->mask; + +        if (!list_empty(&lock->waiting)) +        { +            link = list_entry(lock->waiting.next, ec_lock_link_t, wait_list); +            list_del_init(&link->wait_list); + +            wait_fop = link->fop; + +            if (lock->kind == EC_LOCK_INODE) +            { +                wait_fop->pre_size = wait_fop->post_size = fop->post_size; +                wait_fop->have_size = fop->have_size; +            } +            wait_fop->mask &= fop->mask; +        } + +        UNLOCK(&lock->loc.inode->lock); + +        if (wait_fop != NULL) +        { +            ec_lock(wait_fop); + +            ec_resume(wait_fop, 0); +        } +    } + +    if ((refs == 1) && (version > 0)) +    { +        ec_update_size_version(fop, version, delta); +    } +} +  void __ec_manager(ec_fop_data_t * fop, int32_t error)  {      do diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 83f3ba9637e..4fc89fdde33 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -47,10 +47,10 @@  #define EC_STATE_DISPATCH                     4  #define EC_STATE_PREPARE_ANSWER               5  #define EC_STATE_REPORT                       6 -#define EC_STATE_UPDATE_SIZE_AND_VERSION      7 +#define EC_STATE_LOCK_REUSE                   7  #define EC_STATE_UNLOCK                       8 -#define EC_STATE_WRITE_START                100 +#define EC_STATE_DELAYED_START              100  #define EC_STATE_HEAL_ENTRY_LOOKUP          200  #define EC_STATE_HEAL_ENTRY_PREPARE         201 @@ -81,14 +81,15 @@ void ec_update_bad(ec_fop_data_t * fop, uintptr_t good);  void ec_fop_set_error(ec_fop_data_t * fop, int32_t error); -void ec_lock_inode(ec_fop_data_t * fop, loc_t * loc); -void ec_lock_entry(ec_fop_data_t * fop, loc_t * loc); -void ec_lock_fd(ec_fop_data_t * fop, fd_t * fd); - +void ec_lock_prepare_inode(ec_fop_data_t * fop, loc_t * loc); +void ec_lock_prepare_entry(ec_fop_data_t * fop, loc_t * loc); +void ec_lock_prepare_fd(ec_fop_data_t * fop, fd_t * fd); +void ec_lock(ec_fop_data_t * fop); +void ec_lock_reuse(ec_fop_data_t * fop, int32_t update);  void ec_unlock(ec_fop_data_t * fop);  void ec_get_size_version(ec_fop_data_t * fop); -void ec_update_size_version(ec_fop_data_t * fop); +void ec_flush_size_version(ec_fop_data_t * fop);  void ec_dispatch_all(ec_fop_data_t * fop);  void ec_dispatch_inc(ec_fop_data_t * fop); @@ -97,8 +98,8 @@ void ec_dispatch_one(ec_fop_data_t * fop);  void ec_wait_winds(ec_fop_data_t * fop); +void ec_resume(ec_fop_data_t * fop, int32_t error);  void ec_resume_parent(ec_fop_data_t * fop, int32_t error); -void ec_report(ec_fop_data_t * fop, int32_t error);  void ec_manager(ec_fop_data_t * fop, int32_t error); diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c index 0e72fbbd3b6..174586b4051 100644 --- a/xlators/cluster/ec/src/ec-data.c +++ b/xlators/cluster/ec/src/ec-data.c @@ -158,7 +158,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,      fop->minimum = minimum;      fop->mask = target; -    INIT_LIST_HEAD(&fop->lock_list);      INIT_LIST_HEAD(&fop->cbk_list);      INIT_LIST_HEAD(&fop->answer_list); diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index e83b6ad74eb..cf9ce241adb 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -37,6 +37,9 @@ typedef union _ec_cbk ec_cbk_t;  struct _ec_lock;  typedef struct _ec_lock ec_lock_t; +struct _ec_lock_link; +typedef struct _ec_lock_link ec_lock_link_t; +  struct _ec_fop_data;  typedef struct _ec_fop_data ec_fop_data_t; @@ -60,8 +63,10 @@ struct _ec_fd  struct _ec_inode  { -    uintptr_t   bad; -    ec_heal_t * heal; +    uintptr_t        bad; +    struct list_head entry_locks; +    struct list_head inode_locks; +    ec_heal_t *      heal;  };  typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, @@ -124,8 +129,18 @@ union _ec_cbk  struct _ec_lock  {      struct list_head     list; +    struct list_head     waiting;      uintptr_t            mask; +    uintptr_t            good_mask;      int32_t              kind; +    int32_t              refs; +    int32_t              acquired; +    int32_t              have_size; +    size_t               size; +    size_t               size_delta; +    uint64_t             version; +    uint64_t             version_delta; +    ec_fop_data_t *      owner;      loc_t                loc;      union      { @@ -138,6 +153,13 @@ struct _ec_lock      };  }; +struct _ec_lock_link +{ +    ec_lock_t *      lock; +    ec_fop_data_t *  fop; +    struct list_head wait_list; +}; +  struct _ec_fop_data  {      int32_t            id; @@ -152,10 +174,13 @@ struct _ec_fop_data      xlator_t *         xl;      call_frame_t *     req_frame;   // frame of the calling xlator      call_frame_t *     frame;       // frame used by this fop -    struct list_head   lock_list;   // list locks held by this fop      struct list_head   cbk_list;    // sorted list of groups of answers      struct list_head   answer_list; // list of answers      ec_cbk_data_t *    answer;      // accepted answer +    int32_t            lock_count; +    int32_t            locked; +    ec_lock_link_t     locks[2]; +    int32_t            have_size;      size_t             pre_size;      size_t             post_size;      gf_lock_t          lock; diff --git a/xlators/cluster/ec/src/ec-dir-write.c b/xlators/cluster/ec/src/ec-dir-write.c index dc1d94e37e8..a28b7ad1937 100644 --- a/xlators/cluster/ec/src/ec-dir-write.c +++ b/xlators/cluster/ec/src/ec-dir-write.c @@ -181,7 +181,8 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            ec_lock_entry(fop, &fop->loc[0]); +            ec_lock_prepare_entry(fop, &fop->loc[0]); +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -245,11 +246,7 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)                                   cbk->xdata);              } -            if (cbk->op_ret >= 0) -            { -                return EC_STATE_UPDATE_SIZE_AND_VERSION; -            } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -263,14 +260,14 @@ int32_t ec_manager_create(ec_fop_data_t * fop, int32_t state)                                   NULL, NULL, NULL, NULL, NULL, NULL);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE; -        case EC_STATE_UPDATE_SIZE_AND_VERSION: -            ec_update_size_version(fop); +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 1);              return EC_STATE_UNLOCK; -        case -EC_STATE_UPDATE_SIZE_AND_VERSION:          case -EC_STATE_UNLOCK:          case EC_STATE_UNLOCK:              ec_unlock(fop); @@ -468,7 +465,8 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)              // Parent entry of fop->loc[0] should be locked, but I don't              // receive enough information to do it (fop->loc[0].parent is              // NULL). -            ec_lock_entry(fop, &fop->loc[1]); +            ec_lock_prepare_entry(fop, &fop->loc[1]); +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -531,7 +529,7 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)                                 &cbk->iatt[1], &cbk->iatt[2], cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -546,6 +544,12 @@ int32_t ec_manager_link(ec_fop_data_t * fop, int32_t state)                                 NULL, NULL, NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -732,7 +736,8 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_entry(fop, &fop->loc[0]); +            ec_lock_prepare_entry(fop, &fop->loc[0]); +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -785,7 +790,7 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state)                                  &cbk->iatt[1], &cbk->iatt[2], cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -799,6 +804,12 @@ int32_t ec_manager_mkdir(ec_fop_data_t * fop, int32_t state)                                  NULL, NULL, NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -982,7 +993,8 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_entry(fop, &fop->loc[0]); +            ec_lock_prepare_entry(fop, &fop->loc[0]); +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -1035,7 +1047,7 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)                                  &cbk->iatt[1], &cbk->iatt[2], cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -1049,6 +1061,12 @@ int32_t ec_manager_mknod(ec_fop_data_t * fop, int32_t state)                                  NULL, NULL, NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -1230,8 +1248,9 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_entry(fop, &fop->loc[0]); -            ec_lock_entry(fop, &fop->loc[1]); +            ec_lock_prepare_entry(fop, &fop->loc[0]); +            ec_lock_prepare_entry(fop, &fop->loc[1]); +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -1292,7 +1311,7 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)                                   cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -1307,6 +1326,12 @@ int32_t ec_manager_rename(ec_fop_data_t * fop, int32_t state)                                   NULL, NULL, NULL, NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -1479,7 +1504,8 @@ int32_t ec_manager_rmdir(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_entry(fop, &fop->loc[0]); +            ec_lock_prepare_entry(fop, &fop->loc[0]); +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -1524,7 +1550,7 @@ int32_t ec_manager_rmdir(ec_fop_data_t * fop, int32_t state)                                  cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -1538,6 +1564,12 @@ int32_t ec_manager_rmdir(ec_fop_data_t * fop, int32_t state)                                  NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -1719,7 +1751,8 @@ int32_t ec_manager_symlink(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_entry(fop, &fop->loc[0]); +            ec_lock_prepare_entry(fop, &fop->loc[0]); +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -1772,7 +1805,7 @@ int32_t ec_manager_symlink(ec_fop_data_t * fop, int32_t state)                                    &cbk->iatt[1], &cbk->iatt[2], cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -1786,6 +1819,12 @@ int32_t ec_manager_symlink(ec_fop_data_t * fop, int32_t state)                                    NULL, NULL, NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -1963,7 +2002,8 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_entry(fop, &fop->loc[0]); +            ec_lock_prepare_entry(fop, &fop->loc[0]); +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -2013,7 +2053,7 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)                                   cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -2028,6 +2068,12 @@ int32_t ec_manager_unlink(ec_fop_data_t * fop, int32_t state)                                   NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c index 4afec3524c5..4aa02903969 100644 --- a/xlators/cluster/ec/src/ec-generic.c +++ b/xlators/cluster/ec/src/ec-generic.c @@ -91,11 +91,17 @@ int32_t ec_manager_flush(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_fd(fop, fop->fd); +            ec_lock_prepare_fd(fop, fop->fd); +            ec_lock(fop);              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: +            ec_flush_size_version(fop); + +            return EC_STATE_DELAYED_START; + +        case EC_STATE_DELAYED_START:              ec_dispatch_all(fop);              return EC_STATE_PREPARE_ANSWER; @@ -135,7 +141,7 @@ int32_t ec_manager_flush(ec_fop_data_t * fop, int32_t state)                                  cbk->op_errno, cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -149,6 +155,12 @@ int32_t ec_manager_flush(ec_fop_data_t * fop, int32_t state)                                  NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -313,7 +325,8 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_fd(fop, fop->fd); +            ec_lock_prepare_fd(fop, fop->fd); +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -323,6 +336,11 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: +            ec_flush_size_version(fop); + +            return EC_STATE_DELAYED_START; + +        case EC_STATE_DELAYED_START:              ec_dispatch_all(fop);              return EC_STATE_PREPARE_ANSWER; @@ -371,7 +389,7 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)                                  cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -386,6 +404,12 @@ int32_t ec_manager_fsync(ec_fop_data_t * fop, int32_t state)                                  NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -526,11 +550,17 @@ int32_t ec_manager_fsyncdir(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_fd(fop, fop->fd); +            ec_lock_prepare_fd(fop, fop->fd); +            ec_lock(fop);              return EC_STATE_DISPATCH;          case EC_STATE_DISPATCH: +            ec_flush_size_version(fop); + +            return EC_STATE_DELAYED_START; + +        case EC_STATE_DELAYED_START:              ec_dispatch_all(fop);              return EC_STATE_PREPARE_ANSWER; @@ -570,7 +600,7 @@ int32_t ec_manager_fsyncdir(ec_fop_data_t * fop, int32_t state)                                     cbk->op_errno, cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -584,6 +614,12 @@ int32_t ec_manager_fsyncdir(ec_fop_data_t * fop, int32_t state)                                     fop->error, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -665,10 +701,12 @@ out:  void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)  {      ec_cbk_data_t * ans = NULL; +    ec_inode_t * ctx = NULL; +    ec_lock_t * lock = NULL;      data_t * data = NULL;      uint8_t * buff = NULL;      size_t size = 0; -    int32_t i = 0; +    int32_t i = 0, have_size = 0;      if (cbk->op_ret < 0)      { @@ -679,6 +717,22 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)      ec_loc_prepare(fop->xl, &fop->loc[0], cbk->inode, &cbk->iatt[0]); +    LOCK(&cbk->inode->lock); + +    ctx = __ec_inode_get(cbk->inode, fop->xl); +    if ((ctx != NULL) && !list_empty(&ctx->inode_locks)) +    { +        lock = list_entry(ctx->inode_locks.next, ec_lock_t, list); +        cbk->version = lock->version; +        if (lock->have_size) +        { +            size = lock->size; +            have_size = 1; +        } +    } + +    UNLOCK(&cbk->inode->lock); +      if (cbk->iatt[0].ia_type == IA_IFREG)      {          uint8_t * blocks[cbk->count]; @@ -686,6 +740,10 @@ void ec_lookup_rebuild(ec_t * ec, ec_fop_data_t * fop, ec_cbk_data_t * cbk)          cbk->size = cbk->iatt[0].ia_size;          ec_dict_del_number(cbk->xdata, EC_XATTR_SIZE, &cbk->iatt[0].ia_size); +        if (have_size) +        { +            cbk->iatt[0].ia_size = size; +        }          size = SIZE_MAX;          for (i = 0, ans = cbk; (ans != NULL) && (i < ec->fragments); @@ -1314,7 +1372,15 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_inode(fop, &fop->loc[0]); +            if (fop->fd == NULL) +            { +                ec_lock_prepare_inode(fop, &fop->loc[0]); +            } +            else +            { +                ec_lock_prepare_fd(fop, fop->fd); +            } +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -1373,11 +1439,7 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state)                  }              } -            if (cbk->op_ret >= 0) -            { -                return EC_STATE_UPDATE_SIZE_AND_VERSION; -            } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -1402,14 +1464,14 @@ int32_t ec_manager_xattrop(ec_fop_data_t * fop, int32_t state)                  }              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE; -        case EC_STATE_UPDATE_SIZE_AND_VERSION: -            ec_update_size_version(fop); +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 1);              return EC_STATE_UNLOCK; -        case -EC_STATE_UPDATE_SIZE_AND_VERSION:          case -EC_STATE_UNLOCK:          case EC_STATE_UNLOCK:              ec_unlock(fop); diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 6eb836a1c62..22889337cbc 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -503,6 +503,9 @@ ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl)                  return NULL;              } + +            INIT_LIST_HEAD(&ctx->entry_locks); +            INIT_LIST_HEAD(&ctx->inode_locks);          }      }      else diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c index a31220ecbc1..0cb5559f62b 100644 --- a/xlators/cluster/ec/src/ec-inode-read.c +++ b/xlators/cluster/ec/src/ec-inode-read.c @@ -252,7 +252,15 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_inode(fop, &fop->loc[0]); +            if (fop->fd == NULL) +            { +                ec_lock_prepare_inode(fop, &fop->loc[0]); +            } +            else +            { +                ec_lock_prepare_fd(fop, fop->fd); +            } +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -311,7 +319,7 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)                                     cbk->op_errno, cbk->dict, cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -325,6 +333,12 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)                                     fop->error, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -1216,7 +1230,8 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            ec_lock_fd(fop, fop->fd); +            ec_lock_prepare_fd(fop, fop->fd); +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -1276,7 +1291,7 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)                                  &cbk->iatt[0], cbk->buffers, cbk->xdata);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -1291,6 +1306,12 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)                                  NULL, 0, NULL, NULL, NULL);              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: @@ -1455,7 +1476,15 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_inode(fop, &fop->loc[0]); +            if (fop->fd == NULL) +            { +                ec_lock_prepare_inode(fop, &fop->loc[0]); +            } +            else +            { +                ec_lock_prepare_fd(fop, fop->fd); +            } +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -1522,7 +1551,7 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)                  }              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -1548,6 +1577,12 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)                  }              } +            return EC_STATE_LOCK_REUSE; + +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 0); +              return EC_STATE_UNLOCK;          case -EC_STATE_UNLOCK: diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index 06a2fef8d17..edc7409854d 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -92,7 +92,15 @@ int32_t ec_manager_removexattr(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_inode(fop, &fop->loc[0]); +            if (fop->fd == NULL) +            { +                ec_lock_prepare_inode(fop, &fop->loc[0]); +            } +            else +            { +                ec_lock_prepare_fd(fop, fop->fd); +            } +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -149,11 +157,7 @@ int32_t ec_manager_removexattr(ec_fop_data_t * fop, int32_t state)                  }              } -            if (cbk->op_ret >= 0) -            { -                return EC_STATE_UPDATE_SIZE_AND_VERSION; -            } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -178,14 +182,14 @@ int32_t ec_manager_removexattr(ec_fop_data_t * fop, int32_t state)                  }              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE; -        case EC_STATE_UPDATE_SIZE_AND_VERSION: -            ec_update_size_version(fop); +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 1);              return EC_STATE_UNLOCK; -        case -EC_STATE_UPDATE_SIZE_AND_VERSION:          case -EC_STATE_UNLOCK:          case EC_STATE_UNLOCK:              ec_unlock(fop); @@ -484,7 +488,15 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_inode(fop, &fop->loc[0]); +            if (fop->fd == NULL) +            { +                ec_lock_prepare_inode(fop, &fop->loc[0]); +            } +            else +            { +                ec_lock_prepare_fd(fop, fop->fd); +            } +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -556,11 +568,7 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)                  }              } -            if (cbk->op_ret >= 0) -            { -                return EC_STATE_UPDATE_SIZE_AND_VERSION; -            } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -586,14 +594,14 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)                  }              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE; -        case EC_STATE_UPDATE_SIZE_AND_VERSION: -            ec_update_size_version(fop); +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 1);              return EC_STATE_UNLOCK; -        case -EC_STATE_UPDATE_SIZE_AND_VERSION:          case -EC_STATE_UNLOCK:          case EC_STATE_UNLOCK:              ec_unlock(fop); @@ -870,7 +878,15 @@ int32_t ec_manager_setxattr(ec_fop_data_t * fop, int32_t state)      {          case EC_STATE_INIT:          case EC_STATE_LOCK: -            ec_lock_inode(fop, &fop->loc[0]); +            if (fop->fd == NULL) +            { +                ec_lock_prepare_inode(fop, &fop->loc[0]); +            } +            else +            { +                ec_lock_prepare_fd(fop, fop->fd); +            } +            ec_lock(fop);              return EC_STATE_DISPATCH; @@ -926,11 +942,7 @@ int32_t ec_manager_setxattr(ec_fop_data_t * fop, int32_t state)                  }              } -            if (cbk->op_ret >= 0) -            { -                return EC_STATE_UPDATE_SIZE_AND_VERSION; -            } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_DISPATCH: @@ -955,14 +967,14 @@ int32_t ec_manager_setxattr(ec_fop_data_t * fop, int32_t state)                  }              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE; -        case EC_STATE_UPDATE_SIZE_AND_VERSION: -            ec_update_size_version(fop); +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 1);              return EC_STATE_UNLOCK; -        case -EC_STATE_UPDATE_SIZE_AND_VERSION:          case -EC_STATE_UNLOCK:          case EC_STATE_UNLOCK:              ec_unlock(fop); @@ -1366,7 +1378,15 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            ec_lock_inode(fop, &fop->loc[0]); +            if (fop->fd == NULL) +            { +                ec_lock_prepare_inode(fop, &fop->loc[0]); +            } +            else +            { +                ec_lock_prepare_fd(fop, fop->fd); +            } +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -1447,11 +1467,7 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)                  }              } -            if (cbk->op_ret >= 0) -            { -                return EC_STATE_UPDATE_SIZE_AND_VERSION; -            } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -1477,14 +1493,14 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)                  }              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE; -        case EC_STATE_UPDATE_SIZE_AND_VERSION: -            ec_update_size_version(fop); +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 1);              return EC_STATE_UNLOCK; -        case -EC_STATE_UPDATE_SIZE_AND_VERSION:          case -EC_STATE_UNLOCK:          case EC_STATE_UNLOCK:              ec_unlock(fop); @@ -2003,7 +2019,8 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)          /* Fall through */          case EC_STATE_LOCK: -            ec_lock_fd(fop, fop->fd); +            ec_lock_prepare_fd(fop, fop->fd); +            ec_lock(fop);              return EC_STATE_GET_SIZE_AND_VERSION; @@ -2015,9 +2032,9 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)          case EC_STATE_DISPATCH:              ec_writev_start(fop); -            return EC_STATE_WRITE_START; +            return EC_STATE_DELAYED_START; -        case EC_STATE_WRITE_START: +        case EC_STATE_DELAYED_START:              ec_dispatch_all(fop);              return EC_STATE_PREPARE_ANSWER; @@ -2089,11 +2106,7 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)                                   cbk->xdata);              } -            if (cbk->op_ret >= 0) -            { -                return EC_STATE_UPDATE_SIZE_AND_VERSION; -            } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE;          case -EC_STATE_LOCK:          case -EC_STATE_GET_SIZE_AND_VERSION: @@ -2108,14 +2121,14 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)                                   NULL, NULL, NULL);              } -            return EC_STATE_UNLOCK; +            return EC_STATE_LOCK_REUSE; -        case EC_STATE_UPDATE_SIZE_AND_VERSION: -            ec_update_size_version(fop); +        case -EC_STATE_LOCK_REUSE: +        case EC_STATE_LOCK_REUSE: +            ec_lock_reuse(fop, 1);              return EC_STATE_UNLOCK; -        case -EC_STATE_UPDATE_SIZE_AND_VERSION:          case -EC_STATE_UNLOCK:          case EC_STATE_UNLOCK:              ec_unlock(fop); diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h index f312d1e333f..a7b0c838c7e 100644 --- a/xlators/cluster/ec/src/ec-mem-types.h +++ b/xlators/cluster/ec/src/ec-mem-types.h @@ -27,11 +27,8 @@ enum gf_ec_mem_types_  {      ec_mt_ec_t = gf_common_mt_end + 1,      ec_mt_xlator_t, -    ec_mt_ec_fop_data_t, -    ec_mt_ec_cbk_data_t,      ec_mt_ec_inode_t,      ec_mt_ec_fd_t, -    ec_mt_ec_lock_t,      ec_mt_ec_heal_t,      ec_mt_end  }; diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 8554f20df0d..93bee1a4d32 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -151,6 +151,11 @@ void __ec_destroy_private(xlator_t * this)              mem_pool_destroy(ec->cbk_pool);          } +        if (ec->lock_pool != NULL) +        { +            mem_pool_destroy(ec->lock_pool); +        } +          LOCK_DESTROY(&ec->lock);          GF_FREE(ec); @@ -350,7 +355,9 @@ int32_t init(xlator_t * this)      ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024);      ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096); -    if ((ec->fop_pool == NULL) || (ec->cbk_pool == NULL)) +    ec->lock_pool = mem_pool_new(ec_lock_t, 1024); +    if ((ec->fop_pool == NULL) || (ec->cbk_pool == NULL) || +        (ec->lock_pool == NULL))      {          gf_log(this->name, GF_LOG_ERROR, "Failed to create memory pools."); diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index 85c430e2045..0cc8fdb4403 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -49,6 +49,7 @@ struct _ec      gf_timer_t *      timer;      struct mem_pool * fop_pool;      struct mem_pool * cbk_pool; +    struct mem_pool * lock_pool;  };  #endif /* __EC_H__ */  | 
