diff options
| author | Xavier Hernandez <xhernandez@datalab.es> | 2015-05-14 20:07:10 +0200 | 
|---|---|---|
| committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-05-20 01:00:01 -0700 | 
| commit | 61cfcf65f0d4ad70fc8a47395c583d4b5bf1efbe (patch) | |
| tree | 9d3134df974f4ab0160008dbd877e317d471d2c7 | |
| parent | 8f788528e64c4c13e16f7ad2d9f667a3813e08cc (diff) | |
cluster/ec: Correctly cleanup delayed locks
When a delayed lock is pending, a graph switch doesn't correctly
terminate it. This means that the update of version and size xattrs
is lost, causing EIO errors.
This patch handles GF_EVENT_PARENT_DOWN event to correctly finish
pending udpdates before completing the graph switch.
Change-Id: I394f3b8d41df8d83cdd36636aeb62330f30a66d5
BUG: 1188145
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/10787
Tested-by: NetBSD Build System
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
| -rwxr-xr-x | run-tests.sh | 7 | ||||
| -rw-r--r-- | tests/bugs/disperse/bug-1188145.t | 50 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 74 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.c | 13 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 31 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec-heal.c | 7 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.c | 97 | ||||
| -rw-r--r-- | xlators/cluster/ec/src/ec.h | 5 | 
9 files changed, 213 insertions, 72 deletions
diff --git a/run-tests.sh b/run-tests.sh index 2a24f940d5c..d8245c81402 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -205,14 +205,7 @@ function is_bad_test ()                ./tests/basic/ec/quota.t  \                ./tests/bugs/distribute/bug-1161156.t     \                ./tests/basic/tier/tier.t \ -              ./tests/basic/ec/ec-4-1.t \ -              ./tests/basic/ec/ec.t \ -              ./tests/basic/ec/self-heal.t \                ./tests/basic/quota-nfs.t \ -              ./tests/basic/ec/ec-6-2.t \ -              ./tests/basic/ec/ec-3-1.t \ -              ./tests/basic/ec/ec-5-1.t \ -              ./tests/basic/ec/ec-12-4.t \                ./tests/bugs/quota/bug-1035576.t  \                ./tests/basic/uss.t       \                ./tests/bugs/glusterfs/bug-867253.t       \ diff --git a/tests/bugs/disperse/bug-1188145.t b/tests/bugs/disperse/bug-1188145.t new file mode 100644 index 00000000000..aa3a59bc62f --- /dev/null +++ b/tests/bugs/disperse/bug-1188145.t @@ -0,0 +1,50 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +function create_dirs() +{ +    local stop=$1 +    local idx +    local res + +    res=0 +    idx=1 +    while [[ -f ${stop} ]]; do +        mkdir $M0/${idx} +        if [[ $? -ne 0 ]]; then +            res=1 +            break; +        fi +        idx=$(( idx + 1 )) +    done + +    return ${res} +} + +cleanup + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +EXPECT "Created" volinfo_field $V0 'Status' +TEST $CLI volume start $V0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Started" volinfo_field $V0 'Status' +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0 +EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 + +name=`mktemp -t ${0##*/}.XXXXXX` +create_dirs ${name} & +pid=$! + +sleep 2 +TEST $CLI volume set $V0 uss on +sleep 5 +TEST $CLI volume set $V0 uss off +sleep 5 + +TEST rm -f ${name} +TEST wait ${pid} + +cleanup diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index afd46c095f3..9f312e0c37c 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -1531,31 +1531,46 @@ void ec_unlock_now(ec_fop_data_t *fop, ec_lock_t *lock)      ec_resume(fop, 0);  } -void ec_unlock_timer_cbk(void *data) +void +ec_unlock_timer_del(ec_fop_data_t *fop, ec_lock_t *lock)  { -    ec_lock_link_t *link = data; -    ec_lock_t *lock = link->lock; -    ec_fop_data_t *fop = NULL; +        inode_t *inode; +        gf_boolean_t now = _gf_false; + +        /* A race condition can happen if timer expires, calls this function +         * and the lock is released (lock->loc is wiped) but the fop is not +         * fully completed yet (it's still on the list of pending fops). In +         * this case, this function can also be called if ec_unlock_force() is +         * called. */ +        inode = lock->loc.inode; +        if (inode == NULL) { +                return; +        } -    LOCK(&lock->loc.inode->lock); +        LOCK(&inode->lock); -    if (lock->timer != NULL) { -        fop = link->fop; +        if (lock->timer != NULL) { +                ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock); -        ec_trace("UNLOCK_DELAYED", fop, "lock=%p", lock); +                gf_timer_call_cancel(fop->xl->ctx, lock->timer); +                lock->timer = NULL; +                *lock->plock = NULL; -        GF_ASSERT(lock->refs == 1); +                now = _gf_true; +        } -        gf_timer_call_cancel(fop->xl->ctx, lock->timer); -        lock->timer = NULL; -        *lock->plock = NULL; -    } +        UNLOCK(&inode->lock); -    UNLOCK(&lock->loc.inode->lock); +        if (now) { +                ec_unlock_now(fop, lock); +        } +} -    if (fop != NULL) { -        ec_unlock_now(fop, lock); -    } +void ec_unlock_timer_cbk(void *data) +{ +        ec_lock_link_t *link = data; + +        ec_unlock_timer_del(link->fop, link->lock);  }  void ec_unlock_timer_add(ec_lock_link_t *link) @@ -1626,6 +1641,18 @@ void ec_unlock(ec_fop_data_t *fop)      }  } +void +ec_unlock_force(ec_fop_data_t *fop) +{ +        int32_t i; + +        for (i = 0; i < fop->lock_count; i++) { +                ec_trace("UNLOCK_FORCED", fop, "lock=%p", &fop->locks[i]); + +                ec_unlock_timer_del(fop, fop->locks[i].lock); +        } +} +  void ec_flush_size_version(ec_fop_data_t * fop)  {      ec_lock_t * lock; @@ -1740,8 +1767,21 @@ void __ec_manager(ec_fop_data_t * fop, int32_t error)          }          if ((fop->state == EC_STATE_END) || (fop->state == -EC_STATE_END)) { +            gf_boolean_t notify; + +            LOCK(&ec->lock); + +            list_del_init(&fop->pending_list); +            notify = list_empty(&ec->pending_fops); + +            UNLOCK(&ec->lock); +              ec_fop_data_release(fop); +            if (notify) { +                ec_pending_fops_completed(ec); +            } +              break;          } diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 9e0aaa0f079..08993f03c8f 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -87,6 +87,7 @@ void ec_lock_prepare_fd(ec_fop_data_t *fop, fd_t *fd, int32_t update);  void ec_lock(ec_fop_data_t * fop);  void ec_lock_reuse(ec_fop_data_t *fop);  void ec_unlock(ec_fop_data_t * fop); +void ec_unlock_force(ec_fop_data_t *fop);  void ec_get_size_version(ec_fop_data_t * fop);  void ec_prepare_update(ec_fop_data_t *fop); diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c index fb47aea90a8..b747fc42348 100644 --- a/xlators/cluster/ec/src/ec-data.c +++ b/xlators/cluster/ec/src/ec-data.c @@ -165,17 +165,18 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,          parent = frame->local;          if (parent != NULL)          { -            LOCK(&parent->lock); - -            parent->jobs++; -            parent->refs++; - -            UNLOCK(&parent->lock); +            ec_sleep(parent);          }          fop->parent = parent;      } +    LOCK(&ec->lock); + +    list_add_tail(&fop->pending_list, &ec->pending_fops); + +    UNLOCK(&ec->lock); +      return fop;  } diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 9e5c92dd5b8..8a58ffb288b 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -172,13 +172,14 @@ struct _ec_fop_data      int32_t            winds;      int32_t            jobs;      int32_t            error; -    ec_fop_data_t *    parent; -    xlator_t *         xl; -    call_frame_t *     req_frame;   // frame of the calling xlator -    call_frame_t *     frame;       // frame used by this fop -    struct list_head   cbk_list;    // sorted list of groups of answers -    struct list_head   answer_list; // list of answers -    ec_cbk_data_t *    answer;      // accepted answer +    ec_fop_data_t     *parent; +    xlator_t          *xl; +    call_frame_t      *req_frame;    /* frame of the calling xlator */ +    call_frame_t      *frame;        /* frame used by this fop */ +    struct list_head   cbk_list;     /* sorted list of groups of answers */ +    struct list_head   answer_list;  /* list of answers */ +    struct list_head   pending_list; /* member of ec_t.pending_fops */ +    ec_cbk_data_t     *answer;       /* accepted answer */      int32_t            lock_count;      int32_t            locked;      ec_lock_link_t     locks[2]; @@ -203,7 +204,7 @@ struct _ec_fop_data      ec_handler_f       handler;      ec_resume_f        resume;      ec_cbk_t           cbks; -    void *             data; +    void              *data;      ec_heal_t         *heal;      uint64_t           user_size; @@ -211,8 +212,8 @@ struct _ec_fop_data      int32_t            use_fd; -    dict_t *           xdata; -    dict_t *           dict; +    dict_t            *xdata; +    dict_t            *dict;      int32_t            int32;      uint32_t           uint32;      uint64_t           size; @@ -222,14 +223,14 @@ struct _ec_fop_data      entrylk_type       entrylk_type;      gf_xattrop_flags_t xattrop_flags;      dev_t              dev; -    inode_t *          inode; -    fd_t *             fd; +    inode_t           *inode; +    fd_t              *fd;      struct iatt        iatt; -    char *             str[2]; +    char              *str[2];      loc_t              loc[2];      struct gf_flock    flock; -    struct iovec *     vector; -    struct iobref *    buffers; +    struct iovec      *vector; +    struct iobref     *buffers;  };  struct _ec_cbk_data diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c index ceddfeb6ac7..4fe5754cbb8 100644 --- a/xlators/cluster/ec/src/ec-heal.c +++ b/xlators/cluster/ec/src/ec-heal.c @@ -532,12 +532,7 @@ ec_heal_init (ec_fop_data_t * fop)          gf_log("ec", GF_LOG_INFO, "Healing '%s', gfid %s", heal->loc.path,                 uuid_utoa(heal->loc.gfid));      } else { -        LOCK(&fop->lock); - -        fop->jobs++; -        fop->refs++; - -        UNLOCK(&fop->lock); +        ec_sleep(fop);      }      list_add_tail(&heal->list, &ctx->heal); diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 3dd04299541..4028aa4d2bb 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -278,6 +278,7 @@ ec_notify_cbk (void *data)  {          ec_t *ec = data;          glusterfs_event_t event = GF_EVENT_MAXVAL; +        gf_boolean_t propagate = _gf_false;          LOCK(&ec->lock);          { @@ -309,10 +310,14 @@ ec_notify_cbk (void *data)                  /* CHILD_DOWN should not come here as no grace period is given                   * for notifying CHILD_DOWN. */ -                default_notify (ec->xl, event, NULL); +                propagate = _gf_true;          }  unlock:          UNLOCK(&ec->lock); + +        if (propagate) { +                default_notify (ec->xl, event, NULL); +        }  }  void @@ -360,6 +365,49 @@ ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx)          }  } +gf_boolean_t +ec_force_unlocks(ec_t *ec) +{ +        struct list_head list; +        ec_fop_data_t *fop; + +        if (list_empty(&ec->pending_fops)) { +                return _gf_true; +        } + +        INIT_LIST_HEAD(&list); + +        /* All pending fops when GF_EVENT_PARENT_DOWN is received should only +         * be fops waiting for a delayed unlock. However the unlock can +         * generate new fops. We don't want to trverse these new fops while +         * forcing unlocks, so we move all fops to a temporal list. To process +         * them without interferences.*/ +        list_splice_init(&ec->pending_fops, &list); + +        while (!list_empty(&list)) { +                fop = list_entry(list.next, ec_fop_data_t, pending_list); +                list_move_tail(&fop->pending_list, &ec->pending_fops); + +                UNLOCK(&ec->lock); + +                ec_unlock_force(fop); + +                LOCK(&ec->lock); +        } + +        ec->shutdown = _gf_true; + +        return list_empty(&ec->pending_fops); +} + +void +ec_pending_fops_completed(ec_t *ec) +{ +        if (ec->shutdown) { +                default_notify(ec->xl, GF_EVENT_PARENT_DOWN, NULL); +        } +} +  int32_t  ec_notify (xlator_t *this, int32_t event, void *data, void *data2)  { @@ -367,14 +415,16 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)          int32_t           idx       = 0;          int32_t           error     = 0;          glusterfs_event_t old_event = GF_EVENT_MAXVAL; -        glusterfs_event_t new_event = GF_EVENT_MAXVAL;          dict_t            *input    = NULL;          dict_t            *output   = NULL; +        gf_boolean_t      propagate = _gf_true; + +        gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %p", +                event, data, data2);          if (event == GF_EVENT_TRANSLATOR_OP) {                  if (!ec->up) {                          error = -1; -                        goto out;                  } else {                          input = data;                          output = data2; @@ -400,13 +450,14 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)                   */                  ec_launch_notify_timer (this, ec);                  goto unlock; +        } else if (event == GF_EVENT_PARENT_DOWN) { +                /* If there aren't pending fops running after we have waken up +                 * them, we immediately propagate the notification. */ +                propagate = ec_force_unlocks(ec); +                goto unlock;          } -        gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d", -                event, data, idx); -          if (idx < ec->nodes) { /* CHILD_* events */ -                  old_event = ec_get_event_from_state (ec);                  if (event == GF_EVENT_CHILD_UP) { @@ -415,28 +466,30 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2)                          ec_handle_down (this, ec, idx);                  } -                new_event = ec_get_event_from_state (ec); +                event = ec_get_event_from_state (ec); -                if (new_event == GF_EVENT_CHILD_UP && !ec->up) { +                if (event == GF_EVENT_CHILD_UP && !ec->up) {                          ec_up (this, ec); -                } else if (new_event == GF_EVENT_CHILD_DOWN && ec->up) { +                } else if (event == GF_EVENT_CHILD_DOWN && ec->up) {                          ec_down (this, ec);                  } -                if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL)) -                        new_event = GF_EVENT_CHILD_MODIFIED; - -                event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */ -                if (new_event != GF_EVENT_MAXVAL) -                        error = default_notify (this, new_event, data); +                if (event != GF_EVENT_MAXVAL) { +                        if (event == old_event) { +                                event = GF_EVENT_CHILD_MODIFIED; +                        } +                } else { +                        propagate = _gf_false; +                }          } -    unlock: -            UNLOCK (&ec->lock); +unlock: +        UNLOCK (&ec->lock); -            if (event != GF_EVENT_MAXVAL) -                    return default_notify (this, event, data); +        if (propagate) { +                error = default_notify (this, event, data); +        }  out: -            return error; +        return error;  }  int32_t @@ -478,6 +531,8 @@ init (xlator_t *this)      ec->xl = this;      LOCK_INIT(&ec->lock); +    INIT_LIST_HEAD(&ec->pending_fops); +      ec->fop_pool = mem_pool_new(ec_fop_data_t, 1024);      ec->cbk_pool = mem_pool_new(ec_cbk_data_t, 4096);      ec->lock_pool = mem_pool_new(ec_lock_t, 1024); diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index b8f3e288197..fdedb89ec18 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -44,6 +44,8 @@ struct _ec      xlator_t **       xl_list;      gf_lock_t         lock;      gf_timer_t *      timer; +    gf_boolean_t      shutdown; +    struct list_head  pending_fops;      struct mem_pool * fop_pool;      struct mem_pool * cbk_pool;      struct mem_pool * lock_pool; @@ -51,4 +53,7 @@ struct _ec      char              vol_uuid[UUID_SIZE + 1];      dict_t           *leaf_to_subvolid;  }; + +void ec_pending_fops_completed(ec_t *ec); +  #endif /* __EC_H__ */  | 
