diff options
author | Pranith Kumar K <pkarampu@redhat.com> | 2015-01-08 15:39:40 +0530 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-01-28 19:49:53 -0800 |
commit | a48b18d6f661f863371e625084a88a01aaf989f0 (patch) | |
tree | a45a7bad629f4c2b37670db79ab03e80ab1b0c30 /xlators/cluster | |
parent | b17122ffc75c65bda2cf3b3d99832bbf2718e8d3 (diff) |
cluster/ec: Handle CHILD UP/DOWN in all cases
Problem:
When all the bricks are down at the time of mounting the volume, the mount
command hangs.
Fix:
1. Ignore all CHILD_CONNECTING events coming from subvolumes.
2. On timer expiration (without enough up or down children) send
CHILD_DOWN.
3. Once enough up or down subvolumes are detected, send the appropriate event.
When the rest of the subvols go up/down without changing the overall
ec-up/ec-down state, send CHILD_MODIFIED to the parent subvols.
Change-Id: Ie0194dbadef2dce36ab5eb7beece84a6bf3c631c
BUG: 1179180
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/9396
Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/ec/src/ec.c | 236 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec.h | 2 |
2 files changed, 134 insertions, 104 deletions
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 9f620c75457..76911de1461 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -195,151 +195,179 @@ failed: return -1; } -void ec_up(xlator_t * this, ec_t * ec) +glusterfs_event_t +ec_get_event_from_state (ec_t *ec) { - if (ec->timer != NULL) - { - gf_timer_call_cancel(this->ctx, ec->timer); - ec->timer = NULL; - } + int down_count = 0; - if (!ec->up && (ec->xl_up_count >= ec->fragments)) - { - if (ec->xl_up_count < ec->nodes) - { - gf_log("ec", GF_LOG_WARNING, "Starting volume with only %d bricks", - ec->xl_up_count); + if (ec->xl_up_count >= ec->fragments) + return GF_EVENT_CHILD_UP; + + down_count = ec->xl_notify_count - ec->xl_up_count; + if (down_count > ec->redundancy) + return GF_EVENT_CHILD_DOWN; + + return GF_EVENT_MAXVAL; +} + +void +ec_up (xlator_t *this, ec_t *ec) +{ + if (ec->timer != NULL) { + gf_timer_call_cancel (this->ctx, ec->timer); + ec->timer = NULL; } ec->up = 1; gf_log(this->name, GF_LOG_INFO, "Going UP"); - - default_notify(this, GF_EVENT_CHILD_UP, NULL); - } } -void ec_down(xlator_t * this, ec_t * ec) +void +ec_down (xlator_t *this, ec_t *ec) { - if (ec->timer != NULL) - { - gf_timer_call_cancel(this->ctx, ec->timer); - ec->timer = NULL; - } + if (ec->timer != NULL) { + gf_timer_call_cancel(this->ctx, ec->timer); + ec->timer = NULL; + } - if (ec->up) - { ec->up = 0; gf_log(this->name, GF_LOG_INFO, "Going DOWN"); - - default_notify(this, GF_EVENT_CHILD_DOWN, NULL); - } } -void ec_notify_up_cbk(void * data) +void +ec_notify_down (void *data) { - ec_t * ec = data; - - LOCK(&ec->lock); - - if (ec->timer != NULL) - { - ec_up(ec->xl, ec); - } + ec_t *ec = data; - UNLOCK(&ec->lock); + LOCK(&ec->lock); + { + if (!ec->timer) { + /* + * Either child_up/child_down is already sent to parent + * This is a spurious wake up. 
+ */ + goto unlock; + } + + gf_timer_call_cancel (ec->xl->ctx, ec->timer); + ec->timer = NULL; + + if (GF_EVENT_MAXVAL == ec_get_event_from_state (ec)) { + /* Change the state as if the bricks are down */ + ec->xl_notify = (1ULL << ec->nodes) - 1ULL; + ec->xl_notify_count = ec->nodes; + default_notify (ec->xl, GF_EVENT_CHILD_DOWN, NULL); + } + } +unlock: + UNLOCK(&ec->lock); } -int32_t ec_notify_up(xlator_t * this, ec_t * ec, int32_t idx) +void +ec_launch_notify_child_down_timer (xlator_t *this, ec_t *ec) { - struct timespec delay = {0, }; - - if (((ec->xl_up >> idx) & 1) == 0) - { - ec->xl_up |= 1ULL << idx; - ec->xl_up_count++; - - gf_log("ec", GF_LOG_DEBUG, "Child %d is UP (%lX, %u)", idx, ec->xl_up, - ec->xl_up_count); + struct timespec delay = {0, }; - if (ec->xl_up_count == ec->fragments) - { - gf_log("ec", GF_LOG_DEBUG, "Initiating up timer"); - - delay.tv_sec = 5; - delay.tv_nsec = 0; - ec->timer = gf_timer_call_after(this->ctx, delay, ec_notify_up_cbk, - ec); - if (ec->timer == NULL) - { - gf_log(this->name, GF_LOG_ERROR, "Cannot create timer for " - "delayed initialization"); - - return ENOMEM; - } + gf_log (this->name, GF_LOG_DEBUG, "Initiating child-down timer"); + delay.tv_sec = 10; + delay.tv_nsec = 0; + ec->timer = gf_timer_call_after (this->ctx, delay, ec_notify_down, ec); + if (ec->timer == NULL) { + gf_log(this->name, GF_LOG_ERROR, "Cannot create timer " + "for delayed initialization"); } - else if (ec->xl_up_count == ec->nodes) - { - ec_up(this, ec); +} + +void +ec_handle_up (xlator_t *this, ec_t *ec, int32_t idx) +{ + if (((ec->xl_notify >> idx) & 1) == 0) { + ec->xl_notify |= 1ULL << idx; + ec->xl_notify_count++; } - } - return EAGAIN; + if (((ec->xl_up >> idx) & 1) == 0) { /* Duplicate event */ + ec->xl_up |= 1ULL << idx; + ec->xl_up_count++; + } } -int32_t ec_notify_down(xlator_t * this, ec_t * ec, int32_t idx) +void +ec_handle_down (xlator_t *this, ec_t *ec, int32_t idx) { - if (((ec->xl_up >> idx) & 1) != 0) - { - gf_log("ec", GF_LOG_DEBUG, 
"Child %d is DOWN", idx); - - ec->xl_up ^= 1ULL << idx; - if (ec->xl_up_count-- == ec->fragments) - { - ec_down(this, ec); + if (((ec->xl_notify >> idx) & 1) == 0) { + ec->xl_notify |= 1ULL << idx; + ec->xl_notify_count++; } - } - return EAGAIN; + if (((ec->xl_up >> idx) & 1) != 0) { /* Duplicate event */ + gf_log(this->name, GF_LOG_DEBUG, "Child %d is DOWN", idx); + + ec->xl_up ^= 1ULL << idx; + ec->xl_up_count--; + } } -int32_t notify(xlator_t * this, int32_t event, void * data, ...) +int32_t +notify (xlator_t *this, int32_t event, void *data, ...) { ec_t * ec = this->private; int32_t idx = 0; int32_t error = 0; + glusterfs_event_t old_event = GF_EVENT_MAXVAL; + glusterfs_event_t new_event = GF_EVENT_MAXVAL; + + LOCK (&ec->lock); + + if (event == GF_EVENT_PARENT_UP) { + /* + * Start a timer which sends CHILD_DOWN event to parent + * xlator to prevent the 'mount' syscall from hanging. + */ + ec_launch_notify_child_down_timer (this, ec); + goto unlock; + } - LOCK(&ec->lock); - - for (idx = 0; idx < ec->nodes; idx++) - { - if (ec->xl_list[idx] == data) - { - break; + for (idx = 0; idx < ec->nodes; idx++) { + if (ec->xl_list[idx] == data) + break; } - } - gf_log("ec", GF_LOG_TRACE, "NOTIFY(%d): %p, %d", event, data, idx); + gf_log (this->name, GF_LOG_TRACE, "NOTIFY(%d): %p, %d", + event, data, idx); - if (idx < ec->nodes) - { - if (event == GF_EVENT_CHILD_UP) - { - error = ec_notify_up(this, ec, idx); - } - else if (event == GF_EVENT_CHILD_DOWN) - { - error = ec_notify_down(this, ec, idx); - } - } + if (idx < ec->nodes) { /* CHILD_* events */ - UNLOCK(&ec->lock); + old_event = ec_get_event_from_state (ec); - if (error == 0) - { - return default_notify(this, event, data); - } + if (event == GF_EVENT_CHILD_UP) { + ec_handle_up (this, ec, idx); + } else if (event == GF_EVENT_CHILD_DOWN) { + ec_handle_down (this, ec, idx); + } - return 0; + new_event = ec_get_event_from_state (ec); + + if (new_event == GF_EVENT_CHILD_UP && !ec->up) { + ec_up (this, ec); + } else if 
(new_event == GF_EVENT_CHILD_DOWN && ec->up) { + ec_down (this, ec); + } + + if ((new_event == old_event) && (new_event != GF_EVENT_MAXVAL)) + new_event = GF_EVENT_CHILD_MODIFIED; + + event = GF_EVENT_MAXVAL;/* Take care of notifying inside lock */ + if (new_event != GF_EVENT_MAXVAL) + error = default_notify (this, new_event, data); + } +unlock: + UNLOCK (&ec->lock); + + if (event != GF_EVENT_MAXVAL) + return default_notify (this, event, data); + + return error; } int32_t diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index cb3832ae097..e5b54b02966 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -36,6 +36,8 @@ struct _ec uint32_t idx; uint32_t xl_up_count; uintptr_t xl_up; + uint32_t xl_notify_count; + uintptr_t xl_notify; uintptr_t node_mask; xlator_t ** xl_list; gf_lock_t lock; |