diff options
| author | Pranith Kumar K <pkarampu@redhat.com> | 2015-05-13 16:57:49 +0530 |
|---|---|---|
| committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-05-28 04:12:06 -0700 |
| commit | 3a57ca8ee29ea8e3d3c5bbf28a56a821bfa99d99 (patch) | |
| tree | 7e919238192422d4bb1f8a86a950013b286b41b3 /xlators/cluster/ec/src/ec-data.h | |
| parent | 2b8fdde926532014f19d850b1321a4c7046dc001 (diff) | |
cluster/ec: Fix all EIO errors in EC
Backport of http://review.gluster.org/10770
Backport of http://review.gluster.org/10806
Backport of http://review.gluster.org/10787
Backport of http://review.gluster.org/10868
Backport of http://review.gluster.com/10852
- When a blocking lock is requested, the lock request succeeds even when
ec->fragment number of locks are acquired successfully in the non-blocking
locking phase. This leads to the fop succeeding only on the bricks where the
locks are acquired, making self-heals necessary. To prevent these unnecessary
self-heals, if the remaining locks fail with EAGAIN in the non-blocking lock
phase, try the blocking locking phase instead.
- Handle lookup failures while op in progress
- cluster/ec: Correctly cleanup delayed locks
When a delayed lock is pending, a graph switch doesn't correctly
terminate it. This means that the update of version and size xattrs
is lost, causing EIO errors. This patch handles GF_EVENT_PARENT_DOWN
event to correctly finish pending updates before completing the
graph switch.
- Fix use after free crash
ec_heal creates ec_fop_data but doesn't run ec_manager. ec_fop_data_allocate
adds this fop to ec->pending_fops; because ec_manager is not run on this heal
fop, it is never removed from ec->pending_fops. When it is accessed after free,
it leads to a crash. It is better not to add HEAL fops to ec->pending_fops
because we don't want a graph switch to hang the mount because of a BIG
file/directory heal.
- Forced unlock when lock contention is detected
EC uses an eager lock mechanism to optimize multiple read/write
requests on the same entry or inode. This increases performance
but can have adverse results when other clients try to access the
same entry/inode. To solve this, this patch adds a functionality
to detect when this happens and force an earlier release to not
block other clients.
The method consists of requesting GF_GLUSTERFS_INODELK_COUNT and
GF_GLUSTERFS_ENTRYLK_COUNT for all fops that take a lock. When this
count is greater than one, the lock is marked to be released. All
fops already waiting for this lock will be executed normally before
releasing the lock, but new requests that also require it will be
blocked and restarted after the lock has been released and reacquired
again.
Another problem was that some operations did correctly lock the
parent of an entry when needed, but got the size and version xattrs
from the entry instead of the parent.
This patch solves this problem by binding all queries of size and
version to each lock and replacing all entrylk calls by inodelk ones
to remove concurrent updates on directory metadata. This also allows
rename to correctly update source and destination directories.
BUG: 1225279
Change-Id: I02a6084b138dd38e018a462347cd9ce38610c7ef
Reviewed-on: http://review.gluster.org/10926
Tested-by: NetBSD Build System
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-data.h')
| -rw-r--r-- | xlators/cluster/ec/src/ec-data.h | 84 |
1 files changed, 48 insertions, 36 deletions
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h index 9e5c92dd5b8..8204cf087de 100644 --- a/xlators/cluster/ec/src/ec-data.h +++ b/xlators/cluster/ec/src/ec-data.h @@ -67,10 +67,20 @@ struct _ec_fd struct _ec_inode { uintptr_t bad; - ec_lock_t *entry_lock; ec_lock_t *inode_lock; + gf_boolean_t have_info; + gf_boolean_t have_config; + gf_boolean_t have_version; + gf_boolean_t have_size; + gf_boolean_t have_dirty; + ec_config_t config; + uint64_t pre_version[2]; + uint64_t post_version[2]; + uint64_t pre_size; + uint64_t post_size; + uint64_t pre_dirty[2]; + uint64_t post_dirty[2]; struct list_head heal; - }; typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, @@ -80,7 +90,6 @@ typedef int32_t (* fop_fheal_cbk_t)(call_frame_t *, void * cookie, xlator_t *, int32_t, int32_t, uintptr_t, uintptr_t, uintptr_t, dict_t *); - union _ec_cbk { fop_access_cbk_t access; @@ -132,21 +141,21 @@ union _ec_cbk struct _ec_lock { - ec_lock_t **plock; + ec_inode_t *ctx; gf_timer_t *timer; - struct list_head waiting; + struct list_head waiting; /* Queue of requests being serviced. */ + struct list_head frozen; /* Queue of requests that will be serviced in + the next unlock/lock cycle. 
*/ uintptr_t mask; uintptr_t good_mask; - int32_t kind; int32_t refs; - int32_t acquired; - int32_t have_size; - uint64_t size; - uint64_t size_delta; - uint64_t version[2]; - uint64_t version_delta[2]; - gf_boolean_t is_dirty[2]; + int32_t refs_frozen; + int32_t inserted; + gf_boolean_t acquired; + gf_boolean_t release; + gf_boolean_t query; ec_fop_data_t *owner; + fd_t *fd; loc_t loc; union { @@ -157,9 +166,12 @@ struct _ec_lock struct _ec_lock_link { - ec_lock_t * lock; - ec_fop_data_t * fop; - struct list_head wait_list; + ec_lock_t *lock; + ec_fop_data_t *fop; + struct list_head wait_list; + gf_boolean_t update[2]; + loc_t *base; + uint64_t size; }; struct _ec_fop_data @@ -172,22 +184,19 @@ struct _ec_fop_data int32_t winds; int32_t jobs; int32_t error; - ec_fop_data_t * parent; - xlator_t * xl; - call_frame_t * req_frame; // frame of the calling xlator - call_frame_t * frame; // frame used by this fop - struct list_head cbk_list; // sorted list of groups of answers - struct list_head answer_list; // list of answers - ec_cbk_data_t * answer; // accepted answer + ec_fop_data_t *parent; + xlator_t *xl; + call_frame_t *req_frame; /* frame of the calling xlator */ + call_frame_t *frame; /* frame used by this fop */ + struct list_head cbk_list; /* sorted list of groups of answers */ + struct list_head answer_list; /* list of answers */ + struct list_head pending_list; /* member of ec_t.pending_fops */ + ec_cbk_data_t *answer; /* accepted answer */ int32_t lock_count; int32_t locked; ec_lock_link_t locks[2]; - int32_t locks_update; - int32_t have_size; - uint64_t pre_size; - uint64_t post_size; + int32_t first_lock; gf_lock_t lock; - ec_config_t config; uint32_t flags; uint32_t first; @@ -196,6 +205,7 @@ struct _ec_fop_data if fop->minimum number of subvolumes succeed which are not healing*/ uintptr_t remaining; + uintptr_t received; /* Mask of responses */ uintptr_t good; uintptr_t bad; @@ -203,7 +213,7 @@ struct _ec_fop_data ec_handler_f handler; ec_resume_f 
resume; ec_cbk_t cbks; - void * data; + void *data; ec_heal_t *heal; uint64_t user_size; @@ -211,8 +221,8 @@ struct _ec_fop_data int32_t use_fd; - dict_t * xdata; - dict_t * dict; + dict_t *xdata; + dict_t *dict; int32_t int32; uint32_t uint32; uint64_t size; @@ -222,14 +232,14 @@ struct _ec_fop_data entrylk_type entrylk_type; gf_xattrop_flags_t xattrop_flags; dev_t dev; - inode_t * inode; - fd_t * fd; + inode_t *inode; + fd_t *fd; struct iatt iatt; - char * str[2]; + char *str[2]; loc_t loc[2]; struct gf_flock flock; - struct iovec * vector; - struct iobref * buffers; + struct iovec *vector; + struct iobref *buffers; }; struct _ec_cbk_data @@ -299,4 +309,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this, void ec_fop_data_acquire(ec_fop_data_t * fop); void ec_fop_data_release(ec_fop_data_t * fop); +void ec_fop_cleanup(ec_fop_data_t *fop); + #endif /* __EC_DATA_H__ */ |
