cluster/ec: Fix all EIO errors in EC

Backport of http://review.gluster.org/10770 Backport of http://review.gluster.org/10806 Backport of http://review.gluster.org/10787 Backport of http://review.gluster.org/10868 Backport of http://review.gluster.com/10852 - When a blocking lock is requested, the lock request succeeds even when only ec->fragment number of locks are acquired successfully in the non-blocking locking phase. This will lead to the fop succeeding only on the bricks where the locks are acquired, leading to the necessity of self-heals. To prevent these unnecessary self-heals, if the remaining locks fail with EAGAIN in the non-blocking lock phase, try the blocking locking phase instead. - Handle lookup failures while op in progress - cluster/ec: Correctly cleanup delayed locks: When a delayed lock is pending, a graph switch doesn't correctly terminate it. This means that the update of version and size xattrs is lost, causing EIO errors. This patch handles the GF_EVENT_PARENT_DOWN event to correctly finish pending updates before completing the graph switch. - Fix use after free crash: ec_heal creates ec_fop_data but doesn't run ec_manager. ec_fop_data_allocate adds this fop to ec->pending_fops; because ec_manager is not run on this heal fop, it is never removed from ec->pending_fops. When it is accessed after free, it leads to a crash. It is better not to add HEAL fops to ec->pending_fops because we don't want a graph switch to hang the mount because of a BIG file/directory heal. - Forced unlock when lock contention is detected: EC uses an eager lock mechanism to optimize multiple read/write requests on the same entry or inode. This increases performance but can have adverse results when other clients try to access the same entry/inode. To solve this, this patch adds functionality to detect when this happens and force an earlier release so as not to block other clients. The method consists of requesting GF_GLUSTERFS_INODELK_COUNT and GF_GLUSTERFS_ENTRYLK_COUNT for all fops that take a lock. 
When this count is greater than one, the lock is marked to be released. All fops already waiting for this lock will be executed normally before releasing the lock, but new requests that also require it will be blocked and restarted after the lock has been released and reacquired again. Another problem was that some operations did correctly lock the parent of an entry when needed, but got the size and version xattrs from the entry instead of the parent. This patch solves this problem by binding all queries of size and version to each lock and replacing all entrylk calls by inodelk ones to remove concurrent updates on directory metadata. This also allows rename to correctly update source and destination directories. BUG: 1225279 Change-Id: I02a6084b138dd38e018a462347cd9ce38610c7ef Reviewed-on: http://review.gluster.org/10926 Tested-by: NetBSD Build System Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
author: Pranith Kumar K <pkarampu@redhat.com> 2015-05-13 16:57:49 +0530
committer: Pranith Kumar Karampuri <pkarampu@redhat.com> 2015-05-28 04:12:06 -0700
commit: 3a57ca8ee29ea8e3d3c5bbf28a56a821bfa99d99 (patch)
tree: 7e919238192422d4bb1f8a86a950013b286b41b3 /xlators/cluster/ec/src/ec-data.h
parent: 2b8fdde926532014f19d850b1321a4c7046dc001 (diff)
1 files changed, 48 insertions, 36 deletions
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 9e5c92dd5b8..8204cf087de 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -67,10 +67,20 @@ struct _ec_fd
 struct _ec_inode
 {
     uintptr_t         bad;
-    ec_lock_t        *entry_lock;
     ec_lock_t        *inode_lock;
+    gf_boolean_t      have_info;
+    gf_boolean_t      have_config;
+    gf_boolean_t      have_version;
+    gf_boolean_t      have_size;
+    gf_boolean_t      have_dirty;
+    ec_config_t       config;
+    uint64_t          pre_version[2];
+    uint64_t          post_version[2];
+    uint64_t          pre_size;
+    uint64_t          post_size;
+    uint64_t          pre_dirty[2];
+    uint64_t          post_dirty[2];
     struct list_head  heal;
-
 };
 
 typedef int32_t (* fop_heal_cbk_t)(call_frame_t *, void * cookie, xlator_t *,
@@ -80,7 +90,6 @@ typedef int32_t (* fop_fheal_cbk_t)(call_frame_t *, void * cookie, xlator_t *,
                                     int32_t, int32_t, uintptr_t, uintptr_t,
                                     uintptr_t, dict_t *);
 
-
 union _ec_cbk
 {
     fop_access_cbk_t       access;
@@ -132,21 +141,21 @@ union _ec_cbk
 
 struct _ec_lock
 {
-    ec_lock_t        **plock;
+    ec_inode_t        *ctx;
     gf_timer_t        *timer;
-    struct list_head   waiting;
+    struct list_head   waiting; /* Queue of requests being serviced. */
+    struct list_head   frozen;  /* Queue of requests that will be serviced in
+                                   the next unlock/lock cycle. */
     uintptr_t          mask;
     uintptr_t          good_mask;
-    int32_t            kind;
     int32_t            refs;
-    int32_t            acquired;
-    int32_t            have_size;
-    uint64_t           size;
-    uint64_t           size_delta;
-    uint64_t           version[2];
-    uint64_t           version_delta[2];
-    gf_boolean_t       is_dirty[2];
+    int32_t            refs_frozen;
+    int32_t            inserted;
+    gf_boolean_t       acquired;
+    gf_boolean_t       release;
+    gf_boolean_t       query;
     ec_fop_data_t     *owner;
+    fd_t              *fd;
     loc_t              loc;
     union
     {
@@ -157,9 +166,12 @@ struct _ec_lock
 
 struct _ec_lock_link
 {
-    ec_lock_t *      lock;
-    ec_fop_data_t *  fop;
-    struct list_head wait_list;
+    ec_lock_t        *lock;
+    ec_fop_data_t    *fop;
+    struct list_head  wait_list;
+    gf_boolean_t      update[2];
+    loc_t            *base;
+    uint64_t          size;
 };
 
 struct _ec_fop_data
@@ -172,22 +184,19 @@ struct _ec_fop_data
     int32_t            winds;
     int32_t            jobs;
     int32_t            error;
-    ec_fop_data_t *    parent;
-    xlator_t *         xl;
-    call_frame_t *     req_frame;   // frame of the calling xlator
-    call_frame_t *     frame;       // frame used by this fop
-    struct list_head   cbk_list;    // sorted list of groups of answers
-    struct list_head   answer_list; // list of answers
-    ec_cbk_data_t *    answer;      // accepted answer
+    ec_fop_data_t     *parent;
+    xlator_t          *xl;
+    call_frame_t      *req_frame;    /* frame of the calling xlator */
+    call_frame_t      *frame;        /* frame used by this fop */
+    struct list_head   cbk_list;     /* sorted list of groups of answers */
+    struct list_head   answer_list;  /* list of answers */
+    struct list_head   pending_list; /* member of ec_t.pending_fops */
+    ec_cbk_data_t     *answer;       /* accepted answer */
     int32_t            lock_count;
     int32_t            locked;
     ec_lock_link_t     locks[2];
-    int32_t            locks_update;
-    int32_t            have_size;
-    uint64_t           pre_size;
-    uint64_t           post_size;
+    int32_t            first_lock;
     gf_lock_t          lock;
-    ec_config_t        config;
 
     uint32_t           flags;
     uint32_t           first;
@@ -196,6 +205,7 @@ struct _ec_fop_data
                                   if fop->minimum number of subvolumes succeed
                                   which are not healing*/
     uintptr_t          remaining;
+    uintptr_t          received; /* Mask of responses */
     uintptr_t          good;
     uintptr_t          bad;
 
@@ -203,7 +213,7 @@ struct _ec_fop_data
     ec_handler_f       handler;
     ec_resume_f        resume;
     ec_cbk_t           cbks;
-    void *             data;
+    void              *data;
     ec_heal_t         *heal;
 
     uint64_t           user_size;
@@ -211,8 +221,8 @@ struct _ec_fop_data
 
     int32_t            use_fd;
 
-    dict_t *           xdata;
-    dict_t *           dict;
+    dict_t            *xdata;
+    dict_t            *dict;
     int32_t            int32;
     uint32_t           uint32;
     uint64_t           size;
@@ -222,14 +232,14 @@ struct _ec_fop_data
     entrylk_type       entrylk_type;
     gf_xattrop_flags_t xattrop_flags;
     dev_t              dev;
-    inode_t *          inode;
-    fd_t *             fd;
+    inode_t           *inode;
+    fd_t              *fd;
     struct iatt        iatt;
-    char *             str[2];
+    char              *str[2];
     loc_t              loc[2];
     struct gf_flock    flock;
-    struct iovec *     vector;
-    struct iobref *    buffers;
+    struct iovec      *vector;
+    struct iobref     *buffers;
 };
 
 struct _ec_cbk_data
@@ -299,4 +309,6 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
 void ec_fop_data_acquire(ec_fop_data_t * fop);
 void ec_fop_data_release(ec_fop_data_t * fop);
 
+void ec_fop_cleanup(ec_fop_data_t *fop);
+
 #endif /* __EC_DATA_H__ */
author	Pranith Kumar K <pkarampu@redhat.com>	2015-05-13 16:57:49 +0530
committer	Pranith Kumar Karampuri <pkarampu@redhat.com>	2015-05-28 04:12:06 -0700
commit	3a57ca8ee29ea8e3d3c5bbf28a56a821bfa99d99 (patch)
tree	7e919238192422d4bb1f8a86a950013b286b41b3 /xlators/cluster/ec/src/ec-data.h
parent	2b8fdde926532014f19d850b1321a4c7046dc001 (diff)