author    Raghavendra G <rgowdapp@redhat.com>    2018-08-21 09:44:15 +0530
committer Raghavendra G <rgowdapp@redhat.com>    2018-08-23 15:40:57 +0000
commit    370f05546eeedab394ca0d333a6ca6637757f1e3 (patch)
tree      e77ffcd621df79251a67413272d1649d841815e2 /xlators
parent    59e560248771d3b95517a3e12c174e9acbf39585 (diff)
performance/write-behind: fix fulfill and readdirp race
Current invalidation of stats in wb_readdirp_cbk is prone to races. As the deleted comment explains:

<snip>
We cannot guarantee the integrity of entry->d_stat as there are cached
writes. The stat is most likely stale as it doesn't account for the
cached writes. However, checking for a non-empty liability list here is
not a fool-proof solution, as there can be races like:
1. readdirp is successful on posix
2. sync of a cached write is successful on posix
3. write-behind receives the sync response and removes the request from
   the liability queue
4. the readdirp response is processed at write-behind

In the above scenario, the stat for the file is sent back in the
readdirp response, but it is stale.
</snip>

The fix is to mark readdirp sessions (tracked in this patch by a
non-zero value of "readdirps" on the parent inode) and, if a fulfill
completes while one or more readdirp sessions are in progress, mark the
inode so that wb_readdirp_cbk doesn't send iatts for that inode in the
readdirp response. Note that wb_readdirp_cbk already checks for a
non-empty liability queue and invalidates the iatt. Since the only way
the liability queue can shrink is by fulfilling the requests on it,
wb_fulfill_cbk indicates to wb_readdirp_cbk that a potential race could
have happened between readdirp and fulfill.

Change-Id: I12d167bf450648baa64be1cbe1ca0fddf5379521
Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
updates: bz#1512691
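In outline, the synchronization scheme this patch adds can be modelled as below. This is a minimal sketch, not the xlator code: dir_state_t, file_state_t, the plain pthread mutex, and the helper names are simplified stand-ins for the real wb_inode_t, gf_lock_t and GF_ATOMIC_* machinery in the patch.

/* Minimal model of the readdirp/fulfill race fix: a per-directory
 * "readdirps" session count, plus a per-file "invalidate" flag that is
 * raised whenever a cached write is fulfilled while a readdirp session
 * is in flight on the parent directory.  Locks are assumed to be
 * initialized with PTHREAD_MUTEX_INITIALIZER. */
#include <pthread.h>
#include <stdbool.h>

typedef struct dir_state {
    pthread_mutex_t lock;
    int readdirps;       /* readdirp sessions in progress */
} dir_state_t;

typedef struct file_state {
    dir_state_t *parent;
    bool invalidate;     /* don't trust cached iatt in a readdirp reply */
    bool has_liability;  /* cached writes not yet synced */
} file_state_t;

/* Analogous to wb_readdirp: a session starts before winding down. */
static void readdirp_start(dir_state_t *dir)
{
    pthread_mutex_lock(&dir->lock);
    dir->readdirps++;
    pthread_mutex_unlock(&dir->lock);
}

/* Analogous to wb_fulfill_cbk -> wb_set_invalidate: a lie was just
 * synced; if the parent has a live readdirp session, its reply may
 * carry a stale stat for this file, so mark it. */
static void fulfill_done(file_state_t *file)
{
    dir_state_t *dir = file->parent;

    pthread_mutex_lock(&dir->lock);
    file->invalidate = (dir->readdirps > 0);
    pthread_mutex_unlock(&dir->lock);
}

/* Analogous to the check in wb_readdirp_cbk: keep an entry's iatt only
 * if there are no cached writes and no pending invalidation. */
static bool keep_stat(file_state_t *file)
{
    return !file->has_liability && !file->invalidate;
}

/* Analogous to wb_mark_readdirp_end: when the last session ends, clear
 * the marks (the real code walks an invalidate_list on the directory). */
static void readdirp_end(dir_state_t *dir, file_state_t *files[], int n)
{
    pthread_mutex_lock(&dir->lock);
    if (--dir->readdirps == 0)
        for (int i = 0; i < n; i++)
            files[i]->invalidate = false;
    pthread_mutex_unlock(&dir->lock);
}

The essential invariant is that fulfill_done and readdirp_end serialize on the parent directory's lock, so a fulfill that completes during a session always leaves a mark that the readdirp callback will see.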
Diffstat (limited to 'xlators')
-rw-r--r--  xlators/performance/write-behind/src/write-behind.c | 169
1 file changed, 136 insertions(+), 33 deletions(-)
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index b2a05881fcc..9fc8391632b 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -84,6 +84,13 @@ typedef struct wb_inode {
writes are in progress at the same time. Modules
like eager-lock in AFR depend on this behavior.
*/
+ list_head_t invalidate_list; /* list of wb_inodes that were marked for
+ * iatt invalidation due to requests in
+ * liability queue fulfilled while there
+ * was a readdirp session on parent
+ * directory. For a directory inode, this
+ * list points to list of children.
+ */
uint64_t gen; /* Liability generation number. Represents
the current 'state' of liability. Every
new addition to the liability list bumps
@@ -107,6 +114,7 @@ typedef struct wb_inode {
size_t size; /* Size of the file to catch write after EOF. */
gf_lock_t lock;
xlator_t *this;
+ inode_t *inode;
int dontsync; /* If positive, don't pick lies for
* winding. This is needed to break infinite
* recursion during invocation of
@@ -114,6 +122,8 @@ typedef struct wb_inode {
* wb_fulfill_cbk in case of an
* error during fulfill.
*/
+ gf_atomic_int32_t readdirps;
+ gf_atomic_int8_t invalidate;
} wb_inode_t;
@@ -186,9 +196,6 @@ typedef struct wb_conf {
} wb_conf_t;
-void
-wb_process_queue (wb_inode_t *wb_inode);
-
wb_inode_t *
__wb_inode_ctx_get (xlator_t *this, inode_t *inode)
@@ -225,6 +232,44 @@ out:
}
+static void
+wb_set_invalidate (wb_inode_t *wb_inode, int set)
+{
+ int readdirps = 0;
+ inode_t *parent_inode = NULL;
+ wb_inode_t *wb_parent_inode = NULL;
+
+ parent_inode = inode_parent (wb_inode->inode, NULL, NULL);
+ if (parent_inode)
+ wb_parent_inode = wb_inode_ctx_get (wb_inode->this,
+ parent_inode);
+
+ if (wb_parent_inode) {
+ LOCK (&wb_parent_inode->lock);
+ {
+ readdirps = GF_ATOMIC_GET (wb_parent_inode->readdirps);
+ if (readdirps && set) {
+ GF_ATOMIC_SWAP (wb_inode->invalidate, 1);
+ list_del_init (&wb_inode->invalidate_list);
+ list_add (&wb_inode->invalidate_list,
+ &wb_parent_inode->invalidate_list);
+ } else if (readdirps == 0) {
+ GF_ATOMIC_SWAP (wb_inode->invalidate, 0);
+ list_del_init (&wb_inode->invalidate_list);
+ }
+ }
+ UNLOCK (&wb_parent_inode->lock);
+ } else {
+ GF_ATOMIC_SWAP (wb_inode->invalidate, 0);
+ }
+
+ return;
+}
+
+void
+wb_process_queue (wb_inode_t *wb_inode);
+
+
/*
Below is a succinct explanation of the code deciding whether two regions
overlap, from Pavan <tcp@gluster.com>.
@@ -646,12 +691,16 @@ __wb_inode_create (xlator_t *this, inode_t *inode)
INIT_LIST_HEAD (&wb_inode->liability);
INIT_LIST_HEAD (&wb_inode->temptation);
INIT_LIST_HEAD (&wb_inode->wip);
+ INIT_LIST_HEAD (&wb_inode->invalidate_list);
wb_inode->this = this;
wb_inode->window_conf = conf->window_size;
+ wb_inode->inode = inode;
LOCK_INIT (&wb_inode->lock);
+ GF_ATOMIC_INIT (wb_inode->invalidate, 0);
+ GF_ATOMIC_INIT (wb_inode->readdirps, 0);
ret = __inode_ctx_put (inode, this, (uint64_t)(unsigned long)wb_inode);
if (ret) {
@@ -1047,6 +1096,30 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
wb_inode = head->wb_inode;
+ /* There could be a readdirp session in progress. Since wb_fulfill_cbk
+ * can potentially remove a request from liability queue,
+ * wb_readdirp_cbk will miss writes on this inode (as it invalidates
+ * stats only if liability queue is not empty) and hence mark inode
+ * for invalidation of stats in readdirp response. Specifically this
+ * code fixes the following race mentioned in wb_readdirp_cbk:
+ */
+
+ /* <removed comment from wb_readdirp_cbk>
+ * We cannot guarantee integrity of entry->d_stat as there are cached
+ * writes. The stat is most likely stale as it doesn't account the
+ * cached writes. However, checking for non-empty liability list here is
+ * not a fool-proof solution as there can be races like,
+ * 1. readdirp is successful on posix
+ * 2. sync of cached write is successful on posix
+ * 3. write-behind received sync response and removed the request from
+ * liability queue
+ * 4. readdirp response is processed at write-behind
+ *
+ * In the above scenario, stat for the file is sent back in readdirp
+ * response but it is stale.
+ * </comment> */
+ wb_set_invalidate (wb_inode, 1);
+
if (op_ret == -1) {
wb_fulfill_err (head, op_errno);
} else if (op_ret < head->total_size) {
@@ -1070,14 +1143,13 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
head->total_size += req->write_size; \
} while (0)
-
int
wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
{
struct iovec vector[MAX_VECTOR_COUNT];
- int count = 0;
- wb_request_t *req = NULL;
- call_frame_t *frame = NULL;
+ int count = 0;
+ wb_request_t *req = NULL;
+ call_frame_t *frame = NULL;
/* make sure head->total_size is updated before we run into any
* errors
@@ -1148,7 +1220,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
conf = wb_inode->this->private;
- list_for_each_entry_safe (req, tmp, liabilities, winds) {
+ list_for_each_entry_safe (req, tmp, liabilities, winds) {
list_del_init (&req->winds);
if (!head) {
@@ -2442,6 +2514,48 @@ noqueue:
return 0;
}
+static void
+wb_mark_readdirp_start (xlator_t *this, inode_t *directory)
+{
+ wb_inode_t *wb_directory_inode = NULL;
+
+ wb_directory_inode = wb_inode_create (this, directory);
+
+ LOCK (&wb_directory_inode->lock);
+ {
+ GF_ATOMIC_INC (wb_directory_inode->readdirps);
+ }
+ UNLOCK (&wb_directory_inode->lock);
+
+ return;
+}
+
+static void
+wb_mark_readdirp_end (xlator_t *this, inode_t *directory)
+{
+ wb_inode_t *wb_directory_inode = NULL, *wb_inode = NULL, *tmp = NULL;
+ int readdirps = 0;
+
+ wb_directory_inode = wb_inode_ctx_get (this, directory);
+
+ LOCK (&wb_directory_inode->lock);
+ {
+ readdirps = GF_ATOMIC_DEC (wb_directory_inode->readdirps);
+ if (readdirps)
+ goto unlock;
+
+ list_for_each_entry_safe (wb_inode, tmp,
+ &wb_directory_inode->invalidate_list,
+ invalidate_list) {
+ list_del_init (&wb_inode->invalidate_list);
+ GF_ATOMIC_SWAP (wb_inode->invalidate, 0);
+ }
+ }
+unlock:
+ UNLOCK (&wb_directory_inode->lock);
+
+ return;
+}
int32_t
wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -2451,6 +2565,10 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
wb_inode_t *wb_inode = NULL;
gf_dirent_t *entry = NULL;
inode_t *inode = NULL;
+ fd_t *fd = NULL;
+
+ fd = frame->local;
+ frame->local = NULL;
if (op_ret <= 0)
goto unwind;
@@ -2465,29 +2583,8 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&wb_inode->lock);
{
- if (!list_empty (&wb_inode->liability)) {
- /* We cannot guarantee integrity of
- entry->d_stat as there are cached writes.
- The stat is most likely stale as it doesn't
- account the cached writes. However, checking
- for non-empty liability list here is not a
- fool-proof solution as there can be races
- like,
- 1. readdirp is successful on posix
- 2. sync of cached write is successful on
- posix
- 3. write-behind received sync response and
- removed the request from liability queue
- 4. readdirp response is processed at
- write-behind
-
- In the above scenario, stat for the file is
- sent back in readdirp response but it is
- stale.
-
- For lack of better solutions I am sticking
- with current solution.
- */
+ if (!list_empty (&wb_inode->liability) ||
+ GF_ATOMIC_GET (wb_inode->invalidate)) {
inode = entry->inode;
entry->inode = NULL;
@@ -2500,9 +2597,11 @@ wb_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
UNLOCK (&wb_inode->lock);
}
+ wb_mark_readdirp_end (this, fd->inode);
+
unwind:
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno,
- entries, xdata);
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
return 0;
}
@@ -2511,6 +2610,10 @@ int32_t
wb_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t off, dict_t *xdata)
{
+ wb_mark_readdirp_start (this, fd->inode);
+
+ frame->local = fd;
+
STACK_WIND (frame, wb_readdirp_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->readdirp,
fd, size, off, xdata);
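Reusing the types from the sketch after the commit message, the last two hunks amount to the bracket pattern below. This is again a hypothetical model, not the real fops: the session count is bumped before the call is wound down and dropped in the callback only after the entries have been filtered, so any fulfill completing in between observes readdirps > 0. (The real code stashes the fd in frame->local so the callback can recover the directory inode.)

/* Hypothetical bracket mirroring wb_readdirp / wb_readdirp_cbk. */
int readdirp(dir_state_t *dir)
{
    readdirp_start(dir);            /* before STACK_WIND */
    /* ... wind readdirp to the child xlator ... */
    return 0;
}

int readdirp_cbk(dir_state_t *dir, file_state_t *entries[], int n)
{
    int kept = 0;

    /* Only entries with no cached writes and no pending invalidation
     * keep their iatt in the reply; for the others the real code drops
     * entry->inode so upper layers refresh the stat themselves. */
    for (int i = 0; i < n; i++)
        if (keep_stat(entries[i]))
            kept++;

    readdirp_end(dir, entries, n);  /* after filtering, before unwind */
    return kept;
}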