diff options
author | Raghavendra G <rgowdapp@redhat.com> | 2018-07-01 06:43:51 +0530 |
---|---|---|
committer | Raghavendra G <rgowdapp@redhat.com> | 2018-08-03 14:47:03 +0000 |
commit | 7959147cfa75b3dd0320090e2173bb90930788e7 (patch) | |
tree | 65bc98ab6cc75e4f6e3a18276e3b0dc4a0f93478 /xlators/performance/open-behind | |
parent | 93d7f3f2da52b6fb513ad2ff91c5bd0589d1482b (diff) |
performance/ob: stringent synchronization between rename/unlink and open
Issue 1:
========
open all pending fds before resuming rename and unlink
currently ob uses fd_lookup to find out the opened-behind. But,
fd_lookup gives the recent fd opened on the inode, but the oldest
fd(s) (there can be multiple fds opened-behind when the very first
opens on an inode are issued in parallel) are the candidates for fds
with pending opens on backend. So, this patch explictily tracks the
opened-behind fds on an inode and opens them before resuming rename or
unlink.
similar code changes are also done for setattr and setxattr to make
sure pending opens are complete before permission change.
This patch also adds a check for an open-in-progress to
ob_get_wind_fd. If there is already an open-in-progress,
ob_get_wind_fd won't return an anonymous fd as a result. This is done
to make sure that rename/unlink/setattr/setxattr don't race with an
operation like readv/fstat on an anonymous fd already in progress.
Issue 2:
========
once renamed/unlinked, don't open-behind any future opens on the same
inode.
Issue 3:
========
Don't use anonymous fds by default. Note that rename/unlink can race
with a read/fd on anonymous fds and these operations can fail with
ESTALE. So, for better consistency in default mode, don't use
anonymous fds. If performance is needed with tradeoff of consistency,
one can switch on the option "use-anonymous-fd"
Change-Id: Iaf130db71ce61ac37269f422e348a45f6ae6e82c
Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
Updates: bz#1512691
Diffstat (limited to 'xlators/performance/open-behind')
-rw-r--r-- | xlators/performance/open-behind/src/open-behind-mem-types.h | 1 | ||||
-rw-r--r-- | xlators/performance/open-behind/src/open-behind.c | 396 |
2 files changed, 330 insertions, 67 deletions
diff --git a/xlators/performance/open-behind/src/open-behind-mem-types.h b/xlators/performance/open-behind/src/open-behind-mem-types.h index 1e94296f424..8ddd061a835 100644 --- a/xlators/performance/open-behind/src/open-behind-mem-types.h +++ b/xlators/performance/open-behind/src/open-behind-mem-types.h @@ -16,6 +16,7 @@ enum gf_ob_mem_types_ { gf_ob_mt_fd_t = gf_common_mt_end + 1, gf_ob_mt_conf_t, + gf_ob_mt_inode_t, gf_ob_mt_end }; #endif diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c index 396bf29f40e..ec2be508bda 100644 --- a/xlators/performance/open-behind/src/open-behind.c +++ b/xlators/performance/open-behind/src/open-behind.c @@ -32,6 +32,16 @@ typedef struct ob_conf { */ } ob_conf_t; +typedef struct ob_inode { + inode_t *inode; + struct list_head resume_fops; + struct list_head ob_fds; + int count; + int op_ret; + int op_errno; + gf_boolean_t open_in_progress; + int unlinked; +} ob_inode_t; typedef struct ob_fd { call_frame_t *open_frame; @@ -39,9 +49,79 @@ typedef struct ob_fd { dict_t *xdata; int flags; int op_errno; + ob_inode_t *ob_inode; + fd_t *fd; + gf_boolean_t opened; + gf_boolean_t ob_inode_fops_waiting; struct list_head list; + struct list_head ob_fds_on_inode; } ob_fd_t; +ob_inode_t * +ob_inode_alloc (inode_t *inode) +{ + ob_inode_t *ob_inode = NULL; + + ob_inode = GF_CALLOC (1, sizeof (*ob_inode), gf_ob_mt_inode_t); + if (ob_inode == NULL) + goto out; + + ob_inode->inode = inode; + INIT_LIST_HEAD (&ob_inode->resume_fops); + INIT_LIST_HEAD (&ob_inode->ob_fds); +out: + return ob_inode; +} + +void +ob_inode_free (ob_inode_t *ob_inode) +{ + if (ob_inode == NULL) + goto out; + + list_del_init (&ob_inode->resume_fops); + list_del_init (&ob_inode->ob_fds); + + GF_FREE (ob_inode); +out: + return; +} + +ob_inode_t * +ob_inode_get (xlator_t *this, inode_t *inode) +{ + ob_inode_t *ob_inode = NULL; + uint64_t value = 0; + int ret = 0; + + if (!inode) + goto out; + + LOCK (&inode->lock); + { + __inode_ctx_get (inode, this, &value); + if (value == 0) { + ob_inode = ob_inode_alloc (inode); + if (ob_inode == NULL) + goto unlock; + + value = (uint64_t)((void *)ob_inode); + ret = __inode_ctx_set (inode, this, &value); + if (ret < 0) { + ob_inode_free (ob_inode); + ob_inode = NULL; + } + } else { + ob_inode = (ob_inode_t *) value; + } + } +unlock: + UNLOCK (&inode->lock); + +out: + return ob_inode; +} + ob_fd_t * __ob_fd_ctx_get (xlator_t *this, fd_t *fd) @@ -112,6 +192,7 @@ ob_fd_new (void) ob_fd = GF_CALLOC (1, sizeof (*ob_fd), gf_ob_mt_fd_t); INIT_LIST_HEAD (&ob_fd->list); + INIT_LIST_HEAD (&ob_fd->ob_fds_on_inode); return ob_fd; } @@ -120,6 +201,12 @@ ob_fd_new (void) void ob_fd_free (ob_fd_t *ob_fd) { + LOCK (&ob_fd->fd->inode->lock); + { + list_del_init (&ob_fd->ob_fds_on_inode); + } + UNLOCK (&ob_fd->fd->inode->lock); + loc_wipe (&ob_fd->loc); if (ob_fd->xdata) @@ -136,21 +223,32 @@ int ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd_ret, dict_t *xdata) { - fd_t *fd = NULL; - struct list_head list; - ob_fd_t *ob_fd = NULL; - call_stub_t *stub = NULL, *tmp = NULL; + fd_t *fd = NULL; + int count = 0; + int ob_inode_op_ret = 0; + int ob_inode_op_errno = 0; + ob_fd_t *ob_fd = NULL; + call_stub_t *stub = NULL, *tmp = NULL; + ob_inode_t *ob_inode = NULL; + gf_boolean_t ob_inode_fops_waiting = _gf_false; + struct list_head fops_waiting_on_fd, fops_waiting_on_inode; fd = frame->local; frame->local = NULL; - INIT_LIST_HEAD (&list); + INIT_LIST_HEAD (&fops_waiting_on_fd); + INIT_LIST_HEAD (&fops_waiting_on_inode); + + ob_inode = ob_inode_get (this, fd->inode); LOCK (&fd->lock); { ob_fd = __ob_fd_ctx_get (this, fd); + ob_fd->opened = _gf_true; - list_splice_init (&ob_fd->list, &list); + ob_inode_fops_waiting = ob_fd->ob_inode_fops_waiting; + + list_splice_init (&ob_fd->list, &fops_waiting_on_fd); if (op_ret < 0) { /* mark fd BAD for ever */ @@ -162,10 +260,31 @@ ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } UNLOCK (&fd->lock); + if (ob_inode_fops_waiting) { + LOCK (&fd->inode->lock); + { + count = --ob_inode->count; + if (op_ret < 0) { + /* TODO: when to reset the error? */ + ob_inode->op_ret = -1; + ob_inode->op_errno = op_errno; + } + + if (count == 0) { + ob_inode->open_in_progress = _gf_false; + ob_inode_op_ret = ob_inode->op_ret; + ob_inode_op_errno = ob_inode->op_errno; + list_splice_init (&ob_inode->resume_fops, + &fops_waiting_on_inode); + } + } + UNLOCK (&fd->inode->lock); + } + if (ob_fd) ob_fd_free (ob_fd); - list_for_each_entry_safe (stub, tmp, &list, list) { + list_for_each_entry_safe (stub, tmp, &fops_waiting_on_fd, list) { list_del_init (&stub->list); if (op_ret < 0) @@ -174,6 +293,15 @@ ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_resume (stub); } + list_for_each_entry_safe (stub, tmp, &fops_waiting_on_inode, list) { + list_del_init (&stub->list); + + if (ob_inode_op_ret < 0) + call_unwind_error (stub, -1, ob_inode_op_errno); + else + call_resume (stub); + } + fd_unref (fd); STACK_DESTROY (frame->root); @@ -183,22 +311,30 @@ ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -ob_fd_wake (xlator_t *this, fd_t *fd) +ob_fd_wake (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) { call_frame_t *frame = NULL; - ob_fd_t *ob_fd = NULL; - LOCK (&fd->lock); - { - ob_fd = __ob_fd_ctx_get (this, fd); - if (!ob_fd) - goto unlock; - - frame = ob_fd->open_frame; - ob_fd->open_frame = NULL; - } -unlock: - UNLOCK (&fd->lock); + if (ob_fd == NULL) { + LOCK (&fd->lock); + { + ob_fd = __ob_fd_ctx_get (this, fd); + if (!ob_fd) + goto unlock; + + frame = ob_fd->open_frame; + ob_fd->open_frame = NULL; + } + unlock: + UNLOCK (&fd->lock); + } else { + LOCK (&fd->lock); + { + frame = ob_fd->open_frame; + ob_fd->open_frame = NULL; + } + UNLOCK (&fd->lock); + } if (frame) { frame->local = fd_ref (fd); @@ -211,6 +347,108 @@ unlock: return 0; } +void +ob_inode_wake (xlator_t *this, struct list_head *ob_fds) +{ + ob_fd_t *ob_fd = NULL, *tmp = NULL; + fd_t *fd = NULL; + + list_for_each_entry_safe (ob_fd, tmp, ob_fds, ob_fds_on_inode) { + ob_fd_wake (this, ob_fd->fd, ob_fd); + fd = ob_fd->fd; + ob_fd_free (ob_fd); + fd_unref (fd); + } +} + +/* called holding inode->lock and fd->lock */ +void +ob_fd_copy (ob_fd_t *src, ob_fd_t *dst) +{ + if (!src || !dst) + goto out; + + dst->fd = __fd_ref (src->fd); + dst->loc.inode = inode_ref (src->loc.inode); + gf_uuid_copy (dst->loc.gfid, src->loc.gfid); + dst->flags = src->flags; + dst->xdata = dict_ref (src->xdata); + dst->ob_inode = src->ob_inode; +out: + return; +} + +int +open_all_pending_fds_and_resume (xlator_t *this, inode_t *inode, + call_stub_t *stub) +{ + ob_inode_t *ob_inode = NULL; + ob_fd_t *ob_fd = NULL, *tmp = NULL; + gf_boolean_t was_open_in_progress = _gf_false; + gf_boolean_t wait_for_open = _gf_false; + struct list_head ob_fds = {0, }; + + ob_inode = ob_inode_get (this, inode); + if (ob_inode == NULL) + goto out; + + INIT_LIST_HEAD (&ob_fds); + + LOCK (&inode->lock); + { + was_open_in_progress = ob_inode->open_in_progress; + ob_inode->unlinked = 1; + + if (was_open_in_progress) { + list_add_tail (&stub->list, &ob_inode->resume_fops); + goto inode_unlock; + } + + list_for_each_entry (ob_fd, &ob_inode->ob_fds, + ob_fds_on_inode) { + LOCK (&ob_fd->fd->lock); + { + if (ob_fd->opened) + goto fd_unlock; + + ob_inode->count++; + ob_fd->ob_inode_fops_waiting = _gf_true; + + if (ob_fd->open_frame == NULL) { + /* open in progress no need of wake */ + } else { + tmp = ob_fd_new (); + tmp->open_frame = ob_fd->open_frame; + ob_fd->open_frame = NULL; + + ob_fd_copy (ob_fd, tmp); + list_add_tail (&tmp->ob_fds_on_inode, + &ob_fds); + } + } + fd_unlock: + UNLOCK (&ob_fd->fd->lock); + } + + if (ob_inode->count) { + wait_for_open = ob_inode->open_in_progress = _gf_true; + list_add_tail (&stub->list, &ob_inode->resume_fops); + } + } +inode_unlock: + UNLOCK (&inode->lock); + +out: + if (!was_open_in_progress) { + if (!wait_for_open) { + call_resume (stub); + } else { + ob_inode_wake (this, &ob_fds); + } + } + + return 0; +} int open_and_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) @@ -241,7 +479,7 @@ nofd: if (op_errno) call_unwind_error (stub, -1, op_errno); else if (ob_fd) - ob_fd_wake (this, fd); + ob_fd_wake (this, fd, NULL); else call_resume (stub); @@ -253,10 +491,12 @@ int ob_open_behind (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, dict_t *xdata) { - ob_fd_t *ob_fd = NULL; - int ret = -1; - ob_conf_t *conf = NULL; - + ob_fd_t *ob_fd = NULL; + int ret = -1; + ob_conf_t *conf = NULL; + ob_inode_t *ob_inode = NULL; + gf_boolean_t open_in_progress = _gf_false; + int unlinked = 0; conf = this->private; @@ -267,10 +507,17 @@ ob_open_behind (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, return 0; } + ob_inode = ob_inode_get (this, fd->inode); + ob_fd = ob_fd_new (); if (!ob_fd) goto enomem; + ob_fd->ob_inode = ob_inode; + + /* don't do fd_ref, it'll cause leaks */ + ob_fd->fd = fd; + ob_fd->open_frame = copy_frame (frame); if (!ob_fd->open_frame) goto enomem; @@ -282,27 +529,49 @@ ob_open_behind (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, if (xdata) ob_fd->xdata = dict_ref (xdata); - ret = ob_fd_ctx_set (this, fd, ob_fd); - if (ret) - goto enomem; + LOCK (&fd->inode->lock); + { + open_in_progress = ob_inode->open_in_progress; + unlinked = ob_inode->unlinked; + if (!open_in_progress && !unlinked) { + ret = ob_fd_ctx_set (this, fd, ob_fd); + if (ret) { + UNLOCK (&fd->inode->lock); + goto enomem; + } + + list_add (&ob_fd->ob_fds_on_inode, &ob_inode->ob_fds); + } + } + UNLOCK (&fd->inode->lock); - fd_ref (fd); - STACK_UNWIND_STRICT (open, frame, 0, 0, fd, xdata); + if (!open_in_progress && !unlinked) { + fd_ref (fd); - if (!conf->lazy_open) - ob_fd_wake (this, fd); + STACK_UNWIND_STRICT (open, frame, 0, 0, fd, xdata); - fd_unref (fd); + if (!conf->lazy_open) + ob_fd_wake (this, fd, NULL); + + fd_unref (fd); + } else { + ob_fd_free (ob_fd); + STACK_WIND (frame, default_open_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, + loc, flags, fd, xdata); + } return 0; enomem: if (ob_fd) { if (ob_fd->open_frame) STACK_DESTROY (ob_fd->open_frame->root); + loc_wipe (&ob_fd->loc); if (ob_fd->xdata) dict_unref (ob_fd->xdata); + GF_FREE (ob_fd); } @@ -314,10 +583,10 @@ int ob_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, dict_t *xdata) { - fd_t *old_fd = NULL; - int ret = -1; - int op_errno = 0; - call_stub_t *stub = NULL; + fd_t *old_fd = NULL; + int ret = -1; + int op_errno = 0; + call_stub_t *stub = NULL; old_fd = fd_lookup (fd->inode, 0); if (old_fd) { @@ -365,7 +634,7 @@ ob_get_wind_fd (xlator_t *this, fd_t *fd, uint32_t *flag) ob_fd = ob_fd_ctx_get (this, fd); - if (ob_fd && conf->use_anonymous_fd) { + if (ob_fd && ob_fd->open_frame && conf->use_anonymous_fd) { wind_fd = fd_anonymous (fd->inode); if ((ob_fd->flags & O_DIRECT) && (flag)) *flag = *flag | O_DIRECT; @@ -760,12 +1029,10 @@ err: return 0; } - int ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, dict_t *xdata) { - fd_t *fd = NULL; call_stub_t *stub = NULL; stub = fop_unlink_stub (frame, default_unlink_resume, loc, @@ -773,11 +1040,7 @@ ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, if (!stub) goto err; - fd = fd_lookup (loc->inode, 0); - - open_and_resume (this, fd, stub); - if (fd) - fd_unref (fd); + open_all_pending_fds_and_resume (this, loc->inode, stub); return 0; err: @@ -786,24 +1049,17 @@ err: return 0; } - int ob_rename (call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst, dict_t *xdata) { - fd_t *fd = NULL; call_stub_t *stub = NULL; stub = fop_rename_stub (frame, default_rename_resume, src, dst, xdata); if (!stub) goto err; - if (dst->inode) - fd = fd_lookup (dst->inode, 0); - - open_and_resume (this, fd, stub); - if (fd) - fd_unref (fd); + open_all_pending_fds_and_resume (this, dst->inode, stub); return 0; err: @@ -816,7 +1072,6 @@ int32_t ob_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata) { - fd_t *fd = NULL; call_stub_t *stub = NULL; stub = fop_setattr_stub (frame, default_setattr_resume, loc, stbuf, @@ -824,11 +1079,7 @@ ob_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, if (!stub) goto err; - fd = fd_lookup (loc->inode, 0); - - open_and_resume (this, fd, stub); - if (fd) - fd_unref (fd); + open_all_pending_fds_and_resume (this, loc->inode, stub); return 0; err: @@ -841,7 +1092,6 @@ int32_t ob_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { - fd_t *fd = NULL; call_stub_t *stub = NULL; gf_boolean_t access_xattr = _gf_false; @@ -858,11 +1108,7 @@ ob_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, if (!stub) goto err; - fd = fd_lookup (loc->inode, 0); - - open_and_resume (this, fd, stub); - if (fd) - fd_unref (fd); + open_all_pending_fds_and_resume (this, loc->inode, stub); return 0; err: @@ -873,7 +1119,7 @@ err: int ob_release (xlator_t *this, fd_t *fd) { - ob_fd_t *ob_fd = NULL; + ob_fd_t *ob_fd = NULL; ob_fd = ob_fd_ctx_get (this, fd); @@ -882,6 +1128,21 @@ ob_release (xlator_t *this, fd_t *fd) return 0; } +int +ob_forget (xlator_t *this, inode_t *inode) +{ + ob_inode_t *ob_inode = NULL; + uint64_t value = 0; + + inode_ctx_del (inode, this, &value); + + if (value) { + ob_inode = (ob_inode_t *)((void *) value); + ob_inode_free (ob_inode); + } + + return 0; +} int ob_priv_dump (xlator_t *this) @@ -1068,6 +1329,7 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .release = ob_release, + .forget = ob_forget, }; struct xlator_dumpops dumpops = { @@ -1079,7 +1341,7 @@ struct xlator_dumpops dumpops = { struct volume_options options[] = { { .key = {"use-anonymous-fd"}, .type = GF_OPTION_TYPE_BOOL, - .default_value = "yes", + .default_value = "no", .description = "For read operations, use anonymous FD when " "original FD is open-behind and not yet opened in the backend.", }, |