diff options
Diffstat (limited to 'xlators/features/bit-rot/src/stub/bit-rot-stub.c')
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.c | 1070 |
1 files changed, 804 insertions, 266 deletions
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c index f9c3886948a..93db072f671 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -198,14 +198,15 @@ br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode, if (!ctx) goto error_return; + INIT_LIST_HEAD (&ctx->fd_list); (markdirty) ? __br_stub_mark_inode_dirty (ctx) : __br_stub_mark_inode_synced (ctx); __br_stub_set_ongoing_version (ctx, version); - __br_stub_reset_release_counters (ctx); if (fd) { - br_stub_require_release_call (this, fd); - __br_stub_track_openfd (fd, ctx); + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) + goto free_ctx; } ret = br_stub_set_inode_ctx (this, inode, ctx); if (ret) @@ -238,7 +239,6 @@ br_stub_mod_inode_versions (xlator_t *this, __br_stub_mark_inode_synced (ctx); } - __br_stub_track_openfd (fd, ctx); ret = 0; } unblock: @@ -250,19 +250,16 @@ br_stub_mod_inode_versions (xlator_t *this, static inline void br_stub_fill_local (br_stub_local_t *local, call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid, - int versioningtype, unsigned long memversion, int dirty) + int versioningtype, unsigned long memversion) { local->fopstub = stub; local->versioningtype = versioningtype; local->u.context.version = memversion; - if (fd) + if (fd && !local->u.context.fd) local->u.context.fd = fd_ref (fd); if (inode) local->u.context.inode = inode_ref (inode); gf_uuid_copy (local->u.context.gfid, gfid); - - /* mark inode dirty/fresh according to durability */ - local->u.context.markdirty = (dirty) ? _gf_true : _gf_false; } static inline void @@ -279,57 +276,13 @@ br_stub_cleanup_local (br_stub_local_t *local) inode_unref (local->u.context.inode); local->u.context.inode = NULL; } - local->u.context.markdirty = _gf_true; memset (local->u.context.gfid, '\0', sizeof (uuid_t)); } /** - * callback for inode/fd full versioning + * callback for inode/fd versioning */ int -br_stub_inode_fullversioning_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) -{ - fd_t *fd = NULL; - inode_t *inode = NULL; - unsigned long version = 0; - gf_boolean_t dirty = _gf_true; - br_stub_local_t *local = NULL; - - local = (br_stub_local_t *)frame->local; - - /* be graceful to EEXIST */ - if ((op_ret < 0) && (op_errno == EEXIST)) { - op_ret = 0; - goto done; - } - - if (op_ret < 0) - goto done; - - fd = local->u.context.fd; - inode = local->u.context.inode; - version = local->u.context.version; - dirty = local->u.context.markdirty; - - op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty); - if (op_ret < 0) - op_errno = EINVAL; - - done: - frame->local = NULL; - if (op_ret < 0) - call_unwind_error (local->fopstub, op_ret, op_errno); - else - call_resume (local->fopstub); - br_stub_cleanup_local (local); - br_stub_dealloc_local (local); - - return 0; -} - -int br_stub_fd_incversioning_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) @@ -351,14 +304,14 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame, op_errno = EINVAL; done: - frame->local = NULL; - if (op_ret < 0) + if (op_ret < 0) { + frame->local = NULL; call_unwind_error (local->fopstub, -1, op_errno); - else + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + } else { call_resume (local->fopstub); - br_stub_cleanup_local (local); - br_stub_dealloc_local (local); - + } return 0; } @@ -366,28 +319,27 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame, * Initial object versioning * * Version persists two (2) extended attributes as explained below: - * 1. Current (ongoing) version: This is incremented on an open() - * or creat() and is the running version for an object. + * 1. Current (ongoing) version: This is incremented on an writev () + * or truncate () and is the running version for an object. * 2. Signing version: This is the version against which an object * was signed (checksummed). * * During initial versioning, both ongoing and signing versions are - * set of one and zero respectively. An open() call increments the + * set of one and zero respectively. A write() call increments the * ongoing version as an indication of modification to the object. * Additionally this needs to be persisted on disk and needs to be * durable: fsync().. :-/ - * As an optimization only the first open() synchronizes the ongoing - * version to disk, subsequent open()s before the *last* release() + * As an optimization only the first write() synchronizes the ongoing + * version to disk, subsequent write()s before the *last* release() * are no-op's. * * create(), just like lookup() initializes the object versions to - * the default, but persists the version to disk. As an optimization - * this is not a durable operation: in case of a crash, hard reboot - * etc.. absence of versioning xattrs is ignored in scrubber along - * with the one time crawler explicitly triggering signing for such - * objects. + * the default. As an optimization this is not a durable operation: + * in case of a crash, hard reboot etc.. absence of versioning xattrs + * is ignored in scrubber along with the one time crawler explicitly + * triggering signing for such objects. * - * c.f. br_stub_open_cbk() / br_stub_create_cbk() + * c.f. br_stub_writev() / br_stub_truncate() */ /** @@ -400,7 +352,7 @@ int br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, call_stub_t *stub, dict_t *dict, fd_t *fd, br_stub_version_cbk *callback, unsigned long memversion, - int versioningtype, int durable, int dirty) + int versioningtype, int durable) { int32_t ret = -1; int flags = 0; @@ -421,18 +373,11 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, goto dealloc_xdata; } - local = br_stub_alloc_local (this); - if (!local) { - ret = -1; - goto dealloc_xdata; - } - - if (versioningtype == BR_STUB_FULL_VERSIONING) - flags |= XATTR_CREATE; + local = frame->local; br_stub_fill_local (local, stub, fd, fd->inode, fd->inode->gfid, - versioningtype, memversion, dirty); + versioningtype, memversion); frame->local = local; STACK_WIND (frame, callback, @@ -448,82 +393,21 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, } static inline int -br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame, - call_stub_t *stub, fd_t *fd) -{ - int32_t ret = -1; - dict_t *dict = NULL; - br_version_t *obuf = NULL; - int op_errno = 0; - - op_errno = ENOMEM; - dict = dict_new (); - if (!dict) - goto done; - ret = br_stub_alloc_versions (&obuf, NULL, 0); - if (ret) - goto dealloc_dict; - - op_errno = EINVAL; - ret = br_stub_prepare_version_request (this, dict, obuf, - BITROT_DEFAULT_CURRENT_VERSION); - if (ret) - goto dealloc_versions; - - /** - * Version extended attributes need not be durable at this point of - * time. If the objects (inode) data gets persisted on disk but the - * version extended attributes are lost due to a crash/power failure, - * a subsequent lookup marks the objects signature as stale. This way, - * dentry operation times do not shoot up. - */ - ret = br_stub_fd_versioning (this, frame, stub, dict, fd, - br_stub_inode_fullversioning_cbk, - BITROT_DEFAULT_CURRENT_VERSION, - BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0); - - dealloc_versions: - br_stub_dealloc_versions (obuf); - dealloc_dict: - dict_unref (dict); - done: - if (ret) - call_unwind_error (stub, -1, op_errno); - return ret; -} - -static inline int br_stub_perform_incversioning (xlator_t *this, call_frame_t *frame, call_stub_t *stub, fd_t *fd, br_stub_inode_ctx_t *ctx) { - int32_t ret = -1; - dict_t *dict = NULL; - inode_t *inode = NULL; - br_version_t *obuf = NULL; - unsigned long writeback_version = 0; - int op_errno = 0; - - inode = fd->inode; + int32_t ret = -1; + dict_t *dict = NULL; + br_version_t *obuf = NULL; + unsigned long writeback_version = 0; + int op_errno = 0; + br_stub_local_t *local = NULL; op_errno = EINVAL; - ret = br_stub_require_release_call (this, fd); - if (ret) - goto done; - - LOCK (&inode->lock); - { - if (__br_stub_is_inode_dirty (ctx)) - writeback_version = __br_stub_writeback_version (ctx); - else - __br_stub_track_openfd (fd, ctx); - } - UNLOCK (&inode->lock); + local = frame->local; - if (!writeback_version) { - ret = 0; - goto done; - } + writeback_version = __br_stub_writeback_version (ctx); /* inode requires writeback to disk */ op_errno = ENOMEM; @@ -541,17 +425,23 @@ br_stub_perform_incversioning (xlator_t *this, ret = br_stub_fd_versioning (this, frame, stub, dict, fd, br_stub_fd_incversioning_cbk, writeback_version, - BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0); + BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE); dealloc_versions: br_stub_dealloc_versions (obuf); dealloc_dict: dict_unref (dict); done: - if (!ret && !writeback_version) - call_resume (stub); - if (ret) + if (ret) { + if (local) + frame->local = NULL; call_unwind_error (stub, -1, op_errno); + if (local) { + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + } + } + return ret; } @@ -560,6 +450,44 @@ br_stub_perform_incversioning (xlator_t *this, /* fsetxattr() */ static inline int +br_stub_compare_sign_version (xlator_t *this, inode_t *inode, + br_signature_t *sbuf, dict_t *dict) +{ + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t tmp_ctx = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, sbuf, out); + GF_VALIDATE_OR_GOTO (this->name, dict, out); + + ret = br_stub_get_inode_ctx (this, inode, &tmp_ctx); + if (ret) { + dict_del (dict, BITROT_SIGNING_VERSION_KEY); + goto out; + } + + ret = -1; + ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx; + + LOCK (&inode->lock); + { + if (ctx->currentversion == sbuf->signedversion) + ret = 0; + else + gf_log (this->name, GF_LOG_WARNING, "current version " + "%lu and version of the signature %lu are not " + "same", ctx->currentversion, + sbuf->signedversion); + } + UNLOCK (&inode->lock); + +out: + return ret; +} + +static inline int br_stub_prepare_signature (xlator_t *this, dict_t *dict, inode_t *inode, br_isignature_t *sign) { @@ -577,6 +505,11 @@ br_stub_prepare_signature (xlator_t *this, dict_t *dict, ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen); if (ret) goto dealloc_versions; + + ret = br_stub_compare_sign_version (this, inode, sbuf, dict); + if (ret) + goto dealloc_versions; + return 0; dealloc_versions: @@ -620,6 +553,8 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this, if (ret) goto unwind; + gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu", + sign->signedversion); wind: STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd, @@ -865,77 +800,598 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this, return 0; } -/** }}} */ +/** + * The first write response on the first fd in the list of fds will set + * the flag to indicate that the inode is modified. The subsequent write + * respnses coming on either the first fd or some other fd will not change + * the fd. The inode-modified flag is unset only upon release of all the + * fds. + */ +int32_t +br_stub_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } + if (op_ret < 0) + goto unwind; -/** {{{ */ + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) + goto unwind; -/* open() */ + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; -int -br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd, dict_t *xdata) + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +/** + * Ongoing version is increased only for the first modify operation. + * First modify version means the first write or truncate call coming on the + * first fd in the list of inodes. + * For anonymous fds open would not have come, so check if its the first write + * by doing both inode dirty check and ensuring list of fds is empty + */ +static inline gf_boolean_t +br_stub_inc_version (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) { - int32_t ret = 0; - uint64_t ctx_addr = 0; - br_stub_inode_ctx_t *ctx = NULL; - call_stub_t *stub = NULL; + gf_boolean_t inc_version = _gf_false; + + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + LOCK (&fd->inode->lock); + { + if (__br_stub_is_inode_dirty (ctx)) + inc_version = _gf_true; + } + UNLOCK (&fd->inode->lock); + +out: + return inc_version; +} + +/** + * Since NFS does not do open, writes from NFS are sent over an anonymous + * fd. It means each write fop might come on a different anonymous fd and + * will lead to very large number of notifications being sent. It might + * affect the perfromance as, there will too many sign requests. + * To avoid that whenever the last fd released from an inode (logical release) + * is an anonymous fd the release notification is sent with a flag being set + * __br_stub_anon_release (ctx); + * BitD checks for the flag and if set, it will send a dummy write request + * (again on an anonymous fd) instead of triggering sign. + * Bit-rot-stub should identify such dummy writes and should send success to + * them instead of winding them downwards. + */ +gf_boolean_t +br_stub_dummy_write (call_frame_t *frame) +{ + return (frame->root->pid == GF_CLIENT_PID_BITD) + ? _gf_true : _gf_false; +} + +int32_t +br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = br_stub_fd_ctx_get (this, fd); + if (!br_stub_fd) { + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "add fd to the inode (gfid: %s)", + uuid_utoa (fd->inode->gfid)); + goto out; + } + } + + ret = 0; + +out: + return ret; +} + +int32_t +br_stub_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + if (frame->root->pid == GF_CLIENT_PID_BITD) + br_stub_writev_cbk (frame, NULL, this, vector->iov_len, 0, + NULL, NULL, NULL); + else + STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdata); + return 0; +} + +/** + TODO: If possible add pictorial represention of below comment. + + Before sending writev on the ANONYMOUS FD, increase the ongoing + version first. This brings anonymous fd write closer to the regular + fd write by having the ongoing version increased before doing the + write (In regular fd, after open the ongoing version is incremented). + Do following steps to handle writes on anonymous fds: + 1) Increase the on-disk ongoing version + 2) Once versioning is successfully done send write operation. If versioning + fails, then fail the write fop. + 3) In writev_cbk do below things: + a) Increase in-memory version + b) set the fd context (so that br_stub_release is invoked) + c) add the fd to the list of fds maintained in the inode context of + bitrot-stub. + d) Mark inode as non dirty + e) Mard inode as modified (in the inode context) +**/ +int32_t +br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd_ref (fd); + frame->local = local; + + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the inode %s", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + if (fd_is_anonymous (fd)) { + ret = br_stub_anon_fd_ctx (this, fd, ctx); + if (ret) + goto unwind; + } + + /* TODO: Better to do a dummy fsetxattr instead of write. Keep write + simple */ + if (br_stub_dummy_write (frame)) { + LOCK (&fd->inode->lock); + { + (void) __br_stub_inode_sign_state + (ctx, GF_FOP_WRITE, fd); + } + UNLOCK (&fd->inode->lock); + + if (xdata && dict_get (xdata, "br-fd-reopen")) { + op_ret = vector->iov_len; + op_errno = 0; + goto unwind; + } + } + + /** + * Check whether this is the first write on this inode since the last + * sign notification has been sent. If so, do versioning. Otherwise + * go ahead with the fop. + */ + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; + + /* Create the stub for the write fop */ + stub = fop_writev_stub (frame, br_stub_writev_resume, fd, vector, count, + offset, flags, iobref, xdata); + + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "write fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + +wind: + STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } if (op_ret < 0) goto unwind; - if (cookie != (void *) BR_STUB_REQUEST_COOKIE) + + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) goto unwind; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_ftruncate_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} + +int32_t +br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd_ref (fd); + frame->local = local; + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); - if (ret < 0) + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the inode %s", + uuid_utoa (fd->inode->gfid)); goto unwind; + } + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + if (fd_is_anonymous (fd)) { + ret = br_stub_anon_fd_ctx (this, fd, ctx); + if (ret) + goto unwind; + } - stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); + /** + * c.f. br_stub_writev() + */ + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; + + /* Create the stub for the ftruncate fop */ + stub = fop_ftruncate_stub (frame, br_stub_ftruncate_resume, fd, offset, + xdata); if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "ftruncate fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + +wind: + STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } + + if (op_ret < 0) + goto unwind; + + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) + goto unwind; + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} + +/** + * Bit-rot-stub depends heavily on the fd based operations to for doing + * versioning and sending notification. It starts tracking the operation + * upon getting first fd based modify operation by doing versioning and + * sends notification when last fd using which the inode was modified is + * released. + * But for truncate there is no fd and hence it becomes difficult to do + * the versioning and send notification. It is handled by doing versioning + * on an anonymous fd. The fd will be valid till the completion of the + * truncate call. It guarantees that release on this anonymous fd will happen + * after the truncate call and notification is sent after the truncate call. + */ +int32_t +br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + fd_t *fd = NULL; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind); + + fd = fd_anonymous (loc->inode); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "failed to create anonymous " + "fd for the inode %s", uuid_utoa (loc->inode->gfid)); + goto unwind; + } + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (loc->inode->gfid)); op_ret = -1; - op_errno = EINVAL; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd; + frame->local = local; + + ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the inode %s", + uuid_utoa (fd->inode->gfid)); goto unwind; } + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + ret = br_stub_anon_fd_ctx (this, local->u.context.fd, ctx); + if (ret) + goto unwind; + /** - * Ongoing version needs to be incremented. If the inode is not dirty, - * things are simple: increment the ongoing version safely and be done. - * If inode is dirty, a writeback to disk is required. This is tricky in - * case of multiple open()'s as ongoing version needs to be incremented - * on a successful writeback. It's probably safe to remember the ongoing - * version before writeback and *assigning* it in the callback, but that - * may lead to a trustable checksum to be treated as stale by scrubber - * (the case where the in-memory ongoing version is lesser than the - * on-disk version). Therefore, *all* open() calls (which might have - * come in parallel) try to synchronize the next ongoing version to - * disk. In the callback path, the winner marks the inode as synced - * therby loosing open() calls become no-op's. + * c.f. br_stub_writev() */ - ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; - return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; - unwind: - STACK_UNWIND_STRICT (open, frame, - op_ret, op_errno, fd, xdata); + /* Create the stub for the truncate fop */ + stub = fop_truncate_stub (frame, br_stub_truncate_resume, loc, offset, + xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "truncate fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, + local->u.context.fd, ctx); + +wind: + STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); return 0; } +/** }}} */ + + +/** {{{ */ + +/* open() */ + +/** + * It's probably worth mentioning a bit about why some of the housekeeping + * work is done in open() call path, rather than the callback path. + * Two (or more) open()'s in parallel can race and lead to a situation + * where a release() gets triggered (possibly after a series of write() + * calls) when *other* open()'s have still not reached callback path + * thereby having an active fd on an inode that is in process of getting + * signed with the current version. + * + * Maintaining fd list in the call path ensures that a release() would + * not be triggered if an open() call races ahead (followed by a close()) + * threby finding non-empty fd list. + */ + int br_stub_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { - void *cookie = NULL; + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind); - if (!flags) - goto wind; if (frame->root->pid == GF_CLIENT_PID_SCRUB) goto wind; - cookie = (void *) BR_STUB_REQUEST_COOKIE; - wind: - STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, - loc, flags, fd, xdata); + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the file %s (gfid: %s)", loc->path, + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + if (flags == O_RDONLY) + goto wind; + + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed add fd to the list " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + goto unwind; + } + +wind: + STACK_WIND (frame, default_open_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT (open, frame, -1, EINVAL, NULL, NULL); return 0; } @@ -946,39 +1402,60 @@ br_stub_open (call_frame_t *frame, xlator_t *this, /* creat() */ +/** + * This routine registers a release callback for the given fd and adds the + * fd to the inode context fd tracking list. + */ +int32_t +br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + ret = br_stub_require_release_call (this, fd, &br_stub_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to set the fd " + "context for the file (gfid: %s)", + uuid_utoa (fd->inode->gfid)); + goto out; + } + + LOCK (&fd->inode->lock); + { + list_add_tail (&ctx->fd_list, &br_stub_fd->list); + } + UNLOCK (&fd->inode->lock); + + ret = 0; + +out: + return ret; +} + int br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int32_t ret = 0; - uint64_t ctx_addr = 0; - call_stub_t *stub = NULL; - br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; if (op_ret < 0) goto unwind; - stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode, - stbuf, preparent, postparent, xdata); - if (!stub) { - op_ret = -1; - op_errno = EINVAL; - goto unwind; - } - ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; - - /* see comment in br_stub_open_cbk().. */ - return (ctx) - ? br_stub_perform_incversioning (this, frame, stub, fd, ctx) - : br_stub_perform_fullversioning (this, frame, stub, fd); + if (ret < 0) { + ret = br_stub_init_inode_versions (this, fd, inode, version, + _gf_true); + } else { + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + ret = br_stub_add_fd_to_inode (this, fd, ctx); + } - unwind: +unwind: STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, postparent, xdata); return 0; @@ -989,10 +1466,20 @@ br_stub_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind); + STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, loc, flags, mode, umask, fd, xdata); return 0; +unwind: + STACK_UNWIND_STRICT (create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; } /** }}} */ @@ -1011,21 +1498,11 @@ br_stub_lookup_version (xlator_t *this, * out the correct version to use in the inode context (start with * the default version if unavailable). As of now versions are not * persisted on-disk. The inode is marked dirty, so that the first - * operation (such as open(), etc..) would trigger synchronization - * to disk. + * operation (such as write(), etc..) triggers synchronization to + * disk. */ status = br_version_xattr_state (xattr, &obuf, &sbuf); - /** - * stub does not know how to handle presence of signature but not - * the object version, therefore, in such cases, bail out.. - */ - if (status == BR_VXATTR_STATUS_INVALID) { - gf_log (this->name, GF_LOG_ERROR, "Invalid versioning xattrs. " - "Bailing out [GFID: %s]", uuid_utoa (gfid)); - return -1; - } - version = ((status == BR_VXATTR_STATUS_FULL) || (status == BR_VXATTR_STATUS_UNSIGNED)) ? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION; @@ -1259,8 +1736,8 @@ br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this, } static inline void -br_stub_send_ipc_fop (xlator_t *this, - fd_t *fd, unsigned long releaseversion, int32_t flags) +br_stub_send_ipc_fop (xlator_t *this, fd_t *fd, unsigned long releaseversion, + int sign_info) { int32_t op = 0; int32_t ret = 0; @@ -1269,8 +1746,8 @@ br_stub_send_ipc_fop (xlator_t *this, changelog_event_t ev = {0,}; ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE; - ev.u.releasebr.flags = flags; ev.u.releasebr.version = releaseversion; + ev.u.releasebr.sign_info = sign_info; gf_uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid); xdata = dict_new (); @@ -1305,14 +1782,67 @@ br_stub_send_ipc_fop (xlator_t *this, return; } +/** + * This is how the state machine of sign info works: + * 3 states: + * 1) BR_SIGN_NORMAL => The default State of the inode + * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen + * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign + * 2 events: + * 1) GF_FOP_RELEASE + * 2) GF_FOP_WRITE (actually a dummy write fro BitD) + * + * This is how states are changed based on events: + * EVENT: GF_FOP_RELEASE: + * if (state == BR_SIGN_NORMAL) ; then + * set state = BR_SIGN_REOPEN_WAIT; + * if (state == BR_SIGN_QUICK); then + * set state = BR_SIGN_NORMAL; + * EVENT: GF_FOP_WRITE: + * if (state == BR_SIGN_REOPEN_WAIT); then + * set state = BR_SIGN_QUICK; + */ +br_sign_state_t +__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx, + glusterfs_fop_t fop, fd_t *fd) +{ + br_sign_state_t sign_info = BR_SIGN_INVALID; + + switch (fop) { + + case GF_FOP_WRITE: + sign_info = ctx->info_sign = BR_SIGN_QUICK; + break; + + case GF_FOP_RELEASE: + GF_ASSERT (ctx->info_sign != BR_SIGN_REOPEN_WAIT); + + if (ctx->info_sign == BR_SIGN_NORMAL) { + sign_info = ctx->info_sign = BR_SIGN_REOPEN_WAIT; + } else { + sign_info = ctx->info_sign; + ctx->info_sign = BR_SIGN_NORMAL; + } + + break; + default: + break; + } + + return sign_info; +} + int32_t br_stub_release (xlator_t *this, fd_t *fd) { - int32_t ret = 0; - int32_t flags = 0; - inode_t *inode = NULL; - unsigned long releaseversion = 0; - br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = 0; + int32_t flags = 0; + inode_t *inode = NULL; + unsigned long releaseversion = 0; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t tmp = 0; + br_stub_fd_t *br_stub_fd = NULL; + int32_t signinfo = 0; inode = fd->inode; @@ -1321,12 +1851,23 @@ br_stub_release (xlator_t *this, fd_t *fd) ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL); if (ctx == NULL) goto unblock; - __br_stub_track_release (ctx); + br_stub_fd = br_stub_fd_ctx_get (this, fd); + if (br_stub_fd) { + list_del_init (&br_stub_fd->list); + } + ret = __br_stub_can_trigger_release - (inode, ctx, &releaseversion, &flags); - if (ret) { - GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0); + (inode, ctx, &releaseversion); + if (!ret) + goto unblock; + + signinfo = __br_stub_inode_sign_state (ctx, GF_FOP_RELEASE, fd); + signinfo = htonl (signinfo); + + /* inode back to initital state: mark dirty */ + if (ctx->info_sign == BR_SIGN_NORMAL) { __br_stub_mark_inode_dirty (ctx); + __br_stub_unset_inode_modified (ctx); } } unblock: @@ -1334,10 +1875,17 @@ br_stub_release (xlator_t *this, fd_t *fd) if (ret) { gf_log (this->name, GF_LOG_DEBUG, - "releaseversion: %lu|flags: %d", releaseversion, flags); - br_stub_send_ipc_fop (this, fd, releaseversion, flags); + "releaseversion: %lu | flags: %d | signinfo: %d", + (unsigned long) ntohl (releaseversion), + flags, ntohl(signinfo)); + br_stub_send_ipc_fop (this, fd, releaseversion, signinfo); } + ret = fd_ctx_del (fd, this, &tmp); + br_stub_fd = (br_stub_fd_t *)(long)tmp; + + GF_FREE (br_stub_fd); + return 0; } @@ -1351,11 +1899,12 @@ void br_stub_ictxmerge (xlator_t *this, fd_t *fd, inode_t *inode, inode_t *linked_inode) { - int32_t ret = 0; - uint64_t ctxaddr = 0; - uint64_t lctxaddr = 0; - br_stub_inode_ctx_t *ctx = NULL; - br_stub_inode_ctx_t *lctx = NULL; + int32_t ret = 0; + uint64_t ctxaddr = 0; + uint64_t lctxaddr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_inode_ctx_t *lctx = NULL; + br_stub_fd_t *br_stub_fd = NULL; ret = br_stub_get_inode_ctx (this, inode, &ctxaddr); if (ret < 0) @@ -1369,29 +1918,15 @@ br_stub_ictxmerge (xlator_t *this, fd_t *fd, goto unblock; lctx = (br_stub_inode_ctx_t *) lctxaddr; - if (__br_stub_is_inode_dirty (lctx)) { - /** - * RACY code: An inode can end up in this situation - * after a lookup() or after a create() followed by - * a release(). Even if we distinguish b/w the two, - * there needs to be more infrastructure built up - * in stub to handle these races. Note, that it's - * probably OK to ignore the race iff the version - * was initialized on the very first lookup(), i.e., - * [ongoingversion: default]. - * - * FIXME: fixup races [create(1..n)/lookup(1..n)]. - */ - GF_ASSERT (lctx->currentversion - == BITROT_DEFAULT_CURRENT_VERSION); - __br_stub_track_openfd (fd, lctx); - __br_stub_mark_inode_synced (lctx); - } else { - GF_ASSERT (ctx->currentversion <= lctx->currentversion); - __br_stub_track_openfd (fd, lctx); + GF_ASSERT (list_is_singular (&ctx->fd_list)); + br_stub_fd = list_first_entry (&ctx->fd_list, br_stub_fd_t, + list); + if (br_stub_fd) { + GF_ASSERT (br_stub_fd->fd == fd); + list_move_tail (&br_stub_fd->list, &lctx->fd_list); } } - unblock: +unblock: UNLOCK (&linked_inode->lock); done: @@ -1409,6 +1944,9 @@ struct xlator_fops fops = { .getxattr = br_stub_getxattr, .fgetxattr = br_stub_fgetxattr, .fsetxattr = br_stub_fsetxattr, + .writev = br_stub_writev, + .truncate = br_stub_truncate, + .ftruncate = br_stub_ftruncate, }; struct xlator_cbks cbks = { |