diff options
Diffstat (limited to 'xlators/features/bit-rot/src/stub/bit-rot-stub.c')
| -rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.c | 1070 | 
1 files changed, 804 insertions, 266 deletions
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c index f9c3886948a..93db072f671 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -198,14 +198,15 @@ br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode,          if (!ctx)                  goto error_return; +        INIT_LIST_HEAD (&ctx->fd_list);          (markdirty) ? __br_stub_mark_inode_dirty (ctx)                  : __br_stub_mark_inode_synced (ctx);          __br_stub_set_ongoing_version (ctx, version); -        __br_stub_reset_release_counters (ctx);          if (fd) { -                br_stub_require_release_call (this, fd); -                __br_stub_track_openfd (fd, ctx); +                ret = br_stub_add_fd_to_inode (this, fd, ctx); +                if (ret) +                        goto free_ctx;          }          ret = br_stub_set_inode_ctx (this, inode, ctx);          if (ret) @@ -238,7 +239,6 @@ br_stub_mod_inode_versions (xlator_t *this,                          __br_stub_mark_inode_synced (ctx);                  } -                __br_stub_track_openfd (fd, ctx);                  ret = 0;          }   unblock: @@ -250,19 +250,16 @@ br_stub_mod_inode_versions (xlator_t *this,  static inline void  br_stub_fill_local (br_stub_local_t *local,                      call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid, -                    int versioningtype, unsigned long memversion, int dirty) +                    int versioningtype, unsigned long memversion)  {          local->fopstub = stub;          local->versioningtype = versioningtype;          local->u.context.version = memversion; -        if (fd) +        if (fd && !local->u.context.fd)                  local->u.context.fd = fd_ref (fd);          if (inode)                  local->u.context.inode = inode_ref (inode);          gf_uuid_copy (local->u.context.gfid, gfid); - -        /* mark inode dirty/fresh according to durability */ -        local->u.context.markdirty = (dirty) ? _gf_true : _gf_false;  }  static inline void @@ -279,57 +276,13 @@ br_stub_cleanup_local (br_stub_local_t *local)                  inode_unref (local->u.context.inode);                  local->u.context.inode = NULL;          } -        local->u.context.markdirty = _gf_true;          memset (local->u.context.gfid, '\0', sizeof (uuid_t));  }  /** - * callback for inode/fd full versioning + * callback for inode/fd versioning   */  int -br_stub_inode_fullversioning_cbk (call_frame_t *frame, -                                  void *cookie, xlator_t *this, -                                  int op_ret, int op_errno, dict_t *xdata) -{ -        fd_t            *fd      = NULL; -        inode_t         *inode   = NULL; -        unsigned long    version = 0; -        gf_boolean_t     dirty   = _gf_true; -        br_stub_local_t *local   = NULL; - -        local = (br_stub_local_t *)frame->local; - -        /* be graceful to EEXIST */ -        if ((op_ret < 0) && (op_errno == EEXIST)) { -                op_ret = 0; -                goto done; -        } - -        if (op_ret < 0) -                goto done; - -        fd      = local->u.context.fd; -        inode   = local->u.context.inode; -        version = local->u.context.version; -        dirty   = local->u.context.markdirty; - -        op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty); -        if (op_ret < 0) -                op_errno = EINVAL; - - done: -        frame->local = NULL; -        if (op_ret < 0) -                call_unwind_error (local->fopstub, op_ret, op_errno); -        else -                call_resume (local->fopstub); -        br_stub_cleanup_local (local); -        br_stub_dealloc_local (local); - -        return 0; -} - -int  br_stub_fd_incversioning_cbk (call_frame_t *frame,                                void *cookie, xlator_t *this,                                int op_ret, int op_errno, dict_t *xdata) @@ -351,14 +304,14 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame,                  op_errno = EINVAL;   done: -        frame->local = NULL; -        if (op_ret < 0) +        if (op_ret < 0) { +                frame->local = NULL;                  call_unwind_error (local->fopstub, -1, op_errno); -        else +                br_stub_cleanup_local (local); +                br_stub_dealloc_local (local); +        } else {                  call_resume (local->fopstub); -        br_stub_cleanup_local (local); -        br_stub_dealloc_local (local); - +        }          return 0;  } @@ -366,28 +319,27 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame,   * Initial object versioning   *   * Version persists two (2) extended attributes as explained below: - *   1. Current (ongoing) version: This is incremented on an open() - *      or creat() and is the running version for an object. + *   1. Current (ongoing) version: This is incremented on an writev () + *      or truncate () and is the running version for an object.   *   2. Signing version: This is the version against which an object   *      was signed (checksummed).   *   * During initial versioning, both ongoing and signing versions are - * set of one and zero respectively. An open() call increments the + * set of one and zero respectively. A write() call increments the   * ongoing version as an indication of modification to the object.   * Additionally this needs to be persisted on disk and needs to be   * durable: fsync().. :-/ - * As an optimization only the first open() synchronizes the ongoing - * version to disk, subsequent open()s before the *last* release() + * As an optimization only the first write() synchronizes the ongoing + * version to disk, subsequent write()s before the *last* release()   * are no-op's.   *   * create(), just like lookup() initializes the object versions to - * the default, but persists the version to disk. As an optimization - * this is not a durable operation: in case of a crash, hard reboot - * etc.. absence of versioning xattrs is ignored in scrubber along - * with the one time crawler explicitly triggering signing for such - * objects. + * the default. As an optimization this is not a durable operation: + * in case of a crash, hard reboot etc.. absence of versioning xattrs + * is ignored in scrubber along with the one time crawler explicitly + * triggering signing for such objects.   * - * c.f. br_stub_open_cbk() / br_stub_create_cbk() + * c.f. br_stub_writev() / br_stub_truncate()   */  /** @@ -400,7 +352,7 @@ int  br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,                         call_stub_t *stub, dict_t *dict, fd_t *fd,                         br_stub_version_cbk *callback, unsigned long memversion, -                       int versioningtype, int durable, int dirty) +                       int versioningtype, int durable)  {          int32_t          ret   = -1;          int              flags = 0; @@ -421,18 +373,11 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,                          goto dealloc_xdata;          } -        local = br_stub_alloc_local (this); -        if (!local) { -                ret = -1; -                goto dealloc_xdata; -        } - -        if (versioningtype == BR_STUB_FULL_VERSIONING) -                flags |= XATTR_CREATE; +        local = frame->local;          br_stub_fill_local (local, stub, fd,                              fd->inode, fd->inode->gfid, -                            versioningtype, memversion, dirty); +                            versioningtype, memversion);          frame->local = local;          STACK_WIND (frame, callback, @@ -448,82 +393,21 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,  }  static inline int -br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame, -                                call_stub_t *stub, fd_t *fd) -{ -        int32_t         ret      = -1; -        dict_t         *dict     = NULL; -        br_version_t   *obuf     = NULL; -        int             op_errno = 0; - -        op_errno = ENOMEM; -        dict = dict_new (); -        if (!dict) -                goto done; -        ret = br_stub_alloc_versions (&obuf, NULL, 0); -        if (ret) -                goto dealloc_dict; - -        op_errno = EINVAL; -        ret = br_stub_prepare_version_request (this, dict, obuf, -                                               BITROT_DEFAULT_CURRENT_VERSION); -        if (ret) -                goto dealloc_versions; - -        /** -         * Version extended attributes need not be durable at this point of -         * time. If the objects (inode) data gets persisted on disk but the -         * version extended attributes are lost due to a crash/power failure, -         * a subsequent lookup marks the objects signature as stale. This way, -         * dentry operation times do not shoot up. -         */ -        ret = br_stub_fd_versioning (this, frame, stub, dict, fd, -                                     br_stub_inode_fullversioning_cbk, -                                     BITROT_DEFAULT_CURRENT_VERSION, -                                     BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0); - - dealloc_versions: -        br_stub_dealloc_versions (obuf); - dealloc_dict: -        dict_unref (dict); - done: -        if (ret) -                call_unwind_error (stub, -1, op_errno); -        return ret; -} - -static inline int  br_stub_perform_incversioning (xlator_t *this,                                 call_frame_t *frame, call_stub_t *stub,                                 fd_t *fd, br_stub_inode_ctx_t *ctx)  { -        int32_t        ret               = -1; -        dict_t        *dict              = NULL; -        inode_t       *inode             = NULL; -        br_version_t  *obuf              = NULL; -        unsigned long  writeback_version = 0; -        int            op_errno          = 0; - -        inode = fd->inode; +        int32_t          ret               = -1; +        dict_t          *dict              = NULL; +        br_version_t    *obuf              = NULL; +        unsigned long    writeback_version = 0; +        int              op_errno          = 0; +        br_stub_local_t *local             = NULL;          op_errno = EINVAL; -        ret = br_stub_require_release_call (this, fd); -        if (ret) -                goto done; - -        LOCK (&inode->lock); -        { -                if (__br_stub_is_inode_dirty (ctx)) -                        writeback_version = __br_stub_writeback_version (ctx); -                else -                        __br_stub_track_openfd (fd, ctx); -        } -        UNLOCK (&inode->lock); +        local = frame->local; -        if (!writeback_version) { -                ret = 0; -                goto done; -        } +        writeback_version = __br_stub_writeback_version (ctx);          /* inode requires writeback to disk */          op_errno = ENOMEM; @@ -541,17 +425,23 @@ br_stub_perform_incversioning (xlator_t *this,          ret = br_stub_fd_versioning                  (this, frame, stub, dict,                   fd, br_stub_fd_incversioning_cbk, writeback_version, -                 BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0); +                 BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE);   dealloc_versions:          br_stub_dealloc_versions (obuf);   dealloc_dict:          dict_unref (dict);   done: -        if (!ret && !writeback_version) -                call_resume (stub); -        if (ret) +        if (ret) { +                if (local) +                        frame->local = NULL;                  call_unwind_error (stub, -1, op_errno); +                if (local) { +                        br_stub_cleanup_local (local); +                        br_stub_dealloc_local (local); +                } +        } +          return ret;  } @@ -560,6 +450,44 @@ br_stub_perform_incversioning (xlator_t *this,  /* fsetxattr() */  static inline int +br_stub_compare_sign_version (xlator_t *this, inode_t *inode, +                              br_signature_t *sbuf, dict_t *dict) +{ +        int32_t ret = -1; +        br_stub_inode_ctx_t *ctx = NULL; +        uint64_t tmp_ctx = 0; + +        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); +        GF_VALIDATE_OR_GOTO (this->name, inode, out); +        GF_VALIDATE_OR_GOTO (this->name, sbuf, out); +        GF_VALIDATE_OR_GOTO (this->name, dict, out); + +        ret = br_stub_get_inode_ctx (this, inode, &tmp_ctx); +        if (ret) { +                dict_del (dict, BITROT_SIGNING_VERSION_KEY); +                goto out; +        } + +        ret = -1; +        ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx; + +        LOCK (&inode->lock); +        { +                if (ctx->currentversion == sbuf->signedversion) +                        ret = 0; +                else +                        gf_log (this->name, GF_LOG_WARNING, "current version " +                                "%lu and version of the signature %lu are not " +                                "same", ctx->currentversion, +                                sbuf->signedversion); +        } +        UNLOCK (&inode->lock); + +out: +        return ret; +} + +static inline int  br_stub_prepare_signature (xlator_t *this, dict_t *dict,                             inode_t *inode, br_isignature_t *sign)  { @@ -577,6 +505,11 @@ br_stub_prepare_signature (xlator_t *this, dict_t *dict,          ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen);          if (ret)                  goto dealloc_versions; + +        ret = br_stub_compare_sign_version (this, inode, sbuf, dict); +        if (ret) +                goto dealloc_versions; +          return 0;   dealloc_versions: @@ -620,6 +553,8 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,          if (ret)                  goto unwind; +        gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu", +                sign->signedversion);   wind:          STACK_WIND (frame, default_setxattr_cbk,                      FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd, @@ -865,77 +800,598 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this,          return 0;  } -/** }}} */ +/** + * The first write response on the first fd in the list of fds will set + * the flag to indicate that the inode is modified. The subsequent write + * respnses coming on either the first fd or some other fd will not change + * the fd. The inode-modified flag is unset only upon release of all the + * fds. + */ +int32_t +br_stub_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                    int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                    struct iatt *postbuf, dict_t *xdata) +{ +        int32_t              ret         = 0; +        uint64_t             ctx_addr    = 0; +        br_stub_inode_ctx_t *ctx         = NULL; +        br_stub_local_t     *local       = NULL; + +        if (frame->local) { +                local = frame->local; +                frame->local = NULL; +        } +        if (op_ret < 0) +                goto unwind; -/** {{{ */ +        ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, +                                     &ctx_addr); +        if (ret < 0) +                goto unwind; -/* open() */ +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; -int -br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, -                  int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +        /* Mark the flag to indicate the inode has been modified */ +        LOCK (&local->u.context.fd->inode->lock); +        { +                if (!__br_stub_is_inode_modified (ctx)) +                        __br_stub_set_inode_modified (ctx); +        } +        UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: +        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, +                             xdata); +        br_stub_cleanup_local (local); +        br_stub_dealloc_local (local); +        return 0; +} + +/** + * Ongoing version is increased only for the first modify operation. + * First modify version means the first write or truncate call coming on the + * first fd in the list of inodes. + * For anonymous fds open would not have come, so check if its the first write + * by doing both inode dirty check and ensuring list of fds is empty + */ +static inline gf_boolean_t +br_stub_inc_version (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)  { -        int32_t              ret      = 0; -        uint64_t             ctx_addr = 0; -        br_stub_inode_ctx_t *ctx      = NULL; -        call_stub_t         *stub     = NULL; +        gf_boolean_t inc_version = _gf_false; + +        GF_VALIDATE_OR_GOTO (this->name, fd, out); +        GF_VALIDATE_OR_GOTO (this->name, ctx, out); + +        LOCK (&fd->inode->lock); +        { +                if (__br_stub_is_inode_dirty (ctx)) +                        inc_version = _gf_true; +        } +        UNLOCK (&fd->inode->lock); + +out: +        return inc_version; +} + +/** + * Since NFS does not do open, writes from NFS are sent over an anonymous + * fd. It means each write fop might come on a different anonymous fd and + * will lead to very large number of notifications being sent. It might + * affect the perfromance as, there will too many sign requests. + * To avoid that whenever the last fd released from an inode (logical release) + * is an anonymous fd the release notification is sent with a flag being set + * __br_stub_anon_release (ctx); + * BitD checks for the flag and if set, it will send a dummy write request + * (again on an anonymous fd) instead of triggering sign. + * Bit-rot-stub should identify such dummy writes and should send success to + * them instead of winding them downwards. + */ +gf_boolean_t +br_stub_dummy_write (call_frame_t *frame) +{ +        return (frame->root->pid == GF_CLIENT_PID_BITD) +                        ? _gf_true : _gf_false; +} + +int32_t +br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ +        int32_t  ret = -1; +        br_stub_fd_t *br_stub_fd = NULL; + +        br_stub_fd = br_stub_fd_ctx_get (this, fd); +        if (!br_stub_fd) { +                ret = br_stub_add_fd_to_inode (this, fd, ctx); +                if (ret) { +                        gf_log (this->name, GF_LOG_ERROR, "failed to " +                                "add fd to the inode (gfid: %s)", +                                uuid_utoa (fd->inode->gfid)); +                        goto out; +                } +        } + +        ret = 0; + +out: +        return ret; +} + +int32_t +br_stub_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, +                       struct iovec *vector, int32_t count, off_t offset, +                       uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ +        if (frame->root->pid == GF_CLIENT_PID_BITD) +                br_stub_writev_cbk (frame, NULL, this, vector->iov_len, 0, +                                    NULL, NULL, NULL); +        else +                STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->writev, fd, vector, count, +                            offset, flags, iobref, xdata); +        return 0; +} + +/** +   TODO: If possible add pictorial represention of below comment. + +   Before sending writev on the ANONYMOUS FD, increase the ongoing +   version first. This brings anonymous fd write closer to the regular +   fd write by having the ongoing version increased before doing the +   write (In regular fd, after open the ongoing version is incremented). +   Do following steps to handle writes on anonymous fds: +   1) Increase the on-disk ongoing version +   2) Once versioning is successfully done send write operation. If versioning +      fails, then fail the write fop. +   3) In writev_cbk do below things: +      a) Increase in-memory version +      b) set the fd context (so that br_stub_release is invoked) +      c) add the fd to the list of fds maintained in the inode context of +         bitrot-stub. +      d) Mark inode as non dirty +      e) Mard inode as modified (in the inode context) +**/ +int32_t +br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, +                struct iovec *vector, int32_t count, off_t offset, +                uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ +        br_stub_local_t     *local       = NULL; +        call_stub_t         *stub        = NULL; +        int32_t              op_ret      = -1; +        int32_t              op_errno    = EINVAL; +        gf_boolean_t         inc_version = _gf_false; +        br_stub_inode_ctx_t *ctx         = NULL; +        uint64_t             ctx_addr    = 0; +        int32_t              ret         = -1; + +        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); +        GF_VALIDATE_OR_GOTO (this->name, frame, unwind); +        GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + +        local = br_stub_alloc_local (this); +        if (!local) { +                gf_log (this->name, GF_LOG_ERROR, "local allocation failed " +                        "(gfid: %s)", uuid_utoa (fd->inode->gfid)); +                op_ret = -1; +                op_errno = ENOMEM; +                goto unwind; +        } + +        local->u.context.fd = fd_ref (fd); +        frame->local = local; + +        ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " +                        "context for the inode %s", +                        uuid_utoa (fd->inode->gfid)); +                goto unwind; +        } + +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; +        if (fd_is_anonymous (fd)) { +                ret = br_stub_anon_fd_ctx (this, fd, ctx); +                if (ret) +                        goto unwind; +        } + +        /* TODO: Better to do a dummy fsetxattr instead of write. Keep write +           simple */ +        if (br_stub_dummy_write (frame)) { +                LOCK (&fd->inode->lock); +                { +                        (void) __br_stub_inode_sign_state +                                             (ctx, GF_FOP_WRITE, fd); +                } +                UNLOCK (&fd->inode->lock); + +                if (xdata && dict_get (xdata, "br-fd-reopen")) { +                        op_ret = vector->iov_len; +                        op_errno = 0; +                        goto unwind; +                } +        } + +        /** +         * Check whether this is the first write on this inode since the last +         * sign notification has been sent. If so, do versioning. Otherwise +         * go ahead with the fop. +         */ +        inc_version = br_stub_inc_version (this, fd, ctx); +        if (!inc_version) +                goto wind; + +        /* Create the stub for the write fop */ +        stub = fop_writev_stub (frame, br_stub_writev_resume, fd, vector, count, +                                offset, flags, iobref, xdata); + +        if (!stub) { +                gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " +                        "write fop (gfid: %s), unwinding", +                        uuid_utoa (fd->inode->gfid)); +                goto unwind; +        } + +        /* Perform Versioning */ +        return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + +wind: +        STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, +                    flags, iobref, xdata); +        return 0; + +unwind: +        frame->local = NULL; +        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, NULL, +                             NULL); +        br_stub_cleanup_local (local); +        br_stub_dealloc_local (local); +        return 0; +} + +int32_t +br_stub_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                       struct iatt *postbuf, dict_t *xdata) +{ +        int32_t              ret         = 0; +        uint64_t             ctx_addr    = 0; +        br_stub_inode_ctx_t *ctx         = NULL; +        br_stub_local_t     *local       = NULL; + +        if (frame->local) { +                local = frame->local; +                frame->local = NULL; +        }          if (op_ret < 0)                  goto unwind; -        if (cookie != (void *) BR_STUB_REQUEST_COOKIE) + +        ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, +                                     &ctx_addr); +        if (ret < 0)                  goto unwind; +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + +        /* Mark the flag to indicate the inode has been modified */ +        LOCK (&local->u.context.fd->inode->lock); +        { +                if (!__br_stub_is_inode_modified (ctx)) +                        __br_stub_set_inode_modified (ctx); +        } +        UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: +        STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, +                             xdata); +        br_stub_cleanup_local (local); +        br_stub_dealloc_local (local); +        return 0; +} + +int32_t +br_stub_ftruncate_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, +                          off_t offset, dict_t *xdata) +{ +        STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); +        return 0; +} + +int32_t +br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, +                   off_t offset, dict_t *xdata) +{ +        br_stub_local_t     *local       = NULL; +        call_stub_t         *stub        = NULL; +        int32_t              op_ret      = -1; +        int32_t              op_errno    = EINVAL; +        gf_boolean_t         inc_version = _gf_false; +        br_stub_inode_ctx_t *ctx         = NULL; +        uint64_t             ctx_addr    = 0; +        int32_t              ret         = -1; + +        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); +        GF_VALIDATE_OR_GOTO (this->name, frame, unwind); +        GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + +        local = br_stub_alloc_local (this); +        if (!local) { +                gf_log (this->name, GF_LOG_ERROR, "local allocation failed " +                        "(gfid: %s)", uuid_utoa (fd->inode->gfid)); +                op_ret = -1; +                op_errno = ENOMEM; +                goto unwind; +        } + +        local->u.context.fd = fd_ref (fd); +        frame->local = local; +          ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); -        if (ret < 0) +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " +                        "context for the inode %s", +                        uuid_utoa (fd->inode->gfid));                  goto unwind; +        } + +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; +        if (fd_is_anonymous (fd)) { +                ret = br_stub_anon_fd_ctx (this, fd, ctx); +                if (ret) +                        goto unwind; +        } -        stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); +        /** +         * c.f. br_stub_writev() +         */ +        inc_version = br_stub_inc_version (this, fd, ctx); +        if (!inc_version) +                goto wind; + +        /* Create the stub for the ftruncate fop */ +        stub = fop_ftruncate_stub (frame, br_stub_ftruncate_resume, fd, offset, +                                   xdata);          if (!stub) { +                gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " +                        "ftruncate fop (gfid: %s), unwinding", +                        uuid_utoa (fd->inode->gfid)); +                goto unwind; +        } + +        /* Perform Versioning */ +        return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + +wind: +        STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); +        return 0; + +unwind: +        frame->local = NULL; +        STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, NULL, NULL, +                             NULL); +        br_stub_cleanup_local (local); +        br_stub_dealloc_local (local); +        return 0; +} + +int32_t +br_stub_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +                      struct iatt *postbuf, dict_t *xdata) +{ +        int32_t              ret         = 0; +        uint64_t             ctx_addr    = 0; +        br_stub_inode_ctx_t *ctx         = NULL; +        br_stub_local_t     *local       = NULL; + +        if (frame->local) { +                local = frame->local; +                frame->local = NULL; +        } + +        if (op_ret < 0) +                goto unwind; + +        ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, +                                     &ctx_addr); +        if (ret < 0) +                goto unwind; + +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + +        /* Mark the flag to indicate the inode has been modified */ +        LOCK (&local->u.context.fd->inode->lock); +        { +                if (!__br_stub_is_inode_modified (ctx)) +                        __br_stub_set_inode_modified (ctx); +        } +        UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: +        STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf, +                             xdata); +        br_stub_cleanup_local (local); +        br_stub_dealloc_local (local); +        return 0; +} + +int32_t +br_stub_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, +                          off_t offset, dict_t *xdata) +{ +        STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); +        return 0; +} + +/** + * Bit-rot-stub depends heavily on the fd based operations to for doing + * versioning and sending notification. It starts tracking the operation + * upon getting first fd based modify operation by doing versioning and + * sends notification when last fd using which the inode was modified is + * released. + * But for truncate there is no fd and hence it becomes difficult to do + * the versioning and send notification. It is handled by doing versioning + * on an anonymous fd. The fd will be valid till the completion of the + * truncate call. It guarantees that release on this anonymous fd will happen + * after the truncate call and notification is sent after the truncate call. + */ +int32_t +br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, +                  off_t offset, dict_t *xdata) +{ +        br_stub_local_t     *local       = NULL; +        call_stub_t         *stub        = NULL; +        int32_t              op_ret      = -1; +        int32_t              op_errno    = EINVAL; +        gf_boolean_t         inc_version = _gf_false; +        br_stub_inode_ctx_t *ctx         = NULL; +        uint64_t             ctx_addr    = 0; +        int32_t              ret         = -1; +        fd_t                *fd          = NULL; + +        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); +        GF_VALIDATE_OR_GOTO (this->name, frame, unwind); +        GF_VALIDATE_OR_GOTO (this->name, loc, unwind); +        GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind); + +        fd = fd_anonymous (loc->inode); +        if (!fd) { +                gf_log (this->name, GF_LOG_ERROR, "failed to create anonymous " +                        "fd for the inode %s", uuid_utoa (loc->inode->gfid)); +                goto unwind; +        } + +        local = br_stub_alloc_local (this); +        if (!local) { +                gf_log (this->name, GF_LOG_ERROR, "local allocation failed " +                        "(gfid: %s)", uuid_utoa (loc->inode->gfid));                  op_ret = -1; -                op_errno = EINVAL; +                op_errno = ENOMEM; +                goto unwind; +        } + +        local->u.context.fd = fd; +        frame->local = local; + +        ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " +                        "context for the inode %s", +                        uuid_utoa (fd->inode->gfid));                  goto unwind;          } +        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; +        ret = br_stub_anon_fd_ctx (this, local->u.context.fd, ctx); +        if (ret) +                goto unwind; +          /** -         * Ongoing version needs to be incremented. If the inode is not dirty, -         * things are simple: increment the ongoing version safely and be done. -         * If inode is dirty, a writeback to disk is required. This is tricky in -         * case of multiple open()'s as ongoing version needs to be incremented -         * on a successful writeback. It's probably safe to remember the ongoing -         * version before writeback and *assigning* it in the callback, but that -         * may lead to a trustable checksum to be treated as stale by scrubber -         * (the case where the in-memory ongoing version is lesser than the -         * on-disk version). Therefore, *all* open() calls (which might have -         * come in parallel) try to synchronize the next ongoing version to -         * disk. In the callback path, the winner marks the inode as synced -         * therby loosing open() calls become no-op's. +         * c.f. br_stub_writev()           */ -        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; -        return br_stub_perform_incversioning (this, frame, stub, fd, ctx); +        inc_version = br_stub_inc_version (this, fd, ctx); +        if (!inc_version) +                goto wind; - unwind: -        STACK_UNWIND_STRICT (open, frame, -                             op_ret, op_errno, fd, xdata); +        /* Create the stub for the truncate fop */ +        stub = fop_truncate_stub (frame, br_stub_truncate_resume, loc, offset, +                                  xdata); +        if (!stub) { +                gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " +                        "truncate fop (gfid: %s), unwinding", +                        uuid_utoa (fd->inode->gfid)); +                goto unwind; +        } + +        /* Perform Versioning */ +        return br_stub_perform_incversioning (this, frame, stub, +                                              local->u.context.fd, ctx); + +wind: +        STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); +        return 0; + +unwind: +        frame->local = NULL; +        STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, NULL, NULL, +                             NULL); +        br_stub_cleanup_local (local); +        br_stub_dealloc_local (local);          return 0;  } +/** }}} */ + + +/** {{{ */ + +/* open() */ + +/** + * It's probably worth mentioning a bit about why some of the housekeeping + * work is done in open() call path, rather than the callback path. + * Two (or more) open()'s in parallel can race and lead to a situation + * where a release() gets triggered (possibly after a series of write() + * calls) when *other* open()'s have still not reached callback path + * thereby having an active fd on an inode that is in process of getting + * signed with the current version. + * + * Maintaining fd list in the call path ensures that a release() would + * not be triggered if an open() call races ahead (followed by a close()) + * threby finding non-empty fd list. + */ +  int  br_stub_open (call_frame_t *frame, xlator_t *this,                loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)  { -        void *cookie = NULL; +        int32_t              ret      = -1; +        br_stub_inode_ctx_t *ctx      = NULL; +        uint64_t             ctx_addr = 0; + +        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); +        GF_VALIDATE_OR_GOTO (this->name, loc, unwind); +        GF_VALIDATE_OR_GOTO (this->name, fd, unwind); +        GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind); -        if (!flags) -                goto wind;          if (frame->root->pid == GF_CLIENT_PID_SCRUB)                  goto wind; -        cookie = (void *) BR_STUB_REQUEST_COOKIE; - wind: -        STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie, -                           FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, -                           loc, flags, fd, xdata); +        ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " +                        "context for the file %s (gfid: %s)", loc->path, +                        uuid_utoa (fd->inode->gfid)); +                goto unwind; +        } + +        ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; +        if (flags == O_RDONLY) +                goto wind; + +        ret = br_stub_add_fd_to_inode (this, fd, ctx); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "failed add fd to the list " +                        "(gfid: %s)", uuid_utoa (fd->inode->gfid)); +                goto unwind; +        } + +wind: +        STACK_WIND (frame, default_open_cbk, FIRST_CHILD (this), +                    FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata); +        return 0; +unwind: +        STACK_UNWIND_STRICT (open, frame, -1, EINVAL, NULL, NULL);          return 0;  } @@ -946,39 +1402,60 @@ br_stub_open (call_frame_t *frame, xlator_t *this,  /* creat() */ +/** + * This routine registers a release callback for the given fd and adds the + * fd to the inode context fd tracking list. + */ +int32_t +br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ +        int32_t       ret        = -1; +        br_stub_fd_t *br_stub_fd = NULL; + +        ret = br_stub_require_release_call (this, fd, &br_stub_fd); +        if (ret) { +                gf_log (this->name, GF_LOG_ERROR, "failed to set the fd " +                        "context for the file (gfid: %s)", +                        uuid_utoa (fd->inode->gfid)); +                goto out; +        } + +        LOCK (&fd->inode->lock); +        { +                list_add_tail (&ctx->fd_list, &br_stub_fd->list); +        } +        UNLOCK (&fd->inode->lock); + +        ret = 0; + +out: +        return ret; +} +  int  br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                      int op_ret, int op_errno, fd_t *fd, inode_t *inode,                      struct iatt *stbuf, struct iatt *preparent,                      struct iatt *postparent, dict_t *xdata)  { -        int32_t ret = 0; -        uint64_t ctx_addr = 0; -        call_stub_t *stub = NULL; -        br_stub_inode_ctx_t *ctx = NULL; +        int32_t              ret      = 0; +        uint64_t             ctx_addr = 0; +        br_stub_inode_ctx_t *ctx      = NULL; +        unsigned long        version  = BITROT_DEFAULT_CURRENT_VERSION;          if (op_ret < 0)                  goto unwind; -        stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode, -                                    stbuf, preparent, postparent, xdata); -        if (!stub) { -                op_ret = -1; -                op_errno = EINVAL; -                goto unwind; -        } -          ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); -        if (ret < 0) -                ctx_addr = 0; -        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; - -        /* see comment in br_stub_open_cbk().. */ -        return (ctx) -                ? br_stub_perform_incversioning (this, frame, stub, fd, ctx) -                : br_stub_perform_fullversioning (this, frame, stub, fd); +        if (ret < 0) { +                ret = br_stub_init_inode_versions (this, fd, inode, version, +                                                   _gf_true); +        } else { +                ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; +                ret = br_stub_add_fd_to_inode (this, fd, ctx); +        } - unwind: +unwind:          STACK_UNWIND_STRICT (create, frame, op_ret, op_errno,                               fd, inode, stbuf, preparent, postparent, xdata);          return 0; @@ -989,10 +1466,20 @@ br_stub_create (call_frame_t *frame,                  xlator_t *this, loc_t *loc, int32_t flags,                  mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)  { +        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); +        GF_VALIDATE_OR_GOTO (this->name, loc, unwind); +        GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind); +        GF_VALIDATE_OR_GOTO (this->name, fd, unwind); +        GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind); +          STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this),                      FIRST_CHILD (this)->fops->create,                      loc, flags, mode, umask, fd, xdata);          return 0; +unwind: +        STACK_UNWIND_STRICT (create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, +                             NULL, NULL); +        return 0;  }  /** }}} */ @@ -1011,21 +1498,11 @@ br_stub_lookup_version (xlator_t *this,           * out the correct version to use in the inode context (start with           * the default version if unavailable). As of now versions are not           * persisted on-disk. The inode is marked dirty, so that the first -         * operation (such as open(), etc..) would trigger synchronization -         * to disk. +         * operation (such as write(), etc..) triggers synchronization to +         * disk.           */          status = br_version_xattr_state (xattr, &obuf, &sbuf); -        /** -         * stub does not know how to handle presence of signature but not -         * the object version, therefore, in such cases, bail out.. -         */ -        if (status == BR_VXATTR_STATUS_INVALID) { -                gf_log (this->name, GF_LOG_ERROR, "Invalid versioning xattrs. " -                        "Bailing out [GFID: %s]", uuid_utoa (gfid)); -                return -1; -        } -          version = ((status == BR_VXATTR_STATUS_FULL)                     || (status == BR_VXATTR_STATUS_UNSIGNED))                          ? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION; @@ -1259,8 +1736,8 @@ br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this,  }  static inline void -br_stub_send_ipc_fop (xlator_t *this, -                      fd_t *fd, unsigned long releaseversion, int32_t flags) +br_stub_send_ipc_fop (xlator_t *this, fd_t *fd, unsigned long releaseversion, +                      int sign_info)  {          int32_t op = 0;          int32_t ret = 0; @@ -1269,8 +1746,8 @@ br_stub_send_ipc_fop (xlator_t *this,          changelog_event_t ev = {0,};          ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE; -        ev.u.releasebr.flags = flags;          ev.u.releasebr.version = releaseversion; +        ev.u.releasebr.sign_info = sign_info;          gf_uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid);          xdata = dict_new (); @@ -1305,14 +1782,67 @@ br_stub_send_ipc_fop (xlator_t *this,          return;  } +/** + * This is how the state machine of sign info works: + * 3 states: + * 1) BR_SIGN_NORMAL => The default State of the inode + * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen + * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign + * 2 events: + * 1) GF_FOP_RELEASE + * 2) GF_FOP_WRITE (actually a dummy write fro BitD) + * + * This is how states are changed based on events: + * EVENT: GF_FOP_RELEASE: + * if (state == BR_SIGN_NORMAL) ; then + *     set state = BR_SIGN_REOPEN_WAIT; + * if (state == BR_SIGN_QUICK); then + *     set state = BR_SIGN_NORMAL; + * EVENT: GF_FOP_WRITE: + *  if (state == BR_SIGN_REOPEN_WAIT); then + *     set state = BR_SIGN_QUICK; + */ +br_sign_state_t +__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx, +                            glusterfs_fop_t fop, fd_t *fd) +{ +        br_sign_state_t sign_info = BR_SIGN_INVALID; + +        switch (fop) { + +        case GF_FOP_WRITE: +                sign_info = ctx->info_sign = BR_SIGN_QUICK; +                break; + +        case GF_FOP_RELEASE: +                GF_ASSERT (ctx->info_sign != BR_SIGN_REOPEN_WAIT); + +                if (ctx->info_sign == BR_SIGN_NORMAL) { +                        sign_info = ctx->info_sign = BR_SIGN_REOPEN_WAIT; +                } else { +                        sign_info = ctx->info_sign; +                        ctx->info_sign = BR_SIGN_NORMAL; +                } + +                break; +        default: +                break; +        } + +        return sign_info; +} +  int32_t  br_stub_release (xlator_t *this, fd_t *fd)  { -        int32_t ret = 0; -        int32_t flags = 0; -        inode_t *inode = NULL; -        unsigned long releaseversion = 0; -        br_stub_inode_ctx_t *ctx = NULL; +        int32_t              ret            = 0; +        int32_t              flags          = 0; +        inode_t             *inode          = NULL; +        unsigned long        releaseversion = 0; +        br_stub_inode_ctx_t *ctx            = NULL; +        uint64_t             tmp            = 0; +        br_stub_fd_t        *br_stub_fd     = NULL; +        int32_t              signinfo       = 0;          inode = fd->inode; @@ -1321,12 +1851,23 @@ br_stub_release (xlator_t *this, fd_t *fd)                  ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL);                  if (ctx == NULL)                          goto unblock; -                __br_stub_track_release (ctx); +                br_stub_fd = br_stub_fd_ctx_get (this, fd); +                if (br_stub_fd) { +                        list_del_init (&br_stub_fd->list); +                } +                  ret = __br_stub_can_trigger_release -                                 (inode, ctx, &releaseversion, &flags); -                if (ret) { -                        GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0); +                                    (inode, ctx, &releaseversion); +                if (!ret) +                        goto unblock; + +                signinfo = __br_stub_inode_sign_state (ctx, GF_FOP_RELEASE, fd); +                signinfo = htonl (signinfo); + +                /* inode back to initital state: mark dirty */ +                if (ctx->info_sign == BR_SIGN_NORMAL) {                          __br_stub_mark_inode_dirty (ctx); +                        __br_stub_unset_inode_modified (ctx);                  }          }   unblock: @@ -1334,10 +1875,17 @@ br_stub_release (xlator_t *this, fd_t *fd)          if (ret) {                  gf_log (this->name, GF_LOG_DEBUG, -                        "releaseversion: %lu|flags: %d", releaseversion, flags); -                br_stub_send_ipc_fop (this, fd, releaseversion, flags); +                        "releaseversion: %lu | flags: %d | signinfo: %d", +                        (unsigned long) ntohl (releaseversion), +                        flags, ntohl(signinfo)); +                br_stub_send_ipc_fop (this, fd, releaseversion, signinfo);          } +        ret = fd_ctx_del (fd, this, &tmp); +        br_stub_fd = (br_stub_fd_t *)(long)tmp; + +        GF_FREE (br_stub_fd); +          return 0;  } @@ -1351,11 +1899,12 @@ void  br_stub_ictxmerge (xlator_t *this, fd_t *fd,                     inode_t *inode, inode_t *linked_inode)  { -        int32_t ret = 0; -        uint64_t ctxaddr = 0; -        uint64_t lctxaddr = 0; -        br_stub_inode_ctx_t *ctx = NULL; -        br_stub_inode_ctx_t *lctx = NULL; +        int32_t              ret        = 0; +        uint64_t             ctxaddr    = 0; +        uint64_t             lctxaddr   = 0; +        br_stub_inode_ctx_t *ctx        = NULL; +        br_stub_inode_ctx_t *lctx       = NULL; +        br_stub_fd_t        *br_stub_fd = NULL;          ret = br_stub_get_inode_ctx (this, inode, &ctxaddr);          if (ret < 0) @@ -1369,29 +1918,15 @@ br_stub_ictxmerge (xlator_t *this, fd_t *fd,                          goto unblock;                  lctx = (br_stub_inode_ctx_t *) lctxaddr; -                if (__br_stub_is_inode_dirty (lctx)) { -                        /** -                         * RACY code: An inode can end up in this situation -                         * after a lookup() or after a create() followed by -                         * a release(). Even if we distinguish b/w the two, -                         * there needs to be more infrastructure built up -                         * in stub to handle these races. Note, that it's -                         * probably OK to ignore the race iff the version -                         * was initialized on the very first lookup(), i.e., -                         * [ongoingversion: default]. -                         * -                         * FIXME: fixup races [create(1..n)/lookup(1..n)]. -                         */ -                        GF_ASSERT (lctx->currentversion -                                      == BITROT_DEFAULT_CURRENT_VERSION); -                        __br_stub_track_openfd (fd, lctx); -                        __br_stub_mark_inode_synced (lctx); -                } else { -                        GF_ASSERT (ctx->currentversion <= lctx->currentversion); -                        __br_stub_track_openfd (fd, lctx); +                GF_ASSERT (list_is_singular (&ctx->fd_list)); +                br_stub_fd = list_first_entry (&ctx->fd_list, br_stub_fd_t, +                                               list); +                if (br_stub_fd) { +                        GF_ASSERT (br_stub_fd->fd == fd); +                        list_move_tail (&br_stub_fd->list, &lctx->fd_list);                  }          } - unblock: +unblock:          UNLOCK (&linked_inode->lock);   done: @@ -1409,6 +1944,9 @@ struct xlator_fops fops = {          .getxattr  = br_stub_getxattr,          .fgetxattr = br_stub_fgetxattr,          .fsetxattr = br_stub_fsetxattr, +        .writev    = br_stub_writev, +        .truncate  = br_stub_truncate, +        .ftruncate = br_stub_ftruncate,  };  struct xlator_cbks cbks = {  | 
