diff options
-rw-r--r-- | libglusterfs/src/changelog.h | 2 | ||||
-rw-r--r-- | tests/bitrot/br-stub.c | 47 | ||||
-rw-r--r-- | tests/bitrot/br-stub.t | 21 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/bitd/bit-rot.c | 138 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/bitd/bit-rot.h | 6 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-common.h | 7 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h | 3 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.c | 1070 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.h | 216 |
9 files changed, 1156 insertions, 354 deletions
diff --git a/libglusterfs/src/changelog.h b/libglusterfs/src/changelog.h index 08307810704..6f86e5a54cd 100644 --- a/libglusterfs/src/changelog.h +++ b/libglusterfs/src/changelog.h @@ -42,9 +42,9 @@ struct ev_release { }; struct ev_release_br { - int32_t flags; unsigned long version; unsigned char gfid[16]; + int32_t sign_info; }; struct ev_changelog { diff --git a/tests/bitrot/br-stub.c b/tests/bitrot/br-stub.c index 6cb12ed0a8e..e164170bb83 100644 --- a/tests/bitrot/br-stub.c +++ b/tests/bitrot/br-stub.c @@ -31,8 +31,10 @@ brstub_validate_version (char *bpath, unsigned long version) if (ret < 0) goto err; - if (xv->ongoingversion != version) + if (xv->ongoingversion != version) { match = -1; + fprintf (stderr, "ongoingversion: %lu\n", xv->ongoingversion); + } free (xv); return match; @@ -42,11 +44,12 @@ brstub_validate_version (char *bpath, unsigned long version) } int -brstub_open_validation (char *filp, char *bpath, unsigned long startversion) +brstub_write_validation (char *filp, char *bpath, unsigned long startversion) { int fd1 = 0; int fd2 = 0; int ret = 0; + char *string = "string\n"; /* read only check */ fd1 = open (filp, O_RDONLY); @@ -55,18 +58,37 @@ brstub_open_validation (char *filp, char *bpath, unsigned long startversion) close (fd1); ret = brstub_validate_version (bpath, startversion); - if (ret < 0) + if (ret == 0) goto err; - /* single open (write/) check */ fd1 = open (filp, O_RDWR); if (fd1 < 0) goto err; - close (fd1); + ret = write (fd1, string, strlen (string)); + if (ret <= 0) + goto err; + /** + * Fsync is done so that the write call has properly reached the + * disk. For fuse mounts write-behind xlator would have held the + * writes with itself and for nfs, client would have held the + * write in its cache. So write fop would not have triggered the + * versioning as it would have not reached the bit-rot-stub. + */ + fsync (fd1); startversion++; ret = brstub_validate_version (bpath, startversion); + if (ret < 0) + goto err; + ret = write (fd1, string, strlen (string)); + if (ret <= 0) + goto err; + ret = brstub_validate_version (bpath, startversion); + if (ret < 0) + goto err; + + close (fd1); /* multi open (write/) check */ fd1 = open (filp, O_RDWR); if (fd1 < 0) @@ -74,13 +96,20 @@ brstub_open_validation (char *filp, char *bpath, unsigned long startversion) fd2 = open (filp, O_WRONLY); if (fd1 < 0) goto err; + + ret = write (fd1, string, strlen (string)); + if (ret <= 0) + goto err; + + ret = write (fd1, string, strlen (string)); + if (ret <= 0) + goto err; close (fd1); close (fd2); /** - * incremented once per open()/open().../close()/close() sequence + * incremented once per write()/write().../close()/close() sequence */ - startversion++; ret = brstub_validate_version (bpath, startversion); if (ret < 0) goto err; @@ -106,11 +135,11 @@ brstub_new_object_validate (char *filp, char *brick) printf ("Validating initial version..\n"); ret = brstub_validate_version (bpath, 1); - if (ret < 0) + if (ret == 0) goto err; printf ("Validating version on modifications..\n"); - ret = brstub_open_validation (filp, bpath, 1); + ret = brstub_write_validation (filp, bpath, 1); if (ret < 0) goto err; diff --git a/tests/bitrot/br-stub.t b/tests/bitrot/br-stub.t index 11d02418785..bab4c7cdbd1 100644 --- a/tests/bitrot/br-stub.t +++ b/tests/bitrot/br-stub.t @@ -2,6 +2,7 @@ . $(dirname $0)/../include.rc . $(dirname $0)/../volume.rc +. $(dirname $0)/../nfs.rc STUB_SOURCE=$(dirname $0)/br-stub.c STUB_EXEC=$(dirname $0)/br-stub @@ -17,9 +18,9 @@ EXPECT "$V0" volinfo_field $V0 'Volume Name'; EXPECT 'Created' volinfo_field $V0 'Status'; EXPECT '2' brick_count $V0 -## Turn off open-behind (stub does not work with anonfd yet..) -TEST $CLI volume set $V0 performance.open-behind off -EXPECT 'off' volinfo_field $V0 'performance.open-behind' +## Turn off write-behind (write-behind clubs writes together) +TEST $CLI volume set $V0 performance.write-behind off +#EXPECT 'off' volinfo_field $V0 'performance.open-behind' ## Start the volume TEST $CLI volume start $V0; @@ -27,6 +28,7 @@ EXPECT 'Started' volinfo_field $V0 'Status'; ## Mount the volume TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0; +TEST mount_nfs $H0:/$V0 $N0 nolock; ## Build stub C source build_tester $STUB_SOURCE -o $STUB_EXEC -I$(dirname $0)/../../xlators/features/bit-rot/src/stub @@ -34,11 +36,20 @@ TEST [ -e $STUB_EXEC ] ## create & check version fname="$M0/filezero" -touch $fname +touch $fname; backpath=$(get_backend_paths $fname) + +TEST $STUB_EXEC $fname $(dirname $backpath) + +rm -f $fname; + +## test nfs +fname="$N0/filezero" +touch $fname; # backpath remains same.. + TEST $STUB_EXEC $fname $(dirname $backpath) -## cleanups.. +##cleanups.. rm -f $STUB_EXEC cleanup; diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c index 5638b0f348b..61d461f897b 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot.c +++ b/xlators/features/bit-rot/src/bitd/bit-rot.c @@ -171,11 +171,11 @@ bitd_is_bad_file (xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd) if (fd) ret = syncop_fgetxattr (child->xl, fd, &xattr, - BITROT_OBJECT_BAD_KEY, NULL, + "trusted.glusterfs.bad-file", NULL, NULL); else if (loc) ret = syncop_getxattr (child->xl, loc, &xattr, - BITROT_OBJECT_BAD_KEY, NULL, + "trusted.glusterfs.bad-file", NULL, NULL); if (!ret) { @@ -484,6 +484,98 @@ br_log_object_path (xlator_t *this, char *op, op, path, strerror (op_errno)); } +static void +br_send_dummy_write (xlator_t *this, fd_t *fd, br_child_t *child, + dict_t *xdata) +{ + struct iovec iov = {0, }; + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + char *msg = NULL; + size_t size = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, child, out); + + msg = gf_strdup ("GLUSTERFS"); + if (!msg) + goto out; + + size = strlen (msg); + + iov.iov_base = msg; + iov.iov_len = size; + + iobref = iobref_new (); + if (!iobref) + goto free_msg; + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) + goto free_iobref; + + iobref_add (iobref, iobuf); + + iov_unload (iobuf_ptr (iobuf), &iov, 1); /* FIXME!!! */ + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = size; + + ret = syncop_writev (child->xl, fd, &iov, 1, 0, iobref, 0, xdata, NULL); + if (ret <= 0) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "dummy write failed (%s)", strerror (errno)); + goto free_iobuf; + } + + /* iobref_unbref() takes care of iobuf unref */ + ret = 0; + + free_iobuf: + iobuf_unref (iobuf); + free_iobref: + iobref_unref (iobref); + free_msg: + GF_FREE (msg); + out: + return; +} + +static void +br_object_handle_reopen (xlator_t *this, + br_object_t *object, inode_t *linked_inode) +{ + int32_t ret = -1; + dict_t *dict = NULL; + loc_t loc = {0, }; + + /** + * Here dict is purposefully not checked for NULL, because at any cost + * sending a re-open should not be missed. This re-open is an indication + * for the stub to properly mark inode's status. + */ + dict = dict_new (); + if (dict) { + /* TODO: Make it a #define */ + ret = dict_set_int32 (dict, "br-fd-reopen", 1); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Object reopen would trigger versioning."); + } + + loc.inode = inode_ref (linked_inode); + gf_uuid_copy (loc.gfid, linked_inode->gfid); + + br_trigger_sign (this, object->child, linked_inode, &loc, dict); + + if (dict) + dict_unref (dict); + loc_wipe (&loc); +} + /** * Sign a given object. This routine runs full throttle. There needs to be * some form of priority scheduling and/or read burstness to avoid starving @@ -497,6 +589,7 @@ static inline int32_t br_sign_object (br_object_t *object) fd_t *fd = NULL; struct iatt iatt = {0, }; pid_t pid = GF_CLIENT_PID_BITD; + br_sign_state_t sign_info = BR_SIGN_NORMAL; GF_VALIDATE_OR_GOTO ("bit-rot", object, out); @@ -515,6 +608,20 @@ static inline int32_t br_sign_object (br_object_t *object) goto out; } + /* sanity check */ + sign_info = ntohl (object->sign_info); + GF_ASSERT (sign_info != BR_SIGN_NORMAL); + + /** + * For fd's that have notified for reopening, we send an explicit + * open() followed by a dummy write() call. This triggers the + * actual signing of the object. + */ + if (sign_info == BR_SIGN_REOPEN_WAIT) { + br_object_handle_reopen (this, object, linked_inode); + goto unref_inode; + } + ret = br_object_open (this, object, linked_inode, &fd); if (!fd) { br_log_object (this, "open", object->gfid, -ret); @@ -648,6 +755,7 @@ br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev) /* NOTE: it's BE, but no worry */ object->signedversion = ev->u.releasebr.version; + object->sign_info = ev->u.releasebr.sign_info; out: return object; @@ -693,7 +801,6 @@ br_brick_callback (void *xl, char *brick, xlator_t *this = NULL; br_object_t *object = NULL; br_child_t *child = NULL; - int32_t flags = 0; struct gf_tw_timer_list *timer = NULL; this = xl; @@ -710,14 +817,6 @@ br_brick_callback (void *xl, char *brick, gf_log (this->name, GF_LOG_DEBUG, "RELEASE EVENT [GFID %s]", uuid_utoa (gfid)); - flags = (int32_t)ntohl (ev->u.releasebr.flags); - if (flags == O_RDONLY) { - gf_log (this->name, GF_LOG_DEBUG, - "Read only fd [GFID: %s], ignoring signing..", - uuid_utoa (gfid)); - goto out; - } - child = br_get_child_from_brick_path (this, brick); if (!child) { gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume " @@ -804,12 +903,15 @@ out: return need_sign; } -static inline void +void br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, - loc_t *loc) + loc_t *loc, dict_t *xdata) { fd_t *fd = NULL; int32_t ret = -1; + pid_t pid = GF_CLIENT_PID_BITD; + + syncopctx_setfspid (&pid); fd = fd_create (linked_inode, 0); if (!fd) { @@ -828,8 +930,10 @@ br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, fd_bind (fd); } - if (fd) + if (fd) { + br_send_dummy_write (this, fd, child, xdata); syncop_close (fd); + } out: return; @@ -972,7 +1076,7 @@ bitd_oneshot_crawl (xlator_t *subvol, gf_log (this->name, GF_LOG_INFO, "Triggering signing for %s [GFID: %s | Brick: %s]", loc.path, uuid_utoa (linked_inode->gfid), child->brick_path); - br_trigger_sign (this, child, linked_inode, &loc); + br_trigger_sign (this, child, linked_inode, &loc, NULL); ret = 0; @@ -1600,7 +1704,9 @@ struct xlator_cbks cbks; struct volume_options options[] = { { .key = {"expiry-time"}, .type = GF_OPTION_TYPE_INT, - .default_value = "120", + /* Let the default timer be half the value of the wait time for + * sining (which is 120 as of now) */ + .default_value = "60", .description = "default time duration for which an object waits " "before it is signed", }, diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h index 66515e3213c..bbaf86fa65f 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot.h +++ b/xlators/features/bit-rot/src/bitd/bit-rot.h @@ -152,6 +152,8 @@ struct br_object { be signed */ br_child_t *child; /* object's subvolume */ + int sign_info; + struct list_head list; /* hook to add to the queue once the object is expired from timer wheel */ void *data; @@ -175,4 +177,8 @@ br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *); gf_boolean_t bitd_is_bad_file (xlator_t *, br_child_t *, loc_t *, fd_t *); +void +br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, + loc_t *loc, dict_t *xdata); + #endif /* __BIT_ROT_H__ */ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h index 699323170d3..7fd584e5970 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-common.h +++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h @@ -33,6 +33,13 @@ typedef enum br_vxattr_state { BR_VXATTR_STATUS_INVALID = 3, } br_vxattr_status_t; +typedef enum br_sign_state { + BR_SIGN_INVALID = -1, + BR_SIGN_NORMAL = 0, + BR_SIGN_REOPEN_WAIT = 1, + BR_SIGN_QUICK = 2, +} br_sign_state_t; + static inline br_vxattr_status_t br_version_xattr_state (dict_t *xattr, br_version_t **obuf, br_signature_t **sbuf) diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h index 46271407219..9f6da89032f 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h @@ -28,7 +28,8 @@ enum br_mem_types { gf_br_mt_br_tbf_opspec_t, gf_br_mt_br_scrubber_t, gf_br_mt_br_fsscan_entry_t, - gf_br_stub_mt_end + gf_br_stub_mt_br_stub_fd_t, + gf_br_stub_mt_end, }; #endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c index f9c3886948a..93db072f671 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -198,14 +198,15 @@ br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode, if (!ctx) goto error_return; + INIT_LIST_HEAD (&ctx->fd_list); (markdirty) ? __br_stub_mark_inode_dirty (ctx) : __br_stub_mark_inode_synced (ctx); __br_stub_set_ongoing_version (ctx, version); - __br_stub_reset_release_counters (ctx); if (fd) { - br_stub_require_release_call (this, fd); - __br_stub_track_openfd (fd, ctx); + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) + goto free_ctx; } ret = br_stub_set_inode_ctx (this, inode, ctx); if (ret) @@ -238,7 +239,6 @@ br_stub_mod_inode_versions (xlator_t *this, __br_stub_mark_inode_synced (ctx); } - __br_stub_track_openfd (fd, ctx); ret = 0; } unblock: @@ -250,19 +250,16 @@ br_stub_mod_inode_versions (xlator_t *this, static inline void br_stub_fill_local (br_stub_local_t *local, call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid, - int versioningtype, unsigned long memversion, int dirty) + int versioningtype, unsigned long memversion) { local->fopstub = stub; local->versioningtype = versioningtype; local->u.context.version = memversion; - if (fd) + if (fd && !local->u.context.fd) local->u.context.fd = fd_ref (fd); if (inode) local->u.context.inode = inode_ref (inode); gf_uuid_copy (local->u.context.gfid, gfid); - - /* mark inode dirty/fresh according to durability */ - local->u.context.markdirty = (dirty) ? _gf_true : _gf_false; } static inline void @@ -279,57 +276,13 @@ br_stub_cleanup_local (br_stub_local_t *local) inode_unref (local->u.context.inode); local->u.context.inode = NULL; } - local->u.context.markdirty = _gf_true; memset (local->u.context.gfid, '\0', sizeof (uuid_t)); } /** - * callback for inode/fd full versioning + * callback for inode/fd versioning */ int -br_stub_inode_fullversioning_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) -{ - fd_t *fd = NULL; - inode_t *inode = NULL; - unsigned long version = 0; - gf_boolean_t dirty = _gf_true; - br_stub_local_t *local = NULL; - - local = (br_stub_local_t *)frame->local; - - /* be graceful to EEXIST */ - if ((op_ret < 0) && (op_errno == EEXIST)) { - op_ret = 0; - goto done; - } - - if (op_ret < 0) - goto done; - - fd = local->u.context.fd; - inode = local->u.context.inode; - version = local->u.context.version; - dirty = local->u.context.markdirty; - - op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty); - if (op_ret < 0) - op_errno = EINVAL; - - done: - frame->local = NULL; - if (op_ret < 0) - call_unwind_error (local->fopstub, op_ret, op_errno); - else - call_resume (local->fopstub); - br_stub_cleanup_local (local); - br_stub_dealloc_local (local); - - return 0; -} - -int br_stub_fd_incversioning_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) @@ -351,14 +304,14 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame, op_errno = EINVAL; done: - frame->local = NULL; - if (op_ret < 0) + if (op_ret < 0) { + frame->local = NULL; call_unwind_error (local->fopstub, -1, op_errno); - else + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + } else { call_resume (local->fopstub); - br_stub_cleanup_local (local); - br_stub_dealloc_local (local); - + } return 0; } @@ -366,28 +319,27 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame, * Initial object versioning * * Version persists two (2) extended attributes as explained below: - * 1. Current (ongoing) version: This is incremented on an open() - * or creat() and is the running version for an object. + * 1. Current (ongoing) version: This is incremented on an writev () + * or truncate () and is the running version for an object. * 2. Signing version: This is the version against which an object * was signed (checksummed). * * During initial versioning, both ongoing and signing versions are - * set of one and zero respectively. An open() call increments the + * set of one and zero respectively. A write() call increments the * ongoing version as an indication of modification to the object. * Additionally this needs to be persisted on disk and needs to be * durable: fsync().. :-/ - * As an optimization only the first open() synchronizes the ongoing - * version to disk, subsequent open()s before the *last* release() + * As an optimization only the first write() synchronizes the ongoing + * version to disk, subsequent write()s before the *last* release() * are no-op's. * * create(), just like lookup() initializes the object versions to - * the default, but persists the version to disk. As an optimization - * this is not a durable operation: in case of a crash, hard reboot - * etc.. absence of versioning xattrs is ignored in scrubber along - * with the one time crawler explicitly triggering signing for such - * objects. + * the default. As an optimization this is not a durable operation: + * in case of a crash, hard reboot etc.. absence of versioning xattrs + * is ignored in scrubber along with the one time crawler explicitly + * triggering signing for such objects. * - * c.f. br_stub_open_cbk() / br_stub_create_cbk() + * c.f. br_stub_writev() / br_stub_truncate() */ /** @@ -400,7 +352,7 @@ int br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, call_stub_t *stub, dict_t *dict, fd_t *fd, br_stub_version_cbk *callback, unsigned long memversion, - int versioningtype, int durable, int dirty) + int versioningtype, int durable) { int32_t ret = -1; int flags = 0; @@ -421,18 +373,11 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, goto dealloc_xdata; } - local = br_stub_alloc_local (this); - if (!local) { - ret = -1; - goto dealloc_xdata; - } - - if (versioningtype == BR_STUB_FULL_VERSIONING) - flags |= XATTR_CREATE; + local = frame->local; br_stub_fill_local (local, stub, fd, fd->inode, fd->inode->gfid, - versioningtype, memversion, dirty); + versioningtype, memversion); frame->local = local; STACK_WIND (frame, callback, @@ -448,82 +393,21 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, } static inline int -br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame, - call_stub_t *stub, fd_t *fd) -{ - int32_t ret = -1; - dict_t *dict = NULL; - br_version_t *obuf = NULL; - int op_errno = 0; - - op_errno = ENOMEM; - dict = dict_new (); - if (!dict) - goto done; - ret = br_stub_alloc_versions (&obuf, NULL, 0); - if (ret) - goto dealloc_dict; - - op_errno = EINVAL; - ret = br_stub_prepare_version_request (this, dict, obuf, - BITROT_DEFAULT_CURRENT_VERSION); - if (ret) - goto dealloc_versions; - - /** - * Version extended attributes need not be durable at this point of - * time. If the objects (inode) data gets persisted on disk but the - * version extended attributes are lost due to a crash/power failure, - * a subsequent lookup marks the objects signature as stale. This way, - * dentry operation times do not shoot up. - */ - ret = br_stub_fd_versioning (this, frame, stub, dict, fd, - br_stub_inode_fullversioning_cbk, - BITROT_DEFAULT_CURRENT_VERSION, - BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0); - - dealloc_versions: - br_stub_dealloc_versions (obuf); - dealloc_dict: - dict_unref (dict); - done: - if (ret) - call_unwind_error (stub, -1, op_errno); - return ret; -} - -static inline int br_stub_perform_incversioning (xlator_t *this, call_frame_t *frame, call_stub_t *stub, fd_t *fd, br_stub_inode_ctx_t *ctx) { - int32_t ret = -1; - dict_t *dict = NULL; - inode_t *inode = NULL; - br_version_t *obuf = NULL; - unsigned long writeback_version = 0; - int op_errno = 0; - - inode = fd->inode; + int32_t ret = -1; + dict_t *dict = NULL; + br_version_t *obuf = NULL; + unsigned long writeback_version = 0; + int op_errno = 0; + br_stub_local_t *local = NULL; op_errno = EINVAL; - ret = br_stub_require_release_call (this, fd); - if (ret) - goto done; - - LOCK (&inode->lock); - { - if (__br_stub_is_inode_dirty (ctx)) - writeback_version = __br_stub_writeback_version (ctx); - else - __br_stub_track_openfd (fd, ctx); - } - UNLOCK (&inode->lock); + local = frame->local; - if (!writeback_version) { - ret = 0; - goto done; - } + writeback_version = __br_stub_writeback_version (ctx); /* inode requires writeback to disk */ op_errno = ENOMEM; @@ -541,17 +425,23 @@ br_stub_perform_incversioning (xlator_t *this, ret = br_stub_fd_versioning (this, frame, stub, dict, fd, br_stub_fd_incversioning_cbk, writeback_version, - BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0); + BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE); dealloc_versions: br_stub_dealloc_versions (obuf); dealloc_dict: dict_unref (dict); done: - if (!ret && !writeback_version) - call_resume (stub); - if (ret) + if (ret) { + if (local) + frame->local = NULL; call_unwind_error (stub, -1, op_errno); + if (local) { + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + } + } + return ret; } @@ -560,6 +450,44 @@ br_stub_perform_incversioning (xlator_t *this, /* fsetxattr() */ static inline int +br_stub_compare_sign_version (xlator_t *this, inode_t *inode, + br_signature_t *sbuf, dict_t *dict) +{ + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t tmp_ctx = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, sbuf, out); + GF_VALIDATE_OR_GOTO (this->name, dict, out); + + ret = br_stub_get_inode_ctx (this, inode, &tmp_ctx); + if (ret) { + dict_del (dict, BITROT_SIGNING_VERSION_KEY); + goto out; + } + + ret = -1; + ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx; + + LOCK (&inode->lock); + { + if (ctx->currentversion == sbuf->signedversion) + ret = 0; + else + gf_log (this->name, GF_LOG_WARNING, "current version " + "%lu and version of the signature %lu are not " + "same", ctx->currentversion, + sbuf->signedversion); + } + UNLOCK (&inode->lock); + +out: + return ret; +} + +static inline int br_stub_prepare_signature (xlator_t *this, dict_t *dict, inode_t *inode, br_isignature_t *sign) { @@ -577,6 +505,11 @@ br_stub_prepare_signature (xlator_t *this, dict_t *dict, ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen); if (ret) goto dealloc_versions; + + ret = br_stub_compare_sign_version (this, inode, sbuf, dict); + if (ret) + goto dealloc_versions; + return 0; dealloc_versions: @@ -620,6 +553,8 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this, if (ret) goto unwind; + gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu", + sign->signedversion); wind: STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd, @@ -865,77 +800,598 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this, return 0; } -/** }}} */ +/** + * The first write response on the first fd in the list of fds will set + * the flag to indicate that the inode is modified. The subsequent write + * respnses coming on either the first fd or some other fd will not change + * the fd. The inode-modified flag is unset only upon release of all the + * fds. + */ +int32_t +br_stub_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } + if (op_ret < 0) + goto unwind; -/** {{{ */ + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) + goto unwind; -/* open() */ + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; -int -br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd, dict_t *xdata) + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +/** + * Ongoing version is increased only for the first modify operation. + * First modify version means the first write or truncate call coming on the + * first fd in the list of inodes. + * For anonymous fds open would not have come, so check if its the first write + * by doing both inode dirty check and ensuring list of fds is empty + */ +static inline gf_boolean_t +br_stub_inc_version (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) { - int32_t ret = 0; - uint64_t ctx_addr = 0; - br_stub_inode_ctx_t *ctx = NULL; - call_stub_t *stub = NULL; + gf_boolean_t inc_version = _gf_false; + + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + LOCK (&fd->inode->lock); + { + if (__br_stub_is_inode_dirty (ctx)) + inc_version = _gf_true; + } + UNLOCK (&fd->inode->lock); + +out: + return inc_version; +} + +/** + * Since NFS does not do open, writes from NFS are sent over an anonymous + * fd. It means each write fop might come on a different anonymous fd and + * will lead to very large number of notifications being sent. It might + * affect the perfromance as, there will too many sign requests. + * To avoid that whenever the last fd released from an inode (logical release) + * is an anonymous fd the release notification is sent with a flag being set + * __br_stub_anon_release (ctx); + * BitD checks for the flag and if set, it will send a dummy write request + * (again on an anonymous fd) instead of triggering sign. + * Bit-rot-stub should identify such dummy writes and should send success to + * them instead of winding them downwards. + */ +gf_boolean_t +br_stub_dummy_write (call_frame_t *frame) +{ + return (frame->root->pid == GF_CLIENT_PID_BITD) + ? _gf_true : _gf_false; +} + +int32_t +br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = br_stub_fd_ctx_get (this, fd); + if (!br_stub_fd) { + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "add fd to the inode (gfid: %s)", + uuid_utoa (fd->inode->gfid)); + goto out; + } + } + + ret = 0; + +out: + return ret; +} + +int32_t +br_stub_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + if (frame->root->pid == GF_CLIENT_PID_BITD) + br_stub_writev_cbk (frame, NULL, this, vector->iov_len, 0, + NULL, NULL, NULL); + else + STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdata); + return 0; +} + +/** + TODO: If possible add pictorial represention of below comment. + + Before sending writev on the ANONYMOUS FD, increase the ongoing + version first. This brings anonymous fd write closer to the regular + fd write by having the ongoing version increased before doing the + write (In regular fd, after open the ongoing version is incremented). + Do following steps to handle writes on anonymous fds: + 1) Increase the on-disk ongoing version + 2) Once versioning is successfully done send write operation. If versioning + fails, then fail the write fop. + 3) In writev_cbk do below things: + a) Increase in-memory version + b) set the fd context (so that br_stub_release is invoked) + c) add the fd to the list of fds maintained in the inode context of + bitrot-stub. + d) Mark inode as non dirty + e) Mard inode as modified (in the inode context) +**/ +int32_t +br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd_ref (fd); + frame->local = local; + + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the inode %s", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + if (fd_is_anonymous (fd)) { + ret = br_stub_anon_fd_ctx (this, fd, ctx); + if (ret) + goto unwind; + } + + /* TODO: Better to do a dummy fsetxattr instead of write. Keep write + simple */ + if (br_stub_dummy_write (frame)) { + LOCK (&fd->inode->lock); + { + (void) __br_stub_inode_sign_state + (ctx, GF_FOP_WRITE, fd); + } + UNLOCK (&fd->inode->lock); + + if (xdata && dict_get (xdata, "br-fd-reopen")) { + op_ret = vector->iov_len; + op_errno = 0; + goto unwind; + } + } + + /** + * Check whether this is the first write on this inode since the last + * sign notification has been sent. If so, do versioning. Otherwise + * go ahead with the fop. + */ + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; + + /* Create the stub for the write fop */ + stub = fop_writev_stub (frame, br_stub_writev_resume, fd, vector, count, + offset, flags, iobref, xdata); + + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "write fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + +wind: + STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } if (op_ret < 0) goto unwind; - if (cookie != (void *) BR_STUB_REQUEST_COOKIE) + + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) goto unwind; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_ftruncate_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} + +int32_t +br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd_ref (fd); + frame->local = local; + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); - if (ret < 0) + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the inode %s", + uuid_utoa (fd->inode->gfid)); goto unwind; + } + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + if (fd_is_anonymous (fd)) { + ret = br_stub_anon_fd_ctx (this, fd, ctx); + if (ret) + goto unwind; + } - stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); + /** + * c.f. br_stub_writev() + */ + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; + + /* Create the stub for the ftruncate fop */ + stub = fop_ftruncate_stub (frame, br_stub_ftruncate_resume, fd, offset, + xdata); if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "ftruncate fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + +wind: + STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } + + if (op_ret < 0) + goto unwind; + + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) + goto unwind; + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} + +/** + * Bit-rot-stub depends heavily on the fd based operations to for doing + * versioning and sending notification. It starts tracking the operation + * upon getting first fd based modify operation by doing versioning and + * sends notification when last fd using which the inode was modified is + * released. + * But for truncate there is no fd and hence it becomes difficult to do + * the versioning and send notification. It is handled by doing versioning + * on an anonymous fd. The fd will be valid till the completion of the + * truncate call. It guarantees that release on this anonymous fd will happen + * after the truncate call and notification is sent after the truncate call. + */ +int32_t +br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + fd_t *fd = NULL; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind); + + fd = fd_anonymous (loc->inode); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "failed to create anonymous " + "fd for the inode %s", uuid_utoa (loc->inode->gfid)); + goto unwind; + } + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (loc->inode->gfid)); op_ret = -1; - op_errno = EINVAL; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd; + frame->local = local; + + ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the inode %s", + uuid_utoa (fd->inode->gfid)); goto unwind; } + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + ret = br_stub_anon_fd_ctx (this, local->u.context.fd, ctx); + if (ret) + goto unwind; + /** - * Ongoing version needs to be incremented. If the inode is not dirty, - * things are simple: increment the ongoing version safely and be done. - * If inode is dirty, a writeback to disk is required. This is tricky in - * case of multiple open()'s as ongoing version needs to be incremented - * on a successful writeback. It's probably safe to remember the ongoing - * version before writeback and *assigning* it in the callback, but that - * may lead to a trustable checksum to be treated as stale by scrubber - * (the case where the in-memory ongoing version is lesser than the - * on-disk version). Therefore, *all* open() calls (which might have - * come in parallel) try to synchronize the next ongoing version to - * disk. In the callback path, the winner marks the inode as synced - * therby loosing open() calls become no-op's. + * c.f. br_stub_writev() */ - ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; - return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; - unwind: - STACK_UNWIND_STRICT (open, frame, - op_ret, op_errno, fd, xdata); + /* Create the stub for the truncate fop */ + stub = fop_truncate_stub (frame, br_stub_truncate_resume, loc, offset, + xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "truncate fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, + local->u.context.fd, ctx); + +wind: + STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); return 0; } +/** }}} */ + + +/** {{{ */ + +/* open() */ + +/** + * It's probably worth mentioning a bit about why some of the housekeeping + * work is done in open() call path, rather than the callback path. + * Two (or more) open()'s in parallel can race and lead to a situation + * where a release() gets triggered (possibly after a series of write() + * calls) when *other* open()'s have still not reached callback path + * thereby having an active fd on an inode that is in process of getting + * signed with the current version. + * + * Maintaining fd list in the call path ensures that a release() would + * not be triggered if an open() call races ahead (followed by a close()) + * threby finding non-empty fd list. + */ + int br_stub_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { - void *cookie = NULL; + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind); - if (!flags) - goto wind; if (frame->root->pid == GF_CLIENT_PID_SCRUB) goto wind; - cookie = (void *) BR_STUB_REQUEST_COOKIE; - wind: - STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, - loc, flags, fd, xdata); + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the file %s (gfid: %s)", loc->path, + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + if (flags == O_RDONLY) + goto wind; + + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed add fd to the list " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + goto unwind; + } + +wind: + STACK_WIND (frame, default_open_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT (open, frame, -1, EINVAL, NULL, NULL); return 0; } @@ -946,39 +1402,60 @@ br_stub_open (call_frame_t *frame, xlator_t *this, /* creat() */ +/** + * This routine registers a release callback for the given fd and adds the + * fd to the inode context fd tracking list. + */ +int32_t +br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + ret = br_stub_require_release_call (this, fd, &br_stub_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to set the fd " + "context for the file (gfid: %s)", + uuid_utoa (fd->inode->gfid)); + goto out; + } + + LOCK (&fd->inode->lock); + { + list_add_tail (&ctx->fd_list, &br_stub_fd->list); + } + UNLOCK (&fd->inode->lock); + + ret = 0; + +out: + return ret; +} + int br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int32_t ret = 0; - uint64_t ctx_addr = 0; - call_stub_t *stub = NULL; - br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; if (op_ret < 0) goto unwind; - stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode, - stbuf, preparent, postparent, xdata); - if (!stub) { - op_ret = -1; - op_errno = EINVAL; - goto unwind; - } - ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; - - /* see comment in br_stub_open_cbk().. */ - return (ctx) - ? br_stub_perform_incversioning (this, frame, stub, fd, ctx) - : br_stub_perform_fullversioning (this, frame, stub, fd); + if (ret < 0) { + ret = br_stub_init_inode_versions (this, fd, inode, version, + _gf_true); + } else { + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + ret = br_stub_add_fd_to_inode (this, fd, ctx); + } - unwind: +unwind: STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, postparent, xdata); return 0; @@ -989,10 +1466,20 @@ br_stub_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind); + STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, loc, flags, mode, umask, fd, xdata); return 0; +unwind: + STACK_UNWIND_STRICT (create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; } /** }}} */ @@ -1011,21 +1498,11 @@ br_stub_lookup_version (xlator_t *this, * out the correct version to use in the inode context (start with * the default version if unavailable). As of now versions are not * persisted on-disk. The inode is marked dirty, so that the first - * operation (such as open(), etc..) would trigger synchronization - * to disk. + * operation (such as write(), etc..) triggers synchronization to + * disk. */ status = br_version_xattr_state (xattr, &obuf, &sbuf); - /** - * stub does not know how to handle presence of signature but not - * the object version, therefore, in such cases, bail out.. - */ - if (status == BR_VXATTR_STATUS_INVALID) { - gf_log (this->name, GF_LOG_ERROR, "Invalid versioning xattrs. " - "Bailing out [GFID: %s]", uuid_utoa (gfid)); - return -1; - } - version = ((status == BR_VXATTR_STATUS_FULL) || (status == BR_VXATTR_STATUS_UNSIGNED)) ? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION; @@ -1259,8 +1736,8 @@ br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this, } static inline void -br_stub_send_ipc_fop (xlator_t *this, - fd_t *fd, unsigned long releaseversion, int32_t flags) +br_stub_send_ipc_fop (xlator_t *this, fd_t *fd, unsigned long releaseversion, + int sign_info) { int32_t op = 0; int32_t ret = 0; @@ -1269,8 +1746,8 @@ br_stub_send_ipc_fop (xlator_t *this, changelog_event_t ev = {0,}; ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE; - ev.u.releasebr.flags = flags; ev.u.releasebr.version = releaseversion; + ev.u.releasebr.sign_info = sign_info; gf_uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid); xdata = dict_new (); @@ -1305,14 +1782,67 @@ br_stub_send_ipc_fop (xlator_t *this, return; } +/** + * This is how the state machine of sign info works: + * 3 states: + * 1) BR_SIGN_NORMAL => The default State of the inode + * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen + * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign + * 2 events: + * 1) GF_FOP_RELEASE + * 2) GF_FOP_WRITE (actually a dummy write fro BitD) + * + * This is how states are changed based on events: + * EVENT: GF_FOP_RELEASE: + * if (state == BR_SIGN_NORMAL) ; then + * set state = BR_SIGN_REOPEN_WAIT; + * if (state == BR_SIGN_QUICK); then + * set state = BR_SIGN_NORMAL; + * EVENT: GF_FOP_WRITE: + * if (state == BR_SIGN_REOPEN_WAIT); then + * set state = BR_SIGN_QUICK; + */ +br_sign_state_t +__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx, + glusterfs_fop_t fop, fd_t *fd) +{ + br_sign_state_t sign_info = BR_SIGN_INVALID; + + switch (fop) { + + case GF_FOP_WRITE: + sign_info = ctx->info_sign = BR_SIGN_QUICK; + break; + + case GF_FOP_RELEASE: + GF_ASSERT (ctx->info_sign != BR_SIGN_REOPEN_WAIT); + + if (ctx->info_sign == BR_SIGN_NORMAL) { + sign_info = ctx->info_sign = BR_SIGN_REOPEN_WAIT; + } else { + sign_info = ctx->info_sign; + ctx->info_sign = BR_SIGN_NORMAL; + } + + break; + default: + break; + } + + return sign_info; +} + int32_t br_stub_release (xlator_t *this, fd_t *fd) { - int32_t ret = 0; - int32_t flags = 0; - inode_t *inode = NULL; - unsigned long releaseversion = 0; - br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = 0; + int32_t flags = 0; + inode_t *inode = NULL; + unsigned long releaseversion = 0; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t tmp = 0; + br_stub_fd_t *br_stub_fd = NULL; + int32_t signinfo = 0; inode = fd->inode; @@ -1321,12 +1851,23 @@ br_stub_release (xlator_t *this, fd_t *fd) ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL); if (ctx == NULL) goto unblock; - __br_stub_track_release (ctx); + br_stub_fd = br_stub_fd_ctx_get (this, fd); + if (br_stub_fd) { + list_del_init (&br_stub_fd->list); + } + ret = __br_stub_can_trigger_release - (inode, ctx, &releaseversion, &flags); - if (ret) { - GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0); + (inode, ctx, &releaseversion); + if (!ret) + goto unblock; + + signinfo = __br_stub_inode_sign_state (ctx, GF_FOP_RELEASE, fd); + signinfo = htonl (signinfo); + + /* inode back to initital state: mark dirty */ + if (ctx->info_sign == BR_SIGN_NORMAL) { __br_stub_mark_inode_dirty (ctx); + __br_stub_unset_inode_modified (ctx); } } unblock: @@ -1334,10 +1875,17 @@ br_stub_release (xlator_t *this, fd_t *fd) if (ret) { gf_log (this->name, GF_LOG_DEBUG, - "releaseversion: %lu|flags: %d", releaseversion, flags); - br_stub_send_ipc_fop (this, fd, releaseversion, flags); + "releaseversion: %lu | flags: %d | signinfo: %d", + (unsigned long) ntohl (releaseversion), + flags, ntohl(signinfo)); + br_stub_send_ipc_fop (this, fd, releaseversion, signinfo); } + ret = fd_ctx_del (fd, this, &tmp); + br_stub_fd = (br_stub_fd_t *)(long)tmp; + + GF_FREE (br_stub_fd); + return 0; } @@ -1351,11 +1899,12 @@ void br_stub_ictxmerge (xlator_t *this, fd_t *fd, inode_t *inode, inode_t *linked_inode) { - int32_t ret = 0; - uint64_t ctxaddr = 0; - uint64_t lctxaddr = 0; - br_stub_inode_ctx_t *ctx = NULL; - br_stub_inode_ctx_t *lctx = NULL; + int32_t ret = 0; + uint64_t ctxaddr = 0; + uint64_t lctxaddr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_inode_ctx_t *lctx = NULL; + br_stub_fd_t *br_stub_fd = NULL; ret = br_stub_get_inode_ctx (this, inode, &ctxaddr); if (ret < 0) @@ -1369,29 +1918,15 @@ br_stub_ictxmerge (xlator_t *this, fd_t *fd, goto unblock; lctx = (br_stub_inode_ctx_t *) lctxaddr; - if (__br_stub_is_inode_dirty (lctx)) { - /** - * RACY code: An inode can end up in this situation - * after a lookup() or after a create() followed by - * a release(). Even if we distinguish b/w the two, - * there needs to be more infrastructure built up - * in stub to handle these races. Note, that it's - * probably OK to ignore the race iff the version - * was initialized on the very first lookup(), i.e., - * [ongoingversion: default]. - * - * FIXME: fixup races [create(1..n)/lookup(1..n)]. - */ - GF_ASSERT (lctx->currentversion - == BITROT_DEFAULT_CURRENT_VERSION); - __br_stub_track_openfd (fd, lctx); - __br_stub_mark_inode_synced (lctx); - } else { - GF_ASSERT (ctx->currentversion <= lctx->currentversion); - __br_stub_track_openfd (fd, lctx); + GF_ASSERT (list_is_singular (&ctx->fd_list)); + br_stub_fd = list_first_entry (&ctx->fd_list, br_stub_fd_t, + list); + if (br_stub_fd) { + GF_ASSERT (br_stub_fd->fd == fd); + list_move_tail (&br_stub_fd->list, &lctx->fd_list); } } - unblock: +unblock: UNLOCK (&linked_inode->lock); done: @@ -1409,6 +1944,9 @@ struct xlator_fops fops = { .getxattr = br_stub_getxattr, .fgetxattr = br_stub_fgetxattr, .fsetxattr = br_stub_fsetxattr, + .writev = br_stub_writev, + .truncate = br_stub_truncate, + .ftruncate = br_stub_ftruncate, }; struct xlator_cbks cbks = { diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h index 86090bfb877..69e212bb81f 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub.h +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h @@ -21,6 +21,7 @@ #include "xlator.h" #include "defaults.h" #include "call-stub.h" +#include "bit-rot-stub-mem-types.h" #include "bit-rot-common.h" @@ -32,17 +33,18 @@ typedef struct br_stub_inode_ctx { a writeback to disk? */ unsigned long currentversion; /* ongoing version */ - struct release { - int32_t ordflags; - unsigned long opencount; /* number of open()s before - final release() */ - unsigned long releasecount; /* number of release()s */ - } releasectx; -#define BR_STUB_REQUIRE_RELEASE_CBK 0x0E0EA0E + int info_sign; + struct list_head fd_list; /* list of open fds or fds participating in + write operations */ } br_stub_inode_ctx_t; +typedef struct br_stub_fd { + fd_t *fd; + struct list_head list; +} br_stub_fd_t; #define I_DIRTY (1<<0) /* inode needs writeback */ +#define I_MODIFIED (1<<1) #define WRITEBACK_DURABLE 1 /* writeback is durable */ /** @@ -60,12 +62,10 @@ typedef struct br_stub_local { uuid_t gfid; inode_t *inode; unsigned long version; - gf_boolean_t markdirty; } context; } u; } br_stub_local_t; -#define BR_STUB_FULL_VERSIONING (1<<0) #define BR_STUB_INCREMENTAL_VERSIONING (1<<1) typedef struct br_stub_private { @@ -96,16 +96,131 @@ __br_stub_is_inode_dirty (br_stub_inode_ctx_t *ctx) return (ctx->need_writeback & I_DIRTY); } +/* inode mofification markers */ +static inline void +__br_stub_set_inode_modified (br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback |= I_MODIFIED; +} + +static inline void +__br_stub_unset_inode_modified (br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback &= ~I_MODIFIED; +} + +static inline int +__br_stub_is_inode_modified (br_stub_inode_ctx_t *ctx) +{ + return (ctx->need_writeback & I_MODIFIED); +} + +br_stub_fd_t * +br_stub_fd_new (void) +{ + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = GF_CALLOC (1, sizeof (*br_stub_fd), + gf_br_stub_mt_br_stub_fd_t); + + return br_stub_fd; +} + +int +__br_stub_fd_ctx_set (xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd) +{ + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, br_stub_fd, out); + + value = (uint64_t)(long) br_stub_fd; + + ret = __fd_ctx_set (fd, this, value); + +out: + return ret; +} + +br_stub_fd_t * +__br_stub_fd_ctx_get (xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *br_stub_fd = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + ret = __fd_ctx_get (fd, this, &value); + if (ret) + return NULL; + + br_stub_fd = (br_stub_fd_t *) ((long) value); + +out: + return br_stub_fd; +} + +br_stub_fd_t * +br_stub_fd_ctx_get (xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *br_stub_fd = NULL; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + LOCK (&fd->lock); + { + br_stub_fd = __br_stub_fd_ctx_get (this, fd); + } + UNLOCK (&fd->lock); + +out: + return br_stub_fd; +} + +int32_t +br_stub_fd_ctx_set (xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, br_stub_fd, out); + + LOCK (&fd->lock); + { + ret = __br_stub_fd_ctx_set (this, fd, br_stub_fd); + } + UNLOCK (&fd->lock); + +out: + return ret; +} + static inline int -br_stub_require_release_call (xlator_t *this, fd_t *fd) +br_stub_require_release_call (xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx) { int32_t ret = 0; + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = br_stub_fd_new (); + if (!br_stub_fd) + return -1; + + br_stub_fd->fd = fd; + INIT_LIST_HEAD (&br_stub_fd->list); - ret = fd_ctx_set (fd, this, - (uint64_t)(long)BR_STUB_REQUIRE_RELEASE_CBK); + ret = br_stub_fd_ctx_set (this, fd, br_stub_fd); if (ret) gf_log (this->name, GF_LOG_WARNING, "could not set fd context (for release callback"); + else + *fd_ctx = br_stub_fd; + return ret; } @@ -122,7 +237,15 @@ static inline int br_stub_get_inode_ctx (xlator_t *this, inode_t *inode, uint64_t *ctx) { - return inode_ctx_get (inode, this, ctx); + int ret = -1; + + LOCK (&inode->lock); + { + ret = __br_stub_get_inode_ctx (this, inode, ctx); + } + UNLOCK (&inode->lock); + + return ret; } static inline int @@ -144,55 +267,31 @@ __br_stub_writeback_version (br_stub_inode_ctx_t *ctx) static inline void __br_stub_set_ongoing_version (br_stub_inode_ctx_t *ctx, unsigned long version) { - ctx->currentversion = version; -} - -static inline void -__br_stub_reset_release_counters (br_stub_inode_ctx_t *ctx) -{ - ctx->releasectx.ordflags = 0; - ctx->releasectx.opencount = 0; - ctx->releasectx.releasecount = 0; -} - -static inline void -__br_stub_track_release (br_stub_inode_ctx_t *ctx) -{ - ++ctx->releasectx.releasecount; -} - -static inline void -___br_stub_track_open (br_stub_inode_ctx_t *ctx) -{ - ++ctx->releasectx.opencount; -} - -static inline void -___br_stub_track_open_flags (fd_t *fd, br_stub_inode_ctx_t *ctx) -{ - ctx->releasectx.ordflags |= fd->flags; -} - -static inline void -__br_stub_track_openfd (fd_t *fd, br_stub_inode_ctx_t *ctx) -{ - ___br_stub_track_open (ctx); - ___br_stub_track_open_flags (fd, ctx); + if (ctx->currentversion < version) + ctx->currentversion = version; + else + gf_log ("bit-rot-stub", GF_LOG_WARNING, "current version: %lu" + "new version: %lu", ctx->currentversion, version); } static inline int __br_stub_can_trigger_release (inode_t *inode, - br_stub_inode_ctx_t *ctx, - unsigned long *version, int32_t *flags) + br_stub_inode_ctx_t *ctx, unsigned long *version) { - if (list_empty (&inode->fd_list) - && (ctx->releasectx.releasecount == ctx->releasectx.opencount)) { - if (flags) - *flags = htonl (ctx->releasectx.ordflags); + /** + * If the inode is modified, then it has to be dirty. An inode is + * marked dirty once version is increased. Its marked as modified + * when the modification call (write/truncate) which triggered + * the versioning is successful. + */ + if (__br_stub_is_inode_modified (ctx) + && list_empty (&ctx->fd_list) + && (ctx->info_sign != BR_SIGN_REOPEN_WAIT)) { + + GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0); + if (version) *version = htonl (ctx->currentversion); - - __br_stub_reset_release_counters (ctx); return 1; } @@ -261,11 +360,16 @@ static inline void br_stub_remove_vxattrs (dict_t *xattr) { if (xattr) { - dict_del (xattr, BITROT_OBJECT_BAD_KEY); dict_del (xattr, BITROT_CURRENT_VERSION_KEY); dict_del (xattr, BITROT_SIGNING_VERSION_KEY); dict_del (xattr, BITROT_SIGNING_XATTR_SIZE_KEY); } } +int32_t +br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx); + +br_sign_state_t +__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop, + fd_t *fd); #endif /* __BIT_ROT_STUB_H__ */ |