diff options
author | Raghavendra Bhat <raghavendra@redhat.com> | 2015-04-09 15:38:47 +0530 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2015-05-08 11:27:42 -0700 |
commit | c93c433a44770de931f837be179c5ccdba958cad (patch) | |
tree | f6c5f513e8b2ccadc3b53d73db609f731d54ba8b | |
parent | 680b3bf629f0fef038470baab62c6d6d8f5988ce (diff) |
features/bit-rot-stub: versioning of objects in write/truncate fop instead of open
* This patch brings in the changes where object versioning is done in write and
truncate fops instead of tracking them in open and create fops. This model
works for both regular and anonymous fds. It also removes the race associated
with open calls, create and lookups.
This patch follows the below method for object versioning and notifications:
Before sending writev on the fd, increase the ongoing
version first. This makes anonymous fd write similar to the regular
fd write by having the ongoing version increased before doing the
write.
Do following steps to do versioning:
1) For anonymous fds set the fd context (so that release is invoked) and add
the fd context to the list maintained in the inode context.
For regular fds the above thing would have been done in open itself.
2) Increase the on-disk ongoing version
3) Increase the in memory ongoing version and mark inode as non-dirty
4) Once versioning is successfully done send write operation. If
versioning fails, then fail the write fop.
5) In writev_cbk mark inode as modified.
Change-Id: I7104391bbe076d8fc49b68745d2ec29a6e92476c
BUG: 1207979
Signed-off-by: Raghavendra Bhat <raghavendra@redhat.com>
Reviewed-on: http://review.gluster.org/10233
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r-- | libglusterfs/src/changelog.h | 2 | ||||
-rw-r--r-- | tests/bitrot/br-stub.c | 47 | ||||
-rw-r--r-- | tests/bitrot/br-stub.t | 21 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/bitd/bit-rot.c | 138 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/bitd/bit-rot.h | 6 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-common.h | 7 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h | 3 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.c | 1070 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/stub/bit-rot-stub.h | 216 |
9 files changed, 1156 insertions, 354 deletions
diff --git a/libglusterfs/src/changelog.h b/libglusterfs/src/changelog.h index 08307810704..6f86e5a54cd 100644 --- a/libglusterfs/src/changelog.h +++ b/libglusterfs/src/changelog.h @@ -42,9 +42,9 @@ struct ev_release { }; struct ev_release_br { - int32_t flags; unsigned long version; unsigned char gfid[16]; + int32_t sign_info; }; struct ev_changelog { diff --git a/tests/bitrot/br-stub.c b/tests/bitrot/br-stub.c index 6cb12ed0a8e..e164170bb83 100644 --- a/tests/bitrot/br-stub.c +++ b/tests/bitrot/br-stub.c @@ -31,8 +31,10 @@ brstub_validate_version (char *bpath, unsigned long version) if (ret < 0) goto err; - if (xv->ongoingversion != version) + if (xv->ongoingversion != version) { match = -1; + fprintf (stderr, "ongoingversion: %lu\n", xv->ongoingversion); + } free (xv); return match; @@ -42,11 +44,12 @@ brstub_validate_version (char *bpath, unsigned long version) } int -brstub_open_validation (char *filp, char *bpath, unsigned long startversion) +brstub_write_validation (char *filp, char *bpath, unsigned long startversion) { int fd1 = 0; int fd2 = 0; int ret = 0; + char *string = "string\n"; /* read only check */ fd1 = open (filp, O_RDONLY); @@ -55,18 +58,37 @@ brstub_open_validation (char *filp, char *bpath, unsigned long startversion) close (fd1); ret = brstub_validate_version (bpath, startversion); - if (ret < 0) + if (ret == 0) goto err; - /* single open (write/) check */ fd1 = open (filp, O_RDWR); if (fd1 < 0) goto err; - close (fd1); + ret = write (fd1, string, strlen (string)); + if (ret <= 0) + goto err; + /** + * Fsync is done so that the write call has properly reached the + * disk. For fuse mounts write-behind xlator would have held the + * writes with itself and for nfs, client would have held the + * write in its cache. So write fop would not have triggered the + * versioning as it would have not reached the bit-rot-stub. 
+ */ + fsync (fd1); startversion++; ret = brstub_validate_version (bpath, startversion); + if (ret < 0) + goto err; + ret = write (fd1, string, strlen (string)); + if (ret <= 0) + goto err; + ret = brstub_validate_version (bpath, startversion); + if (ret < 0) + goto err; + + close (fd1); /* multi open (write/) check */ fd1 = open (filp, O_RDWR); if (fd1 < 0) @@ -74,13 +96,20 @@ brstub_open_validation (char *filp, char *bpath, unsigned long startversion) fd2 = open (filp, O_WRONLY); if (fd1 < 0) goto err; + + ret = write (fd1, string, strlen (string)); + if (ret <= 0) + goto err; + + ret = write (fd1, string, strlen (string)); + if (ret <= 0) + goto err; close (fd1); close (fd2); /** - * incremented once per open()/open().../close()/close() sequence + * incremented once per write()/write().../close()/close() sequence */ - startversion++; ret = brstub_validate_version (bpath, startversion); if (ret < 0) goto err; @@ -106,11 +135,11 @@ brstub_new_object_validate (char *filp, char *brick) printf ("Validating initial version..\n"); ret = brstub_validate_version (bpath, 1); - if (ret < 0) + if (ret == 0) goto err; printf ("Validating version on modifications..\n"); - ret = brstub_open_validation (filp, bpath, 1); + ret = brstub_write_validation (filp, bpath, 1); if (ret < 0) goto err; diff --git a/tests/bitrot/br-stub.t b/tests/bitrot/br-stub.t index 11d02418785..bab4c7cdbd1 100644 --- a/tests/bitrot/br-stub.t +++ b/tests/bitrot/br-stub.t @@ -2,6 +2,7 @@ . $(dirname $0)/../include.rc . $(dirname $0)/../volume.rc +. $(dirname $0)/../nfs.rc STUB_SOURCE=$(dirname $0)/br-stub.c STUB_EXEC=$(dirname $0)/br-stub @@ -17,9 +18,9 @@ EXPECT "$V0" volinfo_field $V0 'Volume Name'; EXPECT 'Created' volinfo_field $V0 'Status'; EXPECT '2' brick_count $V0 -## Turn off open-behind (stub does not work with anonfd yet..) 
-TEST $CLI volume set $V0 performance.open-behind off -EXPECT 'off' volinfo_field $V0 'performance.open-behind' +## Turn off write-behind (write-behind clubs writes together) +TEST $CLI volume set $V0 performance.write-behind off +#EXPECT 'off' volinfo_field $V0 'performance.open-behind' ## Start the volume TEST $CLI volume start $V0; @@ -27,6 +28,7 @@ EXPECT 'Started' volinfo_field $V0 'Status'; ## Mount the volume TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0; +TEST mount_nfs $H0:/$V0 $N0 nolock; ## Build stub C source build_tester $STUB_SOURCE -o $STUB_EXEC -I$(dirname $0)/../../xlators/features/bit-rot/src/stub @@ -34,11 +36,20 @@ TEST [ -e $STUB_EXEC ] ## create & check version fname="$M0/filezero" -touch $fname +touch $fname; backpath=$(get_backend_paths $fname) + +TEST $STUB_EXEC $fname $(dirname $backpath) + +rm -f $fname; + +## test nfs +fname="$N0/filezero" +touch $fname; # backpath remains same.. + TEST $STUB_EXEC $fname $(dirname $backpath) -## cleanups.. +##cleanups.. 
rm -f $STUB_EXEC cleanup; diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c index 5638b0f348b..61d461f897b 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot.c +++ b/xlators/features/bit-rot/src/bitd/bit-rot.c @@ -171,11 +171,11 @@ bitd_is_bad_file (xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd) if (fd) ret = syncop_fgetxattr (child->xl, fd, &xattr, - BITROT_OBJECT_BAD_KEY, NULL, + "trusted.glusterfs.bad-file", NULL, NULL); else if (loc) ret = syncop_getxattr (child->xl, loc, &xattr, - BITROT_OBJECT_BAD_KEY, NULL, + "trusted.glusterfs.bad-file", NULL, NULL); if (!ret) { @@ -484,6 +484,98 @@ br_log_object_path (xlator_t *this, char *op, op, path, strerror (op_errno)); } +static void +br_send_dummy_write (xlator_t *this, fd_t *fd, br_child_t *child, + dict_t *xdata) +{ + struct iovec iov = {0, }; + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + char *msg = NULL; + size_t size = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, child, out); + + msg = gf_strdup ("GLUSTERFS"); + if (!msg) + goto out; + + size = strlen (msg); + + iov.iov_base = msg; + iov.iov_len = size; + + iobref = iobref_new (); + if (!iobref) + goto free_msg; + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) + goto free_iobref; + + iobref_add (iobref, iobuf); + + iov_unload (iobuf_ptr (iobuf), &iov, 1); /* FIXME!!! 
*/ + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = size; + + ret = syncop_writev (child->xl, fd, &iov, 1, 0, iobref, 0, xdata, NULL); + if (ret <= 0) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "dummy write failed (%s)", strerror (errno)); + goto free_iobuf; + } + + /* iobref_unbref() takes care of iobuf unref */ + ret = 0; + + free_iobuf: + iobuf_unref (iobuf); + free_iobref: + iobref_unref (iobref); + free_msg: + GF_FREE (msg); + out: + return; +} + +static void +br_object_handle_reopen (xlator_t *this, + br_object_t *object, inode_t *linked_inode) +{ + int32_t ret = -1; + dict_t *dict = NULL; + loc_t loc = {0, }; + + /** + * Here dict is purposefully not checked for NULL, because at any cost + * sending a re-open should not be missed. This re-open is an indication + * for the stub to properly mark inode's status. + */ + dict = dict_new (); + if (dict) { + /* TODO: Make it a #define */ + ret = dict_set_int32 (dict, "br-fd-reopen", 1); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Object reopen would trigger versioning."); + } + + loc.inode = inode_ref (linked_inode); + gf_uuid_copy (loc.gfid, linked_inode->gfid); + + br_trigger_sign (this, object->child, linked_inode, &loc, dict); + + if (dict) + dict_unref (dict); + loc_wipe (&loc); +} + /** * Sign a given object. This routine runs full throttle. There needs to be * some form of priority scheduling and/or read burstness to avoid starving @@ -497,6 +589,7 @@ static inline int32_t br_sign_object (br_object_t *object) fd_t *fd = NULL; struct iatt iatt = {0, }; pid_t pid = GF_CLIENT_PID_BITD; + br_sign_state_t sign_info = BR_SIGN_NORMAL; GF_VALIDATE_OR_GOTO ("bit-rot", object, out); @@ -515,6 +608,20 @@ static inline int32_t br_sign_object (br_object_t *object) goto out; } + /* sanity check */ + sign_info = ntohl (object->sign_info); + GF_ASSERT (sign_info != BR_SIGN_NORMAL); + + /** + * For fd's that have notified for reopening, we send an explicit + * open() followed by a dummy write() call. 
This triggers the + * actual signing of the object. + */ + if (sign_info == BR_SIGN_REOPEN_WAIT) { + br_object_handle_reopen (this, object, linked_inode); + goto unref_inode; + } + ret = br_object_open (this, object, linked_inode, &fd); if (!fd) { br_log_object (this, "open", object->gfid, -ret); @@ -648,6 +755,7 @@ br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev) /* NOTE: it's BE, but no worry */ object->signedversion = ev->u.releasebr.version; + object->sign_info = ev->u.releasebr.sign_info; out: return object; @@ -693,7 +801,6 @@ br_brick_callback (void *xl, char *brick, xlator_t *this = NULL; br_object_t *object = NULL; br_child_t *child = NULL; - int32_t flags = 0; struct gf_tw_timer_list *timer = NULL; this = xl; @@ -710,14 +817,6 @@ br_brick_callback (void *xl, char *brick, gf_log (this->name, GF_LOG_DEBUG, "RELEASE EVENT [GFID %s]", uuid_utoa (gfid)); - flags = (int32_t)ntohl (ev->u.releasebr.flags); - if (flags == O_RDONLY) { - gf_log (this->name, GF_LOG_DEBUG, - "Read only fd [GFID: %s], ignoring signing..", - uuid_utoa (gfid)); - goto out; - } - child = br_get_child_from_brick_path (this, brick); if (!child) { gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume " @@ -804,12 +903,15 @@ out: return need_sign; } -static inline void +void br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, - loc_t *loc) + loc_t *loc, dict_t *xdata) { fd_t *fd = NULL; int32_t ret = -1; + pid_t pid = GF_CLIENT_PID_BITD; + + syncopctx_setfspid (&pid); fd = fd_create (linked_inode, 0); if (!fd) { @@ -828,8 +930,10 @@ br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, fd_bind (fd); } - if (fd) + if (fd) { + br_send_dummy_write (this, fd, child, xdata); syncop_close (fd); + } out: return; @@ -972,7 +1076,7 @@ bitd_oneshot_crawl (xlator_t *subvol, gf_log (this->name, GF_LOG_INFO, "Triggering signing for %s [GFID: %s | Brick: %s]", loc.path, uuid_utoa (linked_inode->gfid), child->brick_path); - 
br_trigger_sign (this, child, linked_inode, &loc); + br_trigger_sign (this, child, linked_inode, &loc, NULL); ret = 0; @@ -1600,7 +1704,9 @@ struct xlator_cbks cbks; struct volume_options options[] = { { .key = {"expiry-time"}, .type = GF_OPTION_TYPE_INT, - .default_value = "120", + /* Let the default timer be half the value of the wait time for + * sining (which is 120 as of now) */ + .default_value = "60", .description = "default time duration for which an object waits " "before it is signed", }, diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h index 66515e3213c..bbaf86fa65f 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot.h +++ b/xlators/features/bit-rot/src/bitd/bit-rot.h @@ -152,6 +152,8 @@ struct br_object { be signed */ br_child_t *child; /* object's subvolume */ + int sign_info; + struct list_head list; /* hook to add to the queue once the object is expired from timer wheel */ void *data; @@ -175,4 +177,8 @@ br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *); gf_boolean_t bitd_is_bad_file (xlator_t *, br_child_t *, loc_t *, fd_t *); +void +br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, + loc_t *loc, dict_t *xdata); + #endif /* __BIT_ROT_H__ */ diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h index 699323170d3..7fd584e5970 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-common.h +++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h @@ -33,6 +33,13 @@ typedef enum br_vxattr_state { BR_VXATTR_STATUS_INVALID = 3, } br_vxattr_status_t; +typedef enum br_sign_state { + BR_SIGN_INVALID = -1, + BR_SIGN_NORMAL = 0, + BR_SIGN_REOPEN_WAIT = 1, + BR_SIGN_QUICK = 2, +} br_sign_state_t; + static inline br_vxattr_status_t br_version_xattr_state (dict_t *xattr, br_version_t **obuf, br_signature_t **sbuf) diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h 
b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h index 46271407219..9f6da89032f 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h @@ -28,7 +28,8 @@ enum br_mem_types { gf_br_mt_br_tbf_opspec_t, gf_br_mt_br_scrubber_t, gf_br_mt_br_fsscan_entry_t, - gf_br_stub_mt_end + gf_br_stub_mt_br_stub_fd_t, + gf_br_stub_mt_end, }; #endif diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c index f9c3886948a..93db072f671 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c @@ -198,14 +198,15 @@ br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode, if (!ctx) goto error_return; + INIT_LIST_HEAD (&ctx->fd_list); (markdirty) ? __br_stub_mark_inode_dirty (ctx) : __br_stub_mark_inode_synced (ctx); __br_stub_set_ongoing_version (ctx, version); - __br_stub_reset_release_counters (ctx); if (fd) { - br_stub_require_release_call (this, fd); - __br_stub_track_openfd (fd, ctx); + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) + goto free_ctx; } ret = br_stub_set_inode_ctx (this, inode, ctx); if (ret) @@ -238,7 +239,6 @@ br_stub_mod_inode_versions (xlator_t *this, __br_stub_mark_inode_synced (ctx); } - __br_stub_track_openfd (fd, ctx); ret = 0; } unblock: @@ -250,19 +250,16 @@ br_stub_mod_inode_versions (xlator_t *this, static inline void br_stub_fill_local (br_stub_local_t *local, call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid, - int versioningtype, unsigned long memversion, int dirty) + int versioningtype, unsigned long memversion) { local->fopstub = stub; local->versioningtype = versioningtype; local->u.context.version = memversion; - if (fd) + if (fd && !local->u.context.fd) local->u.context.fd = fd_ref (fd); if (inode) local->u.context.inode = inode_ref (inode); gf_uuid_copy (local->u.context.gfid, gfid); - - /* mark inode 
dirty/fresh according to durability */ - local->u.context.markdirty = (dirty) ? _gf_true : _gf_false; } static inline void @@ -279,57 +276,13 @@ br_stub_cleanup_local (br_stub_local_t *local) inode_unref (local->u.context.inode); local->u.context.inode = NULL; } - local->u.context.markdirty = _gf_true; memset (local->u.context.gfid, '\0', sizeof (uuid_t)); } /** - * callback for inode/fd full versioning + * callback for inode/fd versioning */ int -br_stub_inode_fullversioning_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) -{ - fd_t *fd = NULL; - inode_t *inode = NULL; - unsigned long version = 0; - gf_boolean_t dirty = _gf_true; - br_stub_local_t *local = NULL; - - local = (br_stub_local_t *)frame->local; - - /* be graceful to EEXIST */ - if ((op_ret < 0) && (op_errno == EEXIST)) { - op_ret = 0; - goto done; - } - - if (op_ret < 0) - goto done; - - fd = local->u.context.fd; - inode = local->u.context.inode; - version = local->u.context.version; - dirty = local->u.context.markdirty; - - op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty); - if (op_ret < 0) - op_errno = EINVAL; - - done: - frame->local = NULL; - if (op_ret < 0) - call_unwind_error (local->fopstub, op_ret, op_errno); - else - call_resume (local->fopstub); - br_stub_cleanup_local (local); - br_stub_dealloc_local (local); - - return 0; -} - -int br_stub_fd_incversioning_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) @@ -351,14 +304,14 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame, op_errno = EINVAL; done: - frame->local = NULL; - if (op_ret < 0) + if (op_ret < 0) { + frame->local = NULL; call_unwind_error (local->fopstub, -1, op_errno); - else + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + } else { call_resume (local->fopstub); - br_stub_cleanup_local (local); - br_stub_dealloc_local (local); - + } return 0; } @@ -366,28 +319,27 @@ 
br_stub_fd_incversioning_cbk (call_frame_t *frame, * Initial object versioning * * Version persists two (2) extended attributes as explained below: - * 1. Current (ongoing) version: This is incremented on an open() - * or creat() and is the running version for an object. + * 1. Current (ongoing) version: This is incremented on an writev () + * or truncate () and is the running version for an object. * 2. Signing version: This is the version against which an object * was signed (checksummed). * * During initial versioning, both ongoing and signing versions are - * set of one and zero respectively. An open() call increments the + * set of one and zero respectively. A write() call increments the * ongoing version as an indication of modification to the object. * Additionally this needs to be persisted on disk and needs to be * durable: fsync().. :-/ - * As an optimization only the first open() synchronizes the ongoing - * version to disk, subsequent open()s before the *last* release() + * As an optimization only the first write() synchronizes the ongoing + * version to disk, subsequent write()s before the *last* release() * are no-op's. * * create(), just like lookup() initializes the object versions to - * the default, but persists the version to disk. As an optimization - * this is not a durable operation: in case of a crash, hard reboot - * etc.. absence of versioning xattrs is ignored in scrubber along - * with the one time crawler explicitly triggering signing for such - * objects. + * the default. As an optimization this is not a durable operation: + * in case of a crash, hard reboot etc.. absence of versioning xattrs + * is ignored in scrubber along with the one time crawler explicitly + * triggering signing for such objects. * - * c.f. br_stub_open_cbk() / br_stub_create_cbk() + * c.f. 
br_stub_writev() / br_stub_truncate() */ /** @@ -400,7 +352,7 @@ int br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, call_stub_t *stub, dict_t *dict, fd_t *fd, br_stub_version_cbk *callback, unsigned long memversion, - int versioningtype, int durable, int dirty) + int versioningtype, int durable) { int32_t ret = -1; int flags = 0; @@ -421,18 +373,11 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, goto dealloc_xdata; } - local = br_stub_alloc_local (this); - if (!local) { - ret = -1; - goto dealloc_xdata; - } - - if (versioningtype == BR_STUB_FULL_VERSIONING) - flags |= XATTR_CREATE; + local = frame->local; br_stub_fill_local (local, stub, fd, fd->inode, fd->inode->gfid, - versioningtype, memversion, dirty); + versioningtype, memversion); frame->local = local; STACK_WIND (frame, callback, @@ -448,82 +393,21 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame, } static inline int -br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame, - call_stub_t *stub, fd_t *fd) -{ - int32_t ret = -1; - dict_t *dict = NULL; - br_version_t *obuf = NULL; - int op_errno = 0; - - op_errno = ENOMEM; - dict = dict_new (); - if (!dict) - goto done; - ret = br_stub_alloc_versions (&obuf, NULL, 0); - if (ret) - goto dealloc_dict; - - op_errno = EINVAL; - ret = br_stub_prepare_version_request (this, dict, obuf, - BITROT_DEFAULT_CURRENT_VERSION); - if (ret) - goto dealloc_versions; - - /** - * Version extended attributes need not be durable at this point of - * time. If the objects (inode) data gets persisted on disk but the - * version extended attributes are lost due to a crash/power failure, - * a subsequent lookup marks the objects signature as stale. This way, - * dentry operation times do not shoot up. 
- */ - ret = br_stub_fd_versioning (this, frame, stub, dict, fd, - br_stub_inode_fullversioning_cbk, - BITROT_DEFAULT_CURRENT_VERSION, - BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0); - - dealloc_versions: - br_stub_dealloc_versions (obuf); - dealloc_dict: - dict_unref (dict); - done: - if (ret) - call_unwind_error (stub, -1, op_errno); - return ret; -} - -static inline int br_stub_perform_incversioning (xlator_t *this, call_frame_t *frame, call_stub_t *stub, fd_t *fd, br_stub_inode_ctx_t *ctx) { - int32_t ret = -1; - dict_t *dict = NULL; - inode_t *inode = NULL; - br_version_t *obuf = NULL; - unsigned long writeback_version = 0; - int op_errno = 0; - - inode = fd->inode; + int32_t ret = -1; + dict_t *dict = NULL; + br_version_t *obuf = NULL; + unsigned long writeback_version = 0; + int op_errno = 0; + br_stub_local_t *local = NULL; op_errno = EINVAL; - ret = br_stub_require_release_call (this, fd); - if (ret) - goto done; - - LOCK (&inode->lock); - { - if (__br_stub_is_inode_dirty (ctx)) - writeback_version = __br_stub_writeback_version (ctx); - else - __br_stub_track_openfd (fd, ctx); - } - UNLOCK (&inode->lock); + local = frame->local; - if (!writeback_version) { - ret = 0; - goto done; - } + writeback_version = __br_stub_writeback_version (ctx); /* inode requires writeback to disk */ op_errno = ENOMEM; @@ -541,17 +425,23 @@ br_stub_perform_incversioning (xlator_t *this, ret = br_stub_fd_versioning (this, frame, stub, dict, fd, br_stub_fd_incversioning_cbk, writeback_version, - BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0); + BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE); dealloc_versions: br_stub_dealloc_versions (obuf); dealloc_dict: dict_unref (dict); done: - if (!ret && !writeback_version) - call_resume (stub); - if (ret) + if (ret) { + if (local) + frame->local = NULL; call_unwind_error (stub, -1, op_errno); + if (local) { + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + } + } + return ret; } @@ -560,6 +450,44 @@ 
br_stub_perform_incversioning (xlator_t *this, /* fsetxattr() */ static inline int +br_stub_compare_sign_version (xlator_t *this, inode_t *inode, + br_signature_t *sbuf, dict_t *dict) +{ + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t tmp_ctx = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, sbuf, out); + GF_VALIDATE_OR_GOTO (this->name, dict, out); + + ret = br_stub_get_inode_ctx (this, inode, &tmp_ctx); + if (ret) { + dict_del (dict, BITROT_SIGNING_VERSION_KEY); + goto out; + } + + ret = -1; + ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx; + + LOCK (&inode->lock); + { + if (ctx->currentversion == sbuf->signedversion) + ret = 0; + else + gf_log (this->name, GF_LOG_WARNING, "current version " + "%lu and version of the signature %lu are not " + "same", ctx->currentversion, + sbuf->signedversion); + } + UNLOCK (&inode->lock); + +out: + return ret; +} + +static inline int br_stub_prepare_signature (xlator_t *this, dict_t *dict, inode_t *inode, br_isignature_t *sign) { @@ -577,6 +505,11 @@ br_stub_prepare_signature (xlator_t *this, dict_t *dict, ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen); if (ret) goto dealloc_versions; + + ret = br_stub_compare_sign_version (this, inode, sbuf, dict); + if (ret) + goto dealloc_versions; + return 0; dealloc_versions: @@ -620,6 +553,8 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this, if (ret) goto unwind; + gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu", + sign->signedversion); wind: STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd, @@ -865,77 +800,598 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this, return 0; } -/** }}} */ +/** + * The first write response on the first fd in the list of fds will set + * the flag to indicate that the inode is modified. 
The subsequent write + * respnses coming on either the first fd or some other fd will not change + * the fd. The inode-modified flag is unset only upon release of all the + * fds. + */ +int32_t +br_stub_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } + if (op_ret < 0) + goto unwind; -/** {{{ */ + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) + goto unwind; -/* open() */ + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; -int -br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, fd_t *fd, dict_t *xdata) + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +/** + * Ongoing version is increased only for the first modify operation. + * First modify version means the first write or truncate call coming on the + * first fd in the list of inodes. 
+ * For anonymous fds open would not have come, so check if its the first write + * by doing both inode dirty check and ensuring list of fds is empty + */ +static inline gf_boolean_t +br_stub_inc_version (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) { - int32_t ret = 0; - uint64_t ctx_addr = 0; - br_stub_inode_ctx_t *ctx = NULL; - call_stub_t *stub = NULL; + gf_boolean_t inc_version = _gf_false; + + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + LOCK (&fd->inode->lock); + { + if (__br_stub_is_inode_dirty (ctx)) + inc_version = _gf_true; + } + UNLOCK (&fd->inode->lock); + +out: + return inc_version; +} + +/** + * Since NFS does not do open, writes from NFS are sent over an anonymous + * fd. It means each write fop might come on a different anonymous fd and + * will lead to very large number of notifications being sent. It might + * affect the perfromance as, there will too many sign requests. + * To avoid that whenever the last fd released from an inode (logical release) + * is an anonymous fd the release notification is sent with a flag being set + * __br_stub_anon_release (ctx); + * BitD checks for the flag and if set, it will send a dummy write request + * (again on an anonymous fd) instead of triggering sign. + * Bit-rot-stub should identify such dummy writes and should send success to + * them instead of winding them downwards. + */ +gf_boolean_t +br_stub_dummy_write (call_frame_t *frame) +{ + return (frame->root->pid == GF_CLIENT_PID_BITD) + ? 
_gf_true : _gf_false; +} + +int32_t +br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = br_stub_fd_ctx_get (this, fd); + if (!br_stub_fd) { + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to " + "add fd to the inode (gfid: %s)", + uuid_utoa (fd->inode->gfid)); + goto out; + } + } + + ret = 0; + +out: + return ret; +} + +int32_t +br_stub_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + if (frame->root->pid == GF_CLIENT_PID_BITD) + br_stub_writev_cbk (frame, NULL, this, vector->iov_len, 0, + NULL, NULL, NULL); + else + STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdata); + return 0; +} + +/** + TODO: If possible add pictorial represention of below comment. + + Before sending writev on the ANONYMOUS FD, increase the ongoing + version first. This brings anonymous fd write closer to the regular + fd write by having the ongoing version increased before doing the + write (In regular fd, after open the ongoing version is incremented). + Do following steps to handle writes on anonymous fds: + 1) Increase the on-disk ongoing version + 2) Once versioning is successfully done send write operation. If versioning + fails, then fail the write fop. + 3) In writev_cbk do below things: + a) Increase in-memory version + b) set the fd context (so that br_stub_release is invoked) + c) add the fd to the list of fds maintained in the inode context of + bitrot-stub. 
+ d) Mark inode as non dirty + e) Mard inode as modified (in the inode context) +**/ +int32_t +br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset, + uint32_t flags, struct iobref *iobref, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd_ref (fd); + frame->local = local; + + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the inode %s", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + if (fd_is_anonymous (fd)) { + ret = br_stub_anon_fd_ctx (this, fd, ctx); + if (ret) + goto unwind; + } + + /* TODO: Better to do a dummy fsetxattr instead of write. Keep write + simple */ + if (br_stub_dummy_write (frame)) { + LOCK (&fd->inode->lock); + { + (void) __br_stub_inode_sign_state + (ctx, GF_FOP_WRITE, fd); + } + UNLOCK (&fd->inode->lock); + + if (xdata && dict_get (xdata, "br-fd-reopen")) { + op_ret = vector->iov_len; + op_errno = 0; + goto unwind; + } + } + + /** + * Check whether this is the first write on this inode since the last + * sign notification has been sent. If so, do versioning. Otherwise + * go ahead with the fop. 
+ */ + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; + + /* Create the stub for the write fop */ + stub = fop_writev_stub (frame, br_stub_writev_resume, fd, vector, count, + offset, flags, iobref, xdata); + + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "write fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + +wind: + STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, offset, + flags, iobref, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } if (op_ret < 0) goto unwind; - if (cookie != (void *) BR_STUB_REQUEST_COOKIE) + + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) goto unwind; + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_ftruncate_resume (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t 
offset, dict_t *xdata) +{ + STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; +} + +int32_t +br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd_ref (fd); + frame->local = local; + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); - if (ret < 0) + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the inode %s", + uuid_utoa (fd->inode->gfid)); goto unwind; + } + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + if (fd_is_anonymous (fd)) { + ret = br_stub_anon_fd_ctx (this, fd, ctx); + if (ret) + goto unwind; + } - stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata); + /** + * c.f. 
br_stub_writev() + */ + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; + + /* Create the stub for the ftruncate fop */ + stub = fop_ftruncate_stub (frame, br_stub_ftruncate_resume, fd, offset, + xdata); if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "ftruncate fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + +wind: + STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_local_t *local = NULL; + + if (frame->local) { + local = frame->local; + frame->local = NULL; + } + + if (op_ret < 0) + goto unwind; + + ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode, + &ctx_addr); + if (ret < 0) + goto unwind; + + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + + /* Mark the flag to indicate the inode has been modified */ + LOCK (&local->u.context.fd->inode->lock); + { + if (!__br_stub_is_inode_modified (ctx)) + __br_stub_set_inode_modified (ctx); + } + UNLOCK (&local->u.context.fd->inode->lock); + + +unwind: + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); + return 0; +} + +int32_t +br_stub_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + STACK_WIND (frame, br_stub_ftruncate_cbk, 
FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; +} + +/** + * Bit-rot-stub depends heavily on the fd based operations for doing + * versioning and sending notification. It starts tracking the operation + * upon getting first fd based modify operation by doing versioning and + * sends notification when last fd using which the inode was modified is + * released. + * But for truncate there is no fd and hence it becomes difficult to do + * the versioning and send notification. It is handled by doing versioning + * on an anonymous fd. The fd will be valid till the completion of the + * truncate call. It guarantees that release on this anonymous fd will happen + * after the truncate call and notification is sent after the truncate call. + */ +int32_t +br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, + off_t offset, dict_t *xdata) +{ + br_stub_local_t *local = NULL; + call_stub_t *stub = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + gf_boolean_t inc_version = _gf_false; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + int32_t ret = -1; + fd_t *fd = NULL; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, frame, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind); + + fd = fd_anonymous (loc->inode); + if (!fd) { + gf_log (this->name, GF_LOG_ERROR, "failed to create anonymous " + "fd for the inode %s", uuid_utoa (loc->inode->gfid)); + goto unwind; + } + + local = br_stub_alloc_local (this); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, "local allocation failed " + "(gfid: %s)", uuid_utoa (loc->inode->gfid)); op_ret = -1; - op_errno = EINVAL; + op_errno = ENOMEM; + goto unwind; + } + + local->u.context.fd = fd; + frame->local = local; + + ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + 
"context for the inode %s", + uuid_utoa (fd->inode->gfid)); goto unwind; } + ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; + ret = br_stub_anon_fd_ctx (this, local->u.context.fd, ctx); + if (ret) + goto unwind; + /** - * Ongoing version needs to be incremented. If the inode is not dirty, - * things are simple: increment the ongoing version safely and be done. - * If inode is dirty, a writeback to disk is required. This is tricky in - * case of multiple open()'s as ongoing version needs to be incremented - * on a successful writeback. It's probably safe to remember the ongoing - * version before writeback and *assigning* it in the callback, but that - * may lead to a trustable checksum to be treated as stale by scrubber - * (the case where the in-memory ongoing version is lesser than the - * on-disk version). Therefore, *all* open() calls (which might have - * come in parallel) try to synchronize the next ongoing version to - * disk. In the callback path, the winner marks the inode as synced - * therby loosing open() calls become no-op's. + * c.f. 
br_stub_writev() */ - ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; - return br_stub_perform_incversioning (this, frame, stub, fd, ctx); + inc_version = br_stub_inc_version (this, fd, ctx); + if (!inc_version) + goto wind; - unwind: - STACK_UNWIND_STRICT (open, frame, - op_ret, op_errno, fd, xdata); + /* Create the stub for the truncate fop */ + stub = fop_truncate_stub (frame, br_stub_truncate_resume, loc, offset, + xdata); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for " + "truncate fop (gfid: %s), unwinding", + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + /* Perform Versioning */ + return br_stub_perform_incversioning (this, frame, stub, + local->u.context.fd, ctx); + +wind: + STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); + return 0; + +unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, NULL, NULL, + NULL); + br_stub_cleanup_local (local); + br_stub_dealloc_local (local); return 0; } +/** }}} */ + + +/** {{{ */ + +/* open() */ + +/** + * It's probably worth mentioning a bit about why some of the housekeeping + * work is done in open() call path, rather than the callback path. + * Two (or more) open()'s in parallel can race and lead to a situation + * where a release() gets triggered (possibly after a series of write() + * calls) when *other* open()'s have still not reached callback path + * thereby having an active fd on an inode that is in process of getting + * signed with the current version. + * + * Maintaining fd list in the call path ensures that a release() would + * not be triggered if an open() call races ahead (followed by a close()) + * thereby finding non-empty fd list. 
+ */ + int br_stub_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata) { - void *cookie = NULL; + int32_t ret = -1; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t ctx_addr = 0; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind); - if (!flags) - goto wind; if (frame->root->pid == GF_CLIENT_PID_SCRUB) goto wind; - cookie = (void *) BR_STUB_REQUEST_COOKIE; - wind: - STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie, - FIRST_CHILD (this), FIRST_CHILD (this)->fops->open, - loc, flags, fd, xdata); + ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to get the inode " + "context for the file %s (gfid: %s)", loc->path, + uuid_utoa (fd->inode->gfid)); + goto unwind; + } + + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + if (flags == O_RDONLY) + goto wind; + + ret = br_stub_add_fd_to_inode (this, fd, ctx); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed add fd to the list " + "(gfid: %s)", uuid_utoa (fd->inode->gfid)); + goto unwind; + } + +wind: + STACK_WIND (frame, default_open_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata); + return 0; +unwind: + STACK_UNWIND_STRICT (open, frame, -1, EINVAL, NULL, NULL); return 0; } @@ -946,39 +1402,60 @@ br_stub_open (call_frame_t *frame, xlator_t *this, /* creat() */ +/** + * This routine registers a release callback for the given fd and adds the + * fd to the inode context fd tracking list. 
+ */ +int32_t +br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx) +{ + int32_t ret = -1; + br_stub_fd_t *br_stub_fd = NULL; + + ret = br_stub_require_release_call (this, fd, &br_stub_fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "failed to set the fd " + "context for the file (gfid: %s)", + uuid_utoa (fd->inode->gfid)); + goto out; + } + + LOCK (&fd->inode->lock); + { + list_add_tail (&ctx->fd_list, &br_stub_fd->list); + } + UNLOCK (&fd->inode->lock); + + ret = 0; + +out: + return ret; +} + int br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int32_t ret = 0; - uint64_t ctx_addr = 0; - call_stub_t *stub = NULL; - br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = 0; + uint64_t ctx_addr = 0; + br_stub_inode_ctx_t *ctx = NULL; + unsigned long version = BITROT_DEFAULT_CURRENT_VERSION; if (op_ret < 0) goto unwind; - stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode, - stbuf, preparent, postparent, xdata); - if (!stub) { - op_ret = -1; - op_errno = EINVAL; - goto unwind; - } - ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = (br_stub_inode_ctx_t *) (long) ctx_addr; - - /* see comment in br_stub_open_cbk().. */ - return (ctx) - ? 
br_stub_perform_incversioning (this, frame, stub, fd, ctx) - : br_stub_perform_fullversioning (this, frame, stub, fd); + if (ret < 0) { + ret = br_stub_init_inode_versions (this, fd, inode, version, + _gf_true); + } else { + ctx = (br_stub_inode_ctx_t *)(long)ctx_addr; + ret = br_stub_add_fd_to_inode (this, fd, ctx); + } - unwind: +unwind: STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, postparent, xdata); return 0; @@ -989,10 +1466,20 @@ br_stub_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind); + STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, loc, flags, mode, umask, fd, xdata); return 0; +unwind: + STACK_UNWIND_STRICT (create, frame, -1, EINVAL, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; } /** }}} */ @@ -1011,21 +1498,11 @@ br_stub_lookup_version (xlator_t *this, * out the correct version to use in the inode context (start with * the default version if unavailable). As of now versions are not * persisted on-disk. The inode is marked dirty, so that the first - * operation (such as open(), etc..) would trigger synchronization - * to disk. + * operation (such as write(), etc..) triggers synchronization to + * disk. */ status = br_version_xattr_state (xattr, &obuf, &sbuf); - /** - * stub does not know how to handle presence of signature but not - * the object version, therefore, in such cases, bail out.. - */ - if (status == BR_VXATTR_STATUS_INVALID) { - gf_log (this->name, GF_LOG_ERROR, "Invalid versioning xattrs. 
" - "Bailing out [GFID: %s]", uuid_utoa (gfid)); - return -1; - } - version = ((status == BR_VXATTR_STATUS_FULL) || (status == BR_VXATTR_STATUS_UNSIGNED)) ? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION; @@ -1259,8 +1736,8 @@ br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this, } static inline void -br_stub_send_ipc_fop (xlator_t *this, - fd_t *fd, unsigned long releaseversion, int32_t flags) +br_stub_send_ipc_fop (xlator_t *this, fd_t *fd, unsigned long releaseversion, + int sign_info) { int32_t op = 0; int32_t ret = 0; @@ -1269,8 +1746,8 @@ br_stub_send_ipc_fop (xlator_t *this, changelog_event_t ev = {0,}; ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE; - ev.u.releasebr.flags = flags; ev.u.releasebr.version = releaseversion; + ev.u.releasebr.sign_info = sign_info; gf_uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid); xdata = dict_new (); @@ -1305,14 +1782,67 @@ br_stub_send_ipc_fop (xlator_t *this, return; } +/** + * This is how the state machine of sign info works: + * 3 states: + * 1) BR_SIGN_NORMAL => The default State of the inode + * 2) BR_SIGN_REOPEN_WAIT => A release has been sent and is waiting for reopen + * 3) BR_SIGN_QUICK => reopen has happened and this release should trigger sign + * 2 events: + * 1) GF_FOP_RELEASE + * 2) GF_FOP_WRITE (actually a dummy write fro BitD) + * + * This is how states are changed based on events: + * EVENT: GF_FOP_RELEASE: + * if (state == BR_SIGN_NORMAL) ; then + * set state = BR_SIGN_REOPEN_WAIT; + * if (state == BR_SIGN_QUICK); then + * set state = BR_SIGN_NORMAL; + * EVENT: GF_FOP_WRITE: + * if (state == BR_SIGN_REOPEN_WAIT); then + * set state = BR_SIGN_QUICK; + */ +br_sign_state_t +__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx, + glusterfs_fop_t fop, fd_t *fd) +{ + br_sign_state_t sign_info = BR_SIGN_INVALID; + + switch (fop) { + + case GF_FOP_WRITE: + sign_info = ctx->info_sign = BR_SIGN_QUICK; + break; + + case GF_FOP_RELEASE: + GF_ASSERT (ctx->info_sign != BR_SIGN_REOPEN_WAIT); + + if 
(ctx->info_sign == BR_SIGN_NORMAL) { + sign_info = ctx->info_sign = BR_SIGN_REOPEN_WAIT; + } else { + sign_info = ctx->info_sign; + ctx->info_sign = BR_SIGN_NORMAL; + } + + break; + default: + break; + } + + return sign_info; +} + int32_t br_stub_release (xlator_t *this, fd_t *fd) { - int32_t ret = 0; - int32_t flags = 0; - inode_t *inode = NULL; - unsigned long releaseversion = 0; - br_stub_inode_ctx_t *ctx = NULL; + int32_t ret = 0; + int32_t flags = 0; + inode_t *inode = NULL; + unsigned long releaseversion = 0; + br_stub_inode_ctx_t *ctx = NULL; + uint64_t tmp = 0; + br_stub_fd_t *br_stub_fd = NULL; + int32_t signinfo = 0; inode = fd->inode; @@ -1321,12 +1851,23 @@ br_stub_release (xlator_t *this, fd_t *fd) ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL); if (ctx == NULL) goto unblock; - __br_stub_track_release (ctx); + br_stub_fd = br_stub_fd_ctx_get (this, fd); + if (br_stub_fd) { + list_del_init (&br_stub_fd->list); + } + ret = __br_stub_can_trigger_release - (inode, ctx, &releaseversion, &flags); - if (ret) { - GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0); + (inode, ctx, &releaseversion); + if (!ret) + goto unblock; + + signinfo = __br_stub_inode_sign_state (ctx, GF_FOP_RELEASE, fd); + signinfo = htonl (signinfo); + + /* inode back to initial state: mark dirty */ + if (ctx->info_sign == BR_SIGN_NORMAL) { __br_stub_mark_inode_dirty (ctx); + __br_stub_unset_inode_modified (ctx); } } unblock: @@ -1334,10 +1875,17 @@ br_stub_release (xlator_t *this, fd_t *fd) if (ret) { gf_log (this->name, GF_LOG_DEBUG, - "releaseversion: %lu|flags: %d", releaseversion, flags); - br_stub_send_ipc_fop (this, fd, releaseversion, flags); + "releaseversion: %lu | flags: %d | signinfo: %d", + (unsigned long) ntohl (releaseversion), + flags, ntohl(signinfo)); + br_stub_send_ipc_fop (this, fd, releaseversion, signinfo); } + ret = fd_ctx_del (fd, this, &tmp); + br_stub_fd = (br_stub_fd_t *)(long)tmp; + + GF_FREE (br_stub_fd); + return 0; } @@ -1351,11 +1899,12 @@ void 
br_stub_ictxmerge (xlator_t *this, fd_t *fd, inode_t *inode, inode_t *linked_inode) { - int32_t ret = 0; - uint64_t ctxaddr = 0; - uint64_t lctxaddr = 0; - br_stub_inode_ctx_t *ctx = NULL; - br_stub_inode_ctx_t *lctx = NULL; + int32_t ret = 0; + uint64_t ctxaddr = 0; + uint64_t lctxaddr = 0; + br_stub_inode_ctx_t *ctx = NULL; + br_stub_inode_ctx_t *lctx = NULL; + br_stub_fd_t *br_stub_fd = NULL; ret = br_stub_get_inode_ctx (this, inode, &ctxaddr); if (ret < 0) @@ -1369,29 +1918,15 @@ br_stub_ictxmerge (xlator_t *this, fd_t *fd, goto unblock; lctx = (br_stub_inode_ctx_t *) lctxaddr; - if (__br_stub_is_inode_dirty (lctx)) { - /** - * RACY code: An inode can end up in this situation - * after a lookup() or after a create() followed by - * a release(). Even if we distinguish b/w the two, - * there needs to be more infrastructure built up - * in stub to handle these races. Note, that it's - * probably OK to ignore the race iff the version - * was initialized on the very first lookup(), i.e., - * [ongoingversion: default]. - * - * FIXME: fixup races [create(1..n)/lookup(1..n)]. 
- */ - GF_ASSERT (lctx->currentversion - == BITROT_DEFAULT_CURRENT_VERSION); - __br_stub_track_openfd (fd, lctx); - __br_stub_mark_inode_synced (lctx); - } else { - GF_ASSERT (ctx->currentversion <= lctx->currentversion); - __br_stub_track_openfd (fd, lctx); + GF_ASSERT (list_is_singular (&ctx->fd_list)); + br_stub_fd = list_first_entry (&ctx->fd_list, br_stub_fd_t, + list); + if (br_stub_fd) { + GF_ASSERT (br_stub_fd->fd == fd); + list_move_tail (&br_stub_fd->list, &lctx->fd_list); } } - unblock: +unblock: UNLOCK (&linked_inode->lock); done: @@ -1409,6 +1944,9 @@ struct xlator_fops fops = { .getxattr = br_stub_getxattr, .fgetxattr = br_stub_fgetxattr, .fsetxattr = br_stub_fsetxattr, + .writev = br_stub_writev, + .truncate = br_stub_truncate, + .ftruncate = br_stub_ftruncate, }; struct xlator_cbks cbks = { diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h index 86090bfb877..69e212bb81f 100644 --- a/xlators/features/bit-rot/src/stub/bit-rot-stub.h +++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h @@ -21,6 +21,7 @@ #include "xlator.h" #include "defaults.h" #include "call-stub.h" +#include "bit-rot-stub-mem-types.h" #include "bit-rot-common.h" @@ -32,17 +33,18 @@ typedef struct br_stub_inode_ctx { a writeback to disk? 
*/ unsigned long currentversion; /* ongoing version */ - struct release { - int32_t ordflags; - unsigned long opencount; /* number of open()s before - final release() */ - unsigned long releasecount; /* number of release()s */ - } releasectx; -#define BR_STUB_REQUIRE_RELEASE_CBK 0x0E0EA0E + int info_sign; + struct list_head fd_list; /* list of open fds or fds participating in + write operations */ } br_stub_inode_ctx_t; +typedef struct br_stub_fd { + fd_t *fd; + struct list_head list; +} br_stub_fd_t; #define I_DIRTY (1<<0) /* inode needs writeback */ +#define I_MODIFIED (1<<1) #define WRITEBACK_DURABLE 1 /* writeback is durable */ /** @@ -60,12 +62,10 @@ typedef struct br_stub_local { uuid_t gfid; inode_t *inode; unsigned long version; - gf_boolean_t markdirty; } context; } u; } br_stub_local_t; -#define BR_STUB_FULL_VERSIONING (1<<0) #define BR_STUB_INCREMENTAL_VERSIONING (1<<1) typedef struct br_stub_private { @@ -96,16 +96,131 @@ __br_stub_is_inode_dirty (br_stub_inode_ctx_t *ctx) return (ctx->need_writeback & I_DIRTY); } +/* inode modification markers */ +static inline void +__br_stub_set_inode_modified (br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback |= I_MODIFIED; +} + +static inline void +__br_stub_unset_inode_modified (br_stub_inode_ctx_t *ctx) +{ + ctx->need_writeback &= ~I_MODIFIED; +} + +static inline int +__br_stub_is_inode_modified (br_stub_inode_ctx_t *ctx) +{ + return (ctx->need_writeback & I_MODIFIED); +} + +br_stub_fd_t * +br_stub_fd_new (void) +{ + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = GF_CALLOC (1, sizeof (*br_stub_fd), + gf_br_stub_mt_br_stub_fd_t); + + return br_stub_fd; +} + +int +__br_stub_fd_ctx_set (xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd) +{ + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, br_stub_fd, out); + + value = (uint64_t)(long) br_stub_fd; + + ret = __fd_ctx_set (fd, this, value); + 
+out: + return ret; +} + +br_stub_fd_t * +__br_stub_fd_ctx_get (xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *br_stub_fd = NULL; + uint64_t value = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + ret = __fd_ctx_get (fd, this, &value); + if (ret) + return NULL; + + br_stub_fd = (br_stub_fd_t *) ((long) value); + +out: + return br_stub_fd; +} + +br_stub_fd_t * +br_stub_fd_ctx_get (xlator_t *this, fd_t *fd) +{ + br_stub_fd_t *br_stub_fd = NULL; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + LOCK (&fd->lock); + { + br_stub_fd = __br_stub_fd_ctx_get (this, fd); + } + UNLOCK (&fd->lock); + +out: + return br_stub_fd; +} + +int32_t +br_stub_fd_ctx_set (xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, br_stub_fd, out); + + LOCK (&fd->lock); + { + ret = __br_stub_fd_ctx_set (this, fd, br_stub_fd); + } + UNLOCK (&fd->lock); + +out: + return ret; +} + static inline int -br_stub_require_release_call (xlator_t *this, fd_t *fd) +br_stub_require_release_call (xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx) { int32_t ret = 0; + br_stub_fd_t *br_stub_fd = NULL; + + br_stub_fd = br_stub_fd_new (); + if (!br_stub_fd) + return -1; + + br_stub_fd->fd = fd; + INIT_LIST_HEAD (&br_stub_fd->list); - ret = fd_ctx_set (fd, this, - (uint64_t)(long)BR_STUB_REQUIRE_RELEASE_CBK); + ret = br_stub_fd_ctx_set (this, fd, br_stub_fd); if (ret) gf_log (this->name, GF_LOG_WARNING, "could not set fd context (for release callback"); + else + *fd_ctx = br_stub_fd; + return ret; } @@ -122,7 +237,15 @@ static inline int br_stub_get_inode_ctx (xlator_t *this, inode_t *inode, uint64_t *ctx) { - return inode_ctx_get (inode, this, ctx); + int ret = -1; + + LOCK (&inode->lock); + { + ret = __br_stub_get_inode_ctx (this, inode, 
ctx); + } + UNLOCK (&inode->lock); + + return ret; } static inline int @@ -144,55 +267,31 @@ __br_stub_writeback_version (br_stub_inode_ctx_t *ctx) static inline void __br_stub_set_ongoing_version (br_stub_inode_ctx_t *ctx, unsigned long version) { - ctx->currentversion = version; -} - -static inline void -__br_stub_reset_release_counters (br_stub_inode_ctx_t *ctx) -{ - ctx->releasectx.ordflags = 0; - ctx->releasectx.opencount = 0; - ctx->releasectx.releasecount = 0; -} - -static inline void -__br_stub_track_release (br_stub_inode_ctx_t *ctx) -{ - ++ctx->releasectx.releasecount; -} - -static inline void -___br_stub_track_open (br_stub_inode_ctx_t *ctx) -{ - ++ctx->releasectx.opencount; -} - -static inline void -___br_stub_track_open_flags (fd_t *fd, br_stub_inode_ctx_t *ctx) -{ - ctx->releasectx.ordflags |= fd->flags; -} - -static inline void -__br_stub_track_openfd (fd_t *fd, br_stub_inode_ctx_t *ctx) -{ - ___br_stub_track_open (ctx); - ___br_stub_track_open_flags (fd, ctx); + if (ctx->currentversion < version) + ctx->currentversion = version; + else + gf_log ("bit-rot-stub", GF_LOG_WARNING, "current version: %lu" + "new version: %lu", ctx->currentversion, version); } static inline int __br_stub_can_trigger_release (inode_t *inode, - br_stub_inode_ctx_t *ctx, - unsigned long *version, int32_t *flags) + br_stub_inode_ctx_t *ctx, unsigned long *version) { - if (list_empty (&inode->fd_list) - && (ctx->releasectx.releasecount == ctx->releasectx.opencount)) { - if (flags) - *flags = htonl (ctx->releasectx.ordflags); + /** + * If the inode is modified, then it has to be dirty. An inode is + * marked dirty once version is increased. It is marked as modified + * when the modification call (write/truncate) which triggered + * the versioning is successful. 
+ */ + if (__br_stub_is_inode_modified (ctx) + && list_empty (&ctx->fd_list) + && (ctx->info_sign != BR_SIGN_REOPEN_WAIT)) { + + GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0); + if (version) *version = htonl (ctx->currentversion); - - __br_stub_reset_release_counters (ctx); return 1; } @@ -261,11 +360,16 @@ static inline void br_stub_remove_vxattrs (dict_t *xattr) { if (xattr) { - dict_del (xattr, BITROT_OBJECT_BAD_KEY); dict_del (xattr, BITROT_CURRENT_VERSION_KEY); dict_del (xattr, BITROT_SIGNING_VERSION_KEY); dict_del (xattr, BITROT_SIGNING_XATTR_SIZE_KEY); } } +int32_t +br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx); + +br_sign_state_t +__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop, + fd_t *fd); #endif /* __BIT_ROT_STUB_H__ */ |