diff options
author | Raghavendra Bhat <raghavendra@redhat.com> | 2015-04-09 15:38:47 +0530 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2015-05-08 11:27:42 -0700 |
commit | c93c433a44770de931f837be179c5ccdba958cad (patch) | |
tree | f6c5f513e8b2ccadc3b53d73db609f731d54ba8b /xlators/features/bit-rot/src/bitd | |
parent | 680b3bf629f0fef038470baab62c6d6d8f5988ce (diff) |
features/bit-rot-stub: versioning of objects in write/truncate fop instead of open
* This patch brings in the changes where object versioning is done in write and
truncate fops instead of tracking them in open and create fops. This model
works for both regular and anonymous fds. It also removes the race associated
with open calls, create and lookups.
This patch follows the below method for object versioning and notifications:
Before sending writev on the fd, increase the ongoing
version first. This makes anonymous fd write similar to the regular
fd write by having the ongoing version increased before doing the
write.
Do following steps to do versioning:
1) For anonymous fds set the fd context (so that release is invoked) and add
the fd context to the list maintained in the inode context.
For regular fds the above thing would have been done in open itself.
2) Increase the on-disk ongoing version
3) Increase the in memory ongoing version and mark inode as non-dirty
4) Once versioning is successfully done send write operation. If
versioning fails, then fail the write fop.
5) In writev_cbk mark inode as modified.
Change-Id: I7104391bbe076d8fc49b68745d2ec29a6e92476c
BUG: 1207979
Signed-off-by: Raghavendra Bhat <raghavendra@redhat.com>
Reviewed-on: http://review.gluster.org/10233
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators/features/bit-rot/src/bitd')
-rw-r--r-- | xlators/features/bit-rot/src/bitd/bit-rot.c | 138 | ||||
-rw-r--r-- | xlators/features/bit-rot/src/bitd/bit-rot.h | 6 |
2 files changed, 128 insertions, 16 deletions
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c index 5638b0f348b..61d461f897b 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot.c +++ b/xlators/features/bit-rot/src/bitd/bit-rot.c @@ -171,11 +171,11 @@ bitd_is_bad_file (xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd) if (fd) ret = syncop_fgetxattr (child->xl, fd, &xattr, - BITROT_OBJECT_BAD_KEY, NULL, + "trusted.glusterfs.bad-file", NULL, NULL); else if (loc) ret = syncop_getxattr (child->xl, loc, &xattr, - BITROT_OBJECT_BAD_KEY, NULL, + "trusted.glusterfs.bad-file", NULL, NULL); if (!ret) { @@ -484,6 +484,98 @@ br_log_object_path (xlator_t *this, char *op, op, path, strerror (op_errno)); } +static void +br_send_dummy_write (xlator_t *this, fd_t *fd, br_child_t *child, + dict_t *xdata) +{ + struct iovec iov = {0, }; + struct iobref *iobref = NULL; + struct iobuf *iobuf = NULL; + char *msg = NULL; + size_t size = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bit-rot", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, child, out); + + msg = gf_strdup ("GLUSTERFS"); + if (!msg) + goto out; + + size = strlen (msg); + + iov.iov_base = msg; + iov.iov_len = size; + + iobref = iobref_new (); + if (!iobref) + goto free_msg; + + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) + goto free_iobref; + + iobref_add (iobref, iobuf); + + iov_unload (iobuf_ptr (iobuf), &iov, 1); /* FIXME!!! 
*/ + + iov.iov_base = iobuf_ptr (iobuf); + iov.iov_len = size; + + ret = syncop_writev (child->xl, fd, &iov, 1, 0, iobref, 0, xdata, NULL); + if (ret <= 0) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "dummy write failed (%s)", strerror (errno)); + goto free_iobuf; + } + + /* iobref_unbref() takes care of iobuf unref */ + ret = 0; + + free_iobuf: + iobuf_unref (iobuf); + free_iobref: + iobref_unref (iobref); + free_msg: + GF_FREE (msg); + out: + return; +} + +static void +br_object_handle_reopen (xlator_t *this, + br_object_t *object, inode_t *linked_inode) +{ + int32_t ret = -1; + dict_t *dict = NULL; + loc_t loc = {0, }; + + /** + * Here dict is purposefully not checked for NULL, because at any cost + * sending a re-open should not be missed. This re-open is an indication + * for the stub to properly mark inode's status. + */ + dict = dict_new (); + if (dict) { + /* TODO: Make it a #define */ + ret = dict_set_int32 (dict, "br-fd-reopen", 1); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Object reopen would trigger versioning."); + } + + loc.inode = inode_ref (linked_inode); + gf_uuid_copy (loc.gfid, linked_inode->gfid); + + br_trigger_sign (this, object->child, linked_inode, &loc, dict); + + if (dict) + dict_unref (dict); + loc_wipe (&loc); +} + /** * Sign a given object. This routine runs full throttle. There needs to be * some form of priority scheduling and/or read burstness to avoid starving @@ -497,6 +589,7 @@ static inline int32_t br_sign_object (br_object_t *object) fd_t *fd = NULL; struct iatt iatt = {0, }; pid_t pid = GF_CLIENT_PID_BITD; + br_sign_state_t sign_info = BR_SIGN_NORMAL; GF_VALIDATE_OR_GOTO ("bit-rot", object, out); @@ -515,6 +608,20 @@ static inline int32_t br_sign_object (br_object_t *object) goto out; } + /* sanity check */ + sign_info = ntohl (object->sign_info); + GF_ASSERT (sign_info != BR_SIGN_NORMAL); + + /** + * For fd's that have notified for reopening, we send an explicit + * open() followed by a dummy write() call. 
This triggers the + * actual signing of the object. + */ + if (sign_info == BR_SIGN_REOPEN_WAIT) { + br_object_handle_reopen (this, object, linked_inode); + goto unref_inode; + } + ret = br_object_open (this, object, linked_inode, &fd); if (!fd) { br_log_object (this, "open", object->gfid, -ret); @@ -648,6 +755,7 @@ br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev) /* NOTE: it's BE, but no worry */ object->signedversion = ev->u.releasebr.version; + object->sign_info = ev->u.releasebr.sign_info; out: return object; @@ -693,7 +801,6 @@ br_brick_callback (void *xl, char *brick, xlator_t *this = NULL; br_object_t *object = NULL; br_child_t *child = NULL; - int32_t flags = 0; struct gf_tw_timer_list *timer = NULL; this = xl; @@ -710,14 +817,6 @@ br_brick_callback (void *xl, char *brick, gf_log (this->name, GF_LOG_DEBUG, "RELEASE EVENT [GFID %s]", uuid_utoa (gfid)); - flags = (int32_t)ntohl (ev->u.releasebr.flags); - if (flags == O_RDONLY) { - gf_log (this->name, GF_LOG_DEBUG, - "Read only fd [GFID: %s], ignoring signing..", - uuid_utoa (gfid)); - goto out; - } - child = br_get_child_from_brick_path (this, brick); if (!child) { gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume " @@ -804,12 +903,15 @@ out: return need_sign; } -static inline void +void br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, - loc_t *loc) + loc_t *loc, dict_t *xdata) { fd_t *fd = NULL; int32_t ret = -1; + pid_t pid = GF_CLIENT_PID_BITD; + + syncopctx_setfspid (&pid); fd = fd_create (linked_inode, 0); if (!fd) { @@ -828,8 +930,10 @@ br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, fd_bind (fd); } - if (fd) + if (fd) { + br_send_dummy_write (this, fd, child, xdata); syncop_close (fd); + } out: return; @@ -972,7 +1076,7 @@ bitd_oneshot_crawl (xlator_t *subvol, gf_log (this->name, GF_LOG_INFO, "Triggering signing for %s [GFID: %s | Brick: %s]", loc.path, uuid_utoa (linked_inode->gfid), child->brick_path); - 
br_trigger_sign (this, child, linked_inode, &loc); + br_trigger_sign (this, child, linked_inode, &loc, NULL); ret = 0; @@ -1600,7 +1704,9 @@ struct xlator_cbks cbks; struct volume_options options[] = { { .key = {"expiry-time"}, .type = GF_OPTION_TYPE_INT, - .default_value = "120", + /* Let the default timer be half the value of the wait time for + * sining (which is 120 as of now) */ + .default_value = "60", .description = "default time duration for which an object waits " "before it is signed", }, diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h index 66515e3213c..bbaf86fa65f 100644 --- a/xlators/features/bit-rot/src/bitd/bit-rot.h +++ b/xlators/features/bit-rot/src/bitd/bit-rot.h @@ -152,6 +152,8 @@ struct br_object { be signed */ br_child_t *child; /* object's subvolume */ + int sign_info; + struct list_head list; /* hook to add to the queue once the object is expired from timer wheel */ void *data; @@ -175,4 +177,8 @@ br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *); gf_boolean_t bitd_is_bad_file (xlator_t *, br_child_t *, loc_t *, fd_t *); +void +br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode, + loc_t *loc, dict_t *xdata); + #endif /* __BIT_ROT_H__ */ |