summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVenky Shankar <vshankar@redhat.com>2015-05-26 21:51:31 +0530
committerVijay Bellur <vbellur@redhat.com>2015-05-28 02:39:05 -0700
commite9290dc0db7bee31cda1cbed1d9eb68d9c404746 (patch)
treea52ecd5ced83ebc6f0d4e62cb6fc72e20c86efba
parentbc858473db1e1091b15d3f3d69ac6ba5d20b58e7 (diff)
features/bitrot: stub improvements and fixes
This patch refactors the signing trigger mechanism used by bitrot daemon as a "catch up" meachanism to sign files which _missed_ signing on the last run either due to bitrot being disabled and enabled again or if bitrot is enabled for a volume with existing data. Existing implementation relies on overloading writev() to trigger signing which just by the looks sounded dangerous and I hated it to the core. This change moves all that business to the setxattr interface thereby keeping the writev path strictly for client IO. Why not use IPC fop to trigger signing? There's a need to access the object's inode to perform various maintainance operations. inode is not _directly_ accessible in the IPC fop (although, it can be found via inode_grep() for the object's GFID - the inode just needs to be pinned in memory, which is the case if there's an active fd on the inode). This patch relies on good old technique of overloading fsetxattr() to do the job instead of using IPC fop. There are some pretty nice cleanups along the lines of memory deallocations, unncessary allocations and redundant ref()ing of structures (such as fd's) provided by this patch. All in all - much improved code navigation. Change-Id: Id93fe90b1618802d1a95a5072517dac342b96cb8 BUG: 1224600 Signed-off-by: Venky Shankar <vshankar@redhat.com> Reviewed-on: http://review.gluster.org/10942 Tested-by: NetBSD Build System Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c150
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h3
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-common.h5
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.c700
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.h3
5 files changed, 435 insertions, 426 deletions
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index b9adbd6647c..2652f02b4ea 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -485,94 +485,72 @@ br_log_object_path (xlator_t *this, char *op,
}
static void
-br_send_dummy_write (xlator_t *this, fd_t *fd, br_child_t *child,
- dict_t *xdata)
+br_trigger_sign (xlator_t *this, br_child_t *child,
+ inode_t *linked_inode, loc_t *loc, gf_boolean_t need_reopen)
{
- struct iovec iov = {0, };
- struct iobref *iobref = NULL;
- struct iobuf *iobuf = NULL;
- char *msg = NULL;
- size_t size = 0;
- int32_t ret = -1;
-
- GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
- GF_VALIDATE_OR_GOTO (this->name, child, out);
-
- msg = gf_strdup ("GLUSTERFS");
- if (!msg)
- goto out;
+ fd_t *fd = NULL;
+ int32_t ret = -1;
+ uint32_t val = 0;
+ dict_t *dict = NULL;
+ pid_t pid = GF_CLIENT_PID_BITD;
- size = strlen (msg);
+ syncopctx_setfspid (&pid);
- iov.iov_base = msg;
- iov.iov_len = size;
+ val = (need_reopen == _gf_true) ? BR_OBJECT_REOPEN : BR_OBJECT_RESIGN;
- iobref = iobref_new ();
- if (!iobref)
- goto free_msg;
+ dict = dict_new ();
+ if (!dict)
+ goto out;
- iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
- if (!iobuf)
- goto free_iobref;
+ ret = dict_set_uint32 (dict, BR_REOPEN_SIGN_HINT_KEY, val);
+ if (ret)
+ goto cleanup_dict;
- iobref_add (iobref, iobuf);
+ ret = -1;
+ fd = fd_create (linked_inode, 0);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create fd "
+ "[GFID %s]", uuid_utoa (linked_inode->gfid));
+ goto cleanup_dict;
+ }
- iov_unload (iobuf_ptr (iobuf), &iov, 1); /* FIXME!!! */
+ ret = syncop_open (child->xl, loc, O_RDWR, fd, NULL, NULL);
+ if (ret) {
+ br_log_object (this, "open", linked_inode->gfid, -ret);
+ goto unref_fd;
+ }
- iov.iov_base = iobuf_ptr (iobuf);
- iov.iov_len = size;
+ fd_bind (fd);
- ret = syncop_writev (child->xl, fd, &iov, 1, 0, iobref, 0, xdata, NULL);
- if (ret <= 0) {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "dummy write failed (%s)", strerror (errno));
- goto free_iobuf;
- }
+ ret = syncop_fsetxattr (child->xl, fd, dict, 0, NULL, NULL);
+ if (ret)
+ br_log_object (this, "fsetxattr", linked_inode->gfid, -ret);
- /* iobref_unbref() takes care of iobuf unref */
- ret = 0;
+ /* passthough: fd_unref() */
- free_iobuf:
- iobuf_unref (iobuf);
- free_iobref:
- iobref_unref (iobref);
- free_msg:
- GF_FREE (msg);
+ unref_fd:
+ fd_unref (fd);
+ cleanup_dict:
+ dict_unref (dict);
out:
- return;
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Could not trigger signingd for %s (reopen hint: %d)",
+ uuid_utoa (linked_inode->gfid), val);
+ }
}
static void
-br_object_handle_reopen (xlator_t *this,
- br_object_t *object, inode_t *linked_inode)
+br_object_resign (xlator_t *this,
+ br_object_t *object, inode_t *linked_inode)
{
- int32_t ret = -1;
- dict_t *dict = NULL;
- loc_t loc = {0, };
-
- /**
- * Here dict is purposefully not checked for NULL, because at any cost
- * sending a re-open should not be missed. This re-open is an indication
- * for the stub to properly mark inode's status.
- */
- dict = dict_new ();
- if (dict) {
- /* TODO: Make it a #define */
- ret = dict_set_int32 (dict, "br-fd-reopen", 1);
- if (ret)
- gf_log (this->name, GF_LOG_WARNING,
- "Object reopen would trigger versioning.");
- }
+ loc_t loc = {0, };
loc.inode = inode_ref (linked_inode);
gf_uuid_copy (loc.gfid, linked_inode->gfid);
- br_trigger_sign (this, object->child, linked_inode, &loc, dict);
+ br_trigger_sign (this, object->child, linked_inode, &loc, _gf_false);
- if (dict)
- dict_unref (dict);
loc_wipe (&loc);
}
@@ -618,7 +596,7 @@ static inline int32_t br_sign_object (br_object_t *object)
* actual signing of the object.
*/
if (sign_info == BR_SIGN_REOPEN_WAIT) {
- br_object_handle_reopen (this, object, linked_inode);
+ br_object_resign (this, object, linked_inode);
goto unref_inode;
}
@@ -903,41 +881,7 @@ out:
return need_sign;
}
-void
-br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
- loc_t *loc, dict_t *xdata)
-{
- fd_t *fd = NULL;
- int32_t ret = -1;
- pid_t pid = GF_CLIENT_PID_BITD;
-
- syncopctx_setfspid (&pid);
-
- fd = fd_create (linked_inode, 0);
- if (!fd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to create fd [GFID %s]",
- uuid_utoa (linked_inode->gfid));
- goto out;
- }
-
- ret = syncop_open (child->xl, loc, O_RDWR, fd, NULL, NULL);
- if (ret) {
- br_log_object (this, "open", linked_inode->gfid, -ret);
- fd_unref (fd);
- fd = NULL;
- } else {
- fd_bind (fd);
- }
-
- if (fd) {
- br_send_dummy_write (this, fd, child, xdata);
- syncop_close (fd);
- }
-out:
- return;
-}
int32_t
br_prepare_loc (xlator_t *this, br_child_t *child, loc_t *parent,
@@ -1076,7 +1020,7 @@ bitd_oneshot_crawl (xlator_t *subvol,
gf_log (this->name, GF_LOG_INFO,
"Triggering signing for %s [GFID: %s | Brick: %s]",
loc.path, uuid_utoa (linked_inode->gfid), child->brick_path);
- br_trigger_sign (this, child, linked_inode, &loc, NULL);
+ br_trigger_sign (this, child, linked_inode, &loc, _gf_true);
ret = 0;
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
index 1705f715f0c..6543be763d6 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -182,8 +182,5 @@ br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
gf_boolean_t
bitd_is_bad_file (xlator_t *, br_child_t *, loc_t *, fd_t *);
-void
-br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
- loc_t *loc, dict_t *xdata);
#endif /* __BIT_ROT_H__ */
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h
index 7fd584e5970..a8285d2b560 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-common.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h
@@ -119,6 +119,11 @@ typedef enum {
/* BitRot stub start time (virtual xattr) */
#define GLUSTERFS_GET_BR_STUB_INIT_TIME "trusted.glusterfs.bit-rot.stub-init"
+/* signing/reopen hint */
+#define BR_OBJECT_RESIGN 0
+#define BR_OBJECT_REOPEN 1
+#define BR_REOPEN_SIGN_HINT_KEY "trusted.glusterfs.bit-rot.reopen-hint"
+
static inline int
br_is_signature_type_valid (int8_t signaturetype)
{
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
index 93db072f671..524c235b549 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
@@ -255,7 +255,7 @@ br_stub_fill_local (br_stub_local_t *local,
local->fopstub = stub;
local->versioningtype = versioningtype;
local->u.context.version = memversion;
- if (fd && !local->u.context.fd)
+ if (fd)
local->u.context.fd = fd_ref (fd);
if (inode)
local->u.context.inode = inode_ref (inode);
@@ -279,6 +279,126 @@ br_stub_cleanup_local (br_stub_local_t *local)
memset (local->u.context.gfid, '\0', sizeof (uuid_t));
}
+static int
+br_stub_need_versioning (xlator_t *this,
+ fd_t *fd, gf_boolean_t *versioning,
+ gf_boolean_t *modified, br_stub_inode_ctx_t **ctx)
+{
+ int32_t ret = -1;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *c = NULL;
+
+ *versioning = _gf_false;
+ *modified = _gf_false;
+
+ ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
+ goto error_return;
+ }
+
+ c = (br_stub_inode_ctx_t *) (long) ctx_addr;
+
+ LOCK (&fd->inode->lock);
+ {
+ if (__br_stub_is_inode_dirty (c))
+ *versioning = _gf_true;
+ if (__br_stub_is_inode_modified (c))
+ *modified = _gf_true;
+ }
+ UNLOCK (&fd->inode->lock);
+
+ if (ctx)
+ *ctx = c;
+ return 0;
+
+ error_return:
+ return -1;
+}
+
+static int32_t
+br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ br_stub_fd = br_stub_fd_ctx_get (this, fd);
+ if (!br_stub_fd) {
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "add fd to the inode (gfid: %s)",
+ uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int
+br_stub_versioning_prep (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ br_stub_local_t *local = NULL;
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ goto error_return;
+ }
+
+ if (fd_is_anonymous (fd)) {
+ ret = br_stub_anon_fd_ctx (this, fd, ctx);
+ if (ret)
+ goto free_local;
+ }
+
+ frame->local = local;
+
+ return 0;
+
+ free_local:
+ br_stub_dealloc_local (local);
+ error_return:
+ return -1;
+}
+
+static int
+br_stub_mark_inode_modified (xlator_t *this, br_stub_local_t *local)
+{
+ fd_t *fd = NULL;
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+
+ fd = local->u.context.fd;
+
+ ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
+ if (ret < 0)
+ goto error_return;
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+
+ LOCK (&fd->inode->lock);
+ {
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&fd->inode->lock);
+
+ return 0;
+
+ error_return:
+ return -1;
+}
+
/**
* callback for inode/fd versioning
*/
@@ -379,7 +499,6 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
fd->inode, fd->inode->gfid,
versioningtype, memversion);
- frame->local = local;
STACK_WIND (frame, callback,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr,
fd, dict, flags, xdata);
@@ -409,7 +528,6 @@ br_stub_perform_incversioning (xlator_t *this,
writeback_version = __br_stub_writeback_version (ctx);
- /* inode requires writeback to disk */
op_errno = ENOMEM;
dict = dict_new ();
if (!dict)
@@ -518,54 +636,160 @@ br_stub_prepare_signature (xlator_t *this, dict_t *dict,
return -1;
}
-int
-br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *dict, int flags, dict_t *xdata)
+static void
+br_stub_handle_object_signature (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ br_isignature_t *sign, dict_t *xdata)
{
- int32_t ret = 0;
- br_isignature_t *sign = NULL;
- gf_boolean_t xref = _gf_false;
+ int32_t ret = -1;
+ gf_boolean_t xref = _gf_false;
- if (!IA_ISREG (fd->inode->ia_type))
- goto wind;
- ret = dict_get_bin (dict, GLUSTERFS_SET_OBJECT_SIGNATURE,
- (void **) &sign);
- if (ret < 0)
- goto wind;
if (frame->root->pid != GF_CLIENT_PID_BITD)
- goto unwind;
+ goto dofop;
ret = br_stub_prepare_signature (this, dict, fd->inode, sign);
if (ret)
- goto unwind;
+ goto dofop;
dict_del (dict, GLUSTERFS_SET_OBJECT_SIGNATURE);
+ ret = -1;
if (!xdata) {
xdata = dict_new ();
if (!xdata)
- goto unwind;
+ goto dofop;
} else {
dict_ref (xdata);
}
xref = _gf_true;
ret = dict_set_int32 (xdata, GLUSTERFS_DURABLE_OP, 0);
+
+ dofop:
+ if (ret)
+ STACK_UNWIND_STRICT (fsetxattr, frame, -1, EINVAL, NULL);
+ else {
+ gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu",
+ sign->signedversion);
+
+ STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsetxattr, fd, dict, 0,
+ xdata);
+ }
+
+ if (xref)
+ dict_unref (xdata);
+}
+
+int32_t
+br_stub_fsetxattr_resume (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ int32_t ret = -1;
+ br_stub_local_t *local = NULL;
+
+ local = frame->local;
+ frame->local = NULL;
+
+ ret = br_stub_mark_inode_modified (this, local);
+ if (ret) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ }
+
+ STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
+
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+
+ return 0;
+}
+
+static void
+br_stub_handle_object_reopen (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, uint32_t val)
+{
+ int32_t ret = -1;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ call_stub_t *stub = NULL;
+ gf_boolean_t inc_version = _gf_false;
+ gf_boolean_t modified = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ ret = br_stub_need_versioning (this, fd, &inc_version, &modified, &ctx);
if (ret)
goto unwind;
- gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu",
- sign->signedversion);
+ LOCK (&fd->inode->lock);
+ {
+ (void) __br_stub_inode_sign_state (ctx, GF_FOP_FSETXATTR, fd);
+ }
+ UNLOCK (&fd->inode->lock);
+
+ if ((val == BR_OBJECT_RESIGN) || !inc_version) {
+ op_ret = op_errno = 0;
+ goto unwind;
+ }
+
+ ret = br_stub_versioning_prep (frame, this, fd, ctx);
+ if (ret)
+ goto unwind;
+ local = frame->local;
+
+ stub = fop_fsetxattr_cbk_stub (frame, br_stub_fsetxattr_resume,
+ 0, 0, NULL);
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "fsetxattr fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto cleanup_local;
+ }
+
+ (void) br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+ return;
+
+ cleanup_local:
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+
+ unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL);
+}
+
+int
+br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, dict_t *dict, int flags, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint32_t val = 0;
+ br_isignature_t *sign = NULL;
+
+ if (!IA_ISREG (fd->inode->ia_type))
+ goto wind;
+
+ /* object signature request */
+ ret = dict_get_bin (dict, GLUSTERFS_SET_OBJECT_SIGNATURE,
+ (void **) &sign);
+ if (!ret) {
+ br_stub_handle_object_signature (frame, this,
+ fd, dict, sign, xdata);
+ goto done;
+ }
+
+ /* object reopen request */
+ ret = dict_get_uint32 (dict, BR_REOPEN_SIGN_HINT_KEY, &val);
+ if (!ret) {
+ br_stub_handle_object_reopen (frame, this, fd, val);
+ goto done;
+ }
+
wind:
STACK_WIND (frame, default_setxattr_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd,
dict, flags, xdata);
- goto done;
-
- unwind:
- STACK_UNWIND_STRICT (setxattr, frame, -1, EINVAL, NULL);
done:
- if (xref)
- dict_unref (xdata);
return 0;
}
@@ -812,110 +1036,29 @@ br_stub_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- br_stub_local_t *local = NULL;
+ int32_t ret = 0;
+ br_stub_local_t *local = NULL;
- if (frame->local) {
- local = frame->local;
- frame->local = NULL;
- }
+ local = frame->local;
+ frame->local = NULL;
if (op_ret < 0)
goto unwind;
- ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
- &ctx_addr);
- if (ret < 0)
- goto unwind;
-
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-
- /* Mark the flag to indicate the inode has been modified */
- LOCK (&local->u.context.fd->inode->lock);
- {
- if (!__br_stub_is_inode_modified (ctx))
- __br_stub_set_inode_modified (ctx);
+ ret = br_stub_mark_inode_modified (this, local);
+ if (ret) {
+ op_ret = -1;
+ op_errno = EINVAL;
}
- UNLOCK (&local->u.context.fd->inode->lock);
-
unwind:
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
+ STACK_UNWIND_STRICT (writev, frame,
+ op_ret, op_errno, prebuf, postbuf, xdata);
+
br_stub_cleanup_local (local);
br_stub_dealloc_local (local);
- return 0;
-}
-
-/**
- * Ongoing version is increased only for the first modify operation.
- * First modify version means the first write or truncate call coming on the
- * first fd in the list of inodes.
- * For anonymous fds open would not have come, so check if its the first write
- * by doing both inode dirty check and ensuring list of fds is empty
- */
-static inline gf_boolean_t
-br_stub_inc_version (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
-{
- gf_boolean_t inc_version = _gf_false;
-
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
- GF_VALIDATE_OR_GOTO (this->name, ctx, out);
-
- LOCK (&fd->inode->lock);
- {
- if (__br_stub_is_inode_dirty (ctx))
- inc_version = _gf_true;
- }
- UNLOCK (&fd->inode->lock);
-
-out:
- return inc_version;
-}
-
-/**
- * Since NFS does not do open, writes from NFS are sent over an anonymous
- * fd. It means each write fop might come on a different anonymous fd and
- * will lead to very large number of notifications being sent. It might
- * affect the perfromance as, there will too many sign requests.
- * To avoid that whenever the last fd released from an inode (logical release)
- * is an anonymous fd the release notification is sent with a flag being set
- * __br_stub_anon_release (ctx);
- * BitD checks for the flag and if set, it will send a dummy write request
- * (again on an anonymous fd) instead of triggering sign.
- * Bit-rot-stub should identify such dummy writes and should send success to
- * them instead of winding them downwards.
- */
-gf_boolean_t
-br_stub_dummy_write (call_frame_t *frame)
-{
- return (frame->root->pid == GF_CLIENT_PID_BITD)
- ? _gf_true : _gf_false;
-}
-
-int32_t
-br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
-{
- int32_t ret = -1;
- br_stub_fd_t *br_stub_fd = NULL;
- br_stub_fd = br_stub_fd_ctx_get (this, fd);
- if (!br_stub_fd) {
- ret = br_stub_add_fd_to_inode (this, fd, ctx);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "failed to "
- "add fd to the inode (gfid: %s)",
- uuid_utoa (fd->inode->gfid));
- goto out;
- }
- }
-
- ret = 0;
-
-out:
- return ret;
+ return 0;
}
int32_t
@@ -923,107 +1066,71 @@ br_stub_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- if (frame->root->pid == GF_CLIENT_PID_BITD)
- br_stub_writev_cbk (frame, NULL, this, vector->iov_len, 0,
- NULL, NULL, NULL);
- else
- STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev, fd, vector, count,
- offset, flags, iobref, xdata);
+ STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ offset, flags, iobref, xdata);
return 0;
}
/**
- TODO: If possible add pictorial represention of below comment.
-
- Before sending writev on the ANONYMOUS FD, increase the ongoing
- version first. This brings anonymous fd write closer to the regular
- fd write by having the ongoing version increased before doing the
- write (In regular fd, after open the ongoing version is incremented).
- Do following steps to handle writes on anonymous fds:
- 1) Increase the on-disk ongoing version
- 2) Once versioning is successfully done send write operation. If versioning
- fails, then fail the write fop.
- 3) In writev_cbk do below things:
- a) Increase in-memory version
- b) set the fd context (so that br_stub_release is invoked)
- c) add the fd to the list of fds maintained in the inode context of
- bitrot-stub.
- d) Mark inode as non dirty
- e) Mard inode as modified (in the inode context)
-**/
+ * This is probably the most crucial part about the whole versioning thing.
+ * There's absolutely no differentiation as such between an anonymous fd
+ * and a regular fd except the fd context initialization. Object versioning
+ * is perfomed when the inode is dirty. Parallel write operations are no
+ * special with each write performing object versioning followed by marking
+ * the inode as non-dirty (synced). This is followed by the actual operation
+ * (writev() in this case) which on a success marks the inode as modified.
+ * This prevents signing of objects that have not been modified.
+ */
int32_t
br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- br_stub_local_t *local = NULL;
call_stub_t *stub = NULL;
int32_t op_ret = -1;
int32_t op_errno = EINVAL;
gf_boolean_t inc_version = _gf_false;
+ gf_boolean_t modified = _gf_false;
br_stub_inode_ctx_t *ctx = NULL;
- uint64_t ctx_addr = 0;
int32_t ret = -1;
+ fop_writev_cbk_t cbk = default_writev_cbk;
+ br_stub_local_t *local = NULL;
GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
- local = br_stub_alloc_local (this);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
- "(gfid: %s)", uuid_utoa (fd->inode->gfid));
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
-
- local->u.context.fd = fd_ref (fd);
- frame->local = local;
-
- ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
- "context for the inode %s",
- uuid_utoa (fd->inode->gfid));
+ ret = br_stub_need_versioning (this, fd, &inc_version, &modified, &ctx);
+ if (ret)
goto unwind;
- }
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
- if (fd_is_anonymous (fd)) {
- ret = br_stub_anon_fd_ctx (this, fd, ctx);
- if (ret)
- goto unwind;
- }
-
- /* TODO: Better to do a dummy fsetxattr instead of write. Keep write
- simple */
- if (br_stub_dummy_write (frame)) {
- LOCK (&fd->inode->lock);
- {
- (void) __br_stub_inode_sign_state
- (ctx, GF_FOP_WRITE, fd);
- }
- UNLOCK (&fd->inode->lock);
-
- if (xdata && dict_get (xdata, "br-fd-reopen")) {
- op_ret = vector->iov_len;
- op_errno = 0;
- goto unwind;
- }
- }
+ /**
+ * The inode is not dirty and also witnessed atleast one successful
+ * modification operation. Therefore, subsequent operations need not
+ * perform any special tracking.
+ */
+ if (!inc_version && modified)
+ goto wind;
/**
- * Check whether this is the first write on this inode since the last
- * sign notification has been sent. If so, do versioning. Otherwise
- * go ahead with the fop.
+ * okay.. so, either the inode needs versioning or the modification
+ * needs to be tracked. ->cbk is set to the appropriate callback
+ * routine for this.
+ * NOTE: ->local needs to be deallocated on failures from here on.
*/
- inc_version = br_stub_inc_version (this, fd, ctx);
- if (!inc_version)
+ ret = br_stub_versioning_prep (frame, this, fd, ctx);
+ if (ret)
+ goto unwind;
+
+ local = frame->local;
+ if (!inc_version) {
+ br_stub_fill_local (local, NULL, fd, fd->inode,
+ fd->inode->gfid, BR_STUB_NO_VERSIONING, 0);
+ cbk = br_stub_writev_cbk;
goto wind;
+ }
- /* Create the stub for the write fop */
stub = fop_writev_stub (frame, br_stub_writev_resume, fd, vector, count,
offset, flags, iobref, xdata);
@@ -1031,24 +1138,27 @@ br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
"write fop (gfid: %s), unwinding",
uuid_utoa (fd->inode->gfid));
- goto unwind;
+ goto cleanup_local;
}
/* Perform Versioning */
return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
-wind:
- STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
- flags, iobref, xdata);
+ wind:
+ STACK_WIND (frame, cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev,
+ fd, vector, count, offset, flags, iobref, xdata);
return 0;
-unwind:
+ cleanup_local:
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+
+ unwind:
frame->local = NULL;
STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, NULL,
NULL);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
+
return 0;
}
@@ -1057,40 +1167,28 @@ br_stub_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- br_stub_local_t *local = NULL;
+ int32_t ret = -1;
+ br_stub_local_t *local = NULL;
- if (frame->local) {
- local = frame->local;
- frame->local = NULL;
- }
+ local = frame->local;
+ frame->local = NULL;
if (op_ret < 0)
goto unwind;
- ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
- &ctx_addr);
- if (ret < 0)
- goto unwind;
-
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-
- /* Mark the flag to indicate the inode has been modified */
- LOCK (&local->u.context.fd->inode->lock);
- {
- if (!__br_stub_is_inode_modified (ctx))
- __br_stub_set_inode_modified (ctx);
+ ret = br_stub_mark_inode_modified (this, local);
+ if (ret) {
+ op_ret = -1;
+ op_errno = EINVAL;
}
- UNLOCK (&local->u.context.fd->inode->lock);
-
unwind:
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
+ STACK_UNWIND_STRICT (ftruncate, frame,
+ op_ret, op_errno, prebuf, postbuf, xdata);
+
br_stub_cleanup_local (local);
br_stub_dealloc_local (local);
+
return 0;
}
@@ -1103,6 +1201,7 @@ br_stub_ftruncate_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
+/* c.f. br_stub_writev() for explanation */
int32_t
br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
off_t offset, dict_t *xdata)
@@ -1112,72 +1211,59 @@ br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t op_ret = -1;
int32_t op_errno = EINVAL;
gf_boolean_t inc_version = _gf_false;
+ gf_boolean_t modified = _gf_false;
br_stub_inode_ctx_t *ctx = NULL;
- uint64_t ctx_addr = 0;
int32_t ret = -1;
+ fop_ftruncate_cbk_t cbk = default_ftruncate_cbk;
GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
- local = br_stub_alloc_local (this);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
- "(gfid: %s)", uuid_utoa (fd->inode->gfid));
- op_ret = -1;
- op_errno = ENOMEM;
+ ret = br_stub_need_versioning (this, fd, &inc_version, &modified, &ctx);
+ if (ret)
goto unwind;
- }
- local->u.context.fd = fd_ref (fd);
- frame->local = local;
+ if (!inc_version && modified)
+ goto wind;
- ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
- "context for the inode %s",
- uuid_utoa (fd->inode->gfid));
+ ret = br_stub_versioning_prep (frame, this, fd, ctx);
+ if (ret)
goto unwind;
- }
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
- if (fd_is_anonymous (fd)) {
- ret = br_stub_anon_fd_ctx (this, fd, ctx);
- if (ret)
- goto unwind;
- }
-
- /**
- * c.f. br_stub_writev()
- */
- inc_version = br_stub_inc_version (this, fd, ctx);
- if (!inc_version)
+ local = frame->local;
+ if (!inc_version) {
+ br_stub_fill_local (local, NULL, fd, fd->inode,
+ fd->inode->gfid, BR_STUB_NO_VERSIONING, 0);
+ cbk = br_stub_ftruncate_cbk;
goto wind;
+ }
- /* Create the stub for the ftruncate fop */
stub = fop_ftruncate_stub (frame, br_stub_ftruncate_resume, fd, offset,
xdata);
if (!stub) {
gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
"ftruncate fop (gfid: %s), unwinding",
uuid_utoa (fd->inode->gfid));
- goto unwind;
+ goto cleanup_local;
}
- /* Perform Versioning */
return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
-wind:
- STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ wind:
+ STACK_WIND (frame, cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
return 0;
-unwind:
+ cleanup_local:
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+
+ unwind:
frame->local = NULL;
STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, NULL, NULL,
NULL);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
+
return 0;
}
@@ -1186,34 +1272,20 @@ br_stub_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- br_stub_local_t *local = NULL;
+ int32_t ret = 0;
+ br_stub_local_t *local = NULL;
- if (frame->local) {
- local = frame->local;
- frame->local = NULL;
- }
+ local = frame->local;
+ frame->local = NULL;
if (op_ret < 0)
goto unwind;
- ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
- &ctx_addr);
- if (ret < 0)
- goto unwind;
-
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-
- /* Mark the flag to indicate the inode has been modified */
- LOCK (&local->u.context.fd->inode->lock);
- {
- if (!__br_stub_is_inode_modified (ctx))
- __br_stub_set_inode_modified (ctx);
+ ret = br_stub_mark_inode_modified (this, local);
+ if (ret) {
+ op_ret = -1;
+ op_errno = EINVAL;
}
- UNLOCK (&local->u.context.fd->inode->lock);
-
unwind:
STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf,
@@ -1243,6 +1315,8 @@ br_stub_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc,
* on an anonymous fd. The fd will be valid till the completion of the
* truncate call. It guarantees that release on this anonymous fd will happen
* after the truncate call and notification is sent after the truncate call.
+ *
+ * c.f. br_writev_cbk() for explanation
*/
int32_t
br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
@@ -1253,10 +1327,11 @@ br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
int32_t op_ret = -1;
int32_t op_errno = EINVAL;
gf_boolean_t inc_version = _gf_false;
+ gf_boolean_t modified = _gf_false;
br_stub_inode_ctx_t *ctx = NULL;
- uint64_t ctx_addr = 0;
int32_t ret = -1;
fd_t *fd = NULL;
+ fop_truncate_cbk_t cbk = default_truncate_cbk;
GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
@@ -1270,63 +1345,50 @@ br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto unwind;
}
- local = br_stub_alloc_local (this);
- if (!local) {
- gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
- "(gfid: %s)", uuid_utoa (loc->inode->gfid));
- op_ret = -1;
- op_errno = ENOMEM;
+ ret = br_stub_need_versioning (this, fd, &inc_version, &modified, &ctx);
+ if (ret)
goto unwind;
- }
-
- local->u.context.fd = fd;
- frame->local = local;
- ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
- "context for the inode %s",
- uuid_utoa (fd->inode->gfid));
- goto unwind;
- }
+ if (!inc_version && modified)
+ goto wind;
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
- ret = br_stub_anon_fd_ctx (this, local->u.context.fd, ctx);
+ ret = br_stub_versioning_prep (frame, this, fd, ctx);
if (ret)
goto unwind;
- /**
- * c.f. br_stub_writev()
- */
- inc_version = br_stub_inc_version (this, fd, ctx);
- if (!inc_version)
+ local = frame->local;
+ if (!inc_version) {
+ br_stub_fill_local (local, NULL, fd, fd->inode,
+ fd->inode->gfid, BR_STUB_NO_VERSIONING, 0);
+ cbk = br_stub_truncate_cbk;
goto wind;
+ }
- /* Create the stub for the truncate fop */
stub = fop_truncate_stub (frame, br_stub_truncate_resume, loc, offset,
xdata);
if (!stub) {
gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
"truncate fop (gfid: %s), unwinding",
uuid_utoa (fd->inode->gfid));
- goto unwind;
+ goto cleanup_local;
}
- /* Perform Versioning */
- return br_stub_perform_incversioning (this, frame, stub,
- local->u.context.fd, ctx);
+ return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
-wind:
- STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this),
+ wind:
+ STACK_WIND (frame, cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
return 0;
-unwind:
+ cleanup_local:
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+
+ unwind:
frame->local = NULL;
STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, NULL, NULL,
NULL);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
+
return 0;
}
@@ -1810,7 +1872,7 @@ __br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx,
switch (fop) {
- case GF_FOP_WRITE:
+ case GF_FOP_FSETXATTR:
sign_info = ctx->info_sign = BR_SIGN_QUICK;
break;
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
index 69e212bb81f..e1e7b383f42 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
@@ -66,7 +66,8 @@ typedef struct br_stub_local {
} u;
} br_stub_local_t;
-#define BR_STUB_INCREMENTAL_VERSIONING (1<<1)
+#define BR_STUB_NO_VERSIONING (1 << 0)
+#define BR_STUB_INCREMENTAL_VERSIONING (1 << 1)
typedef struct br_stub_private {
gf_boolean_t go;