summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorRaghavendra Bhat <raghavendra@redhat.com>2015-04-09 15:38:47 +0530
committerVijay Bellur <vbellur@redhat.com>2015-05-08 11:27:42 -0700
commitc93c433a44770de931f837be179c5ccdba958cad (patch)
treef6c5f513e8b2ccadc3b53d73db609f731d54ba8b /xlators
parent680b3bf629f0fef038470baab62c6d6d8f5988ce (diff)
features/bit-rot-stub: versioning of objects in write/truncate fop instead of open
* This patch brings in the changes where object versioning is done in write and truncate fops instead of tracking them in open and create fops. This model works for both regular and anonymous fds. It also removes the race associated with open calls, create and lookups. This patch follows the below method for object versioning and notifications: Before sending writev on the fd, increase the ongoing version first. This makes anonymous fd write similar to the regular fd write by having the ongoing version increased before doing the write. Do the following steps to do versioning: 1) For anonymous fds set the fd context (so that release is invoked) and add the fd context to the list maintained in the inode context. For regular fds the above thing would have been done in open itself. 2) Increase the on-disk ongoing version 3) Increase the in-memory ongoing version and mark inode as non-dirty 4) Once versioning is successfully done send write operation. If versioning fails, then fail the write fop. 5) In writev_cbk mark inode as modified. Change-Id: I7104391bbe076d8fc49b68745d2ec29a6e92476c BUG: 1207979 Signed-off-by: Raghavendra Bhat <raghavendra@redhat.com> Reviewed-on: http://review.gluster.org/10233 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c138
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h6
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-common.h7
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h3
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.c1070
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.h216
6 files changed, 1101 insertions, 339 deletions
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index 5638b0f348b..61d461f897b 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -171,11 +171,11 @@ bitd_is_bad_file (xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd)
if (fd)
ret = syncop_fgetxattr (child->xl, fd, &xattr,
- BITROT_OBJECT_BAD_KEY, NULL,
+ "trusted.glusterfs.bad-file", NULL,
NULL);
else if (loc)
ret = syncop_getxattr (child->xl, loc, &xattr,
- BITROT_OBJECT_BAD_KEY, NULL,
+ "trusted.glusterfs.bad-file", NULL,
NULL);
if (!ret) {
@@ -484,6 +484,98 @@ br_log_object_path (xlator_t *this, char *op,
op, path, strerror (op_errno));
}
+static void
+br_send_dummy_write (xlator_t *this, fd_t *fd, br_child_t *child,
+ dict_t *xdata)
+{
+ struct iovec iov = {0, };
+ struct iobref *iobref = NULL;
+ struct iobuf *iobuf = NULL;
+ char *msg = NULL;
+ size_t size = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+ msg = gf_strdup ("GLUSTERFS");
+ if (!msg)
+ goto out;
+
+ size = strlen (msg);
+
+ iov.iov_base = msg;
+ iov.iov_len = size;
+
+ iobref = iobref_new ();
+ if (!iobref)
+ goto free_msg;
+
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+ if (!iobuf)
+ goto free_iobref;
+
+ iobref_add (iobref, iobuf);
+
+ iov_unload (iobuf_ptr (iobuf), &iov, 1); /* FIXME!!! */
+
+ iov.iov_base = iobuf_ptr (iobuf);
+ iov.iov_len = size;
+
+ ret = syncop_writev (child->xl, fd, &iov, 1, 0, iobref, 0, xdata, NULL);
+ if (ret <= 0) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "dummy write failed (%s)", strerror (errno));
+ goto free_iobuf;
+ }
+
+ /* iobref_unref() takes care of iobuf unref */
+ ret = 0;
+
+ free_iobuf:
+ iobuf_unref (iobuf);
+ free_iobref:
+ iobref_unref (iobref);
+ free_msg:
+ GF_FREE (msg);
+ out:
+ return;
+}
+
+static void
+br_object_handle_reopen (xlator_t *this,
+ br_object_t *object, inode_t *linked_inode)
+{
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ loc_t loc = {0, };
+
+ /**
+ * Here dict is purposefully not checked for NULL, because at any cost
+ * sending a re-open should not be missed. This re-open is an indication
+ * for the stub to properly mark inode's status.
+ */
+ dict = dict_new ();
+ if (dict) {
+ /* TODO: Make it a #define */
+ ret = dict_set_int32 (dict, "br-fd-reopen", 1);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Object reopen would trigger versioning.");
+ }
+
+ loc.inode = inode_ref (linked_inode);
+ gf_uuid_copy (loc.gfid, linked_inode->gfid);
+
+ br_trigger_sign (this, object->child, linked_inode, &loc, dict);
+
+ if (dict)
+ dict_unref (dict);
+ loc_wipe (&loc);
+}
+
/**
* Sign a given object. This routine runs full throttle. There needs to be
* some form of priority scheduling and/or read burstness to avoid starving
@@ -497,6 +589,7 @@ static inline int32_t br_sign_object (br_object_t *object)
fd_t *fd = NULL;
struct iatt iatt = {0, };
pid_t pid = GF_CLIENT_PID_BITD;
+ br_sign_state_t sign_info = BR_SIGN_NORMAL;
GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
@@ -515,6 +608,20 @@ static inline int32_t br_sign_object (br_object_t *object)
goto out;
}
+ /* sanity check */
+ sign_info = ntohl (object->sign_info);
+ GF_ASSERT (sign_info != BR_SIGN_NORMAL);
+
+ /**
+ * For fd's that have notified for reopening, we send an explicit
+ * open() followed by a dummy write() call. This triggers the
+ * actual signing of the object.
+ */
+ if (sign_info == BR_SIGN_REOPEN_WAIT) {
+ br_object_handle_reopen (this, object, linked_inode);
+ goto unref_inode;
+ }
+
ret = br_object_open (this, object, linked_inode, &fd);
if (!fd) {
br_log_object (this, "open", object->gfid, -ret);
@@ -648,6 +755,7 @@ br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev)
/* NOTE: it's BE, but no worry */
object->signedversion = ev->u.releasebr.version;
+ object->sign_info = ev->u.releasebr.sign_info;
out:
return object;
@@ -693,7 +801,6 @@ br_brick_callback (void *xl, char *brick,
xlator_t *this = NULL;
br_object_t *object = NULL;
br_child_t *child = NULL;
- int32_t flags = 0;
struct gf_tw_timer_list *timer = NULL;
this = xl;
@@ -710,14 +817,6 @@ br_brick_callback (void *xl, char *brick,
gf_log (this->name, GF_LOG_DEBUG,
"RELEASE EVENT [GFID %s]", uuid_utoa (gfid));
- flags = (int32_t)ntohl (ev->u.releasebr.flags);
- if (flags == O_RDONLY) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Read only fd [GFID: %s], ignoring signing..",
- uuid_utoa (gfid));
- goto out;
- }
-
child = br_get_child_from_brick_path (this, brick);
if (!child) {
gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume "
@@ -804,12 +903,15 @@ out:
return need_sign;
}
-static inline void
+void
br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
- loc_t *loc)
+ loc_t *loc, dict_t *xdata)
{
fd_t *fd = NULL;
int32_t ret = -1;
+ pid_t pid = GF_CLIENT_PID_BITD;
+
+ syncopctx_setfspid (&pid);
fd = fd_create (linked_inode, 0);
if (!fd) {
@@ -828,8 +930,10 @@ br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
fd_bind (fd);
}
- if (fd)
+ if (fd) {
+ br_send_dummy_write (this, fd, child, xdata);
syncop_close (fd);
+ }
out:
return;
@@ -972,7 +1076,7 @@ bitd_oneshot_crawl (xlator_t *subvol,
gf_log (this->name, GF_LOG_INFO,
"Triggering signing for %s [GFID: %s | Brick: %s]",
loc.path, uuid_utoa (linked_inode->gfid), child->brick_path);
- br_trigger_sign (this, child, linked_inode, &loc);
+ br_trigger_sign (this, child, linked_inode, &loc, NULL);
ret = 0;
@@ -1600,7 +1704,9 @@ struct xlator_cbks cbks;
struct volume_options options[] = {
{ .key = {"expiry-time"},
.type = GF_OPTION_TYPE_INT,
- .default_value = "120",
+ /* Let the default timer be half the value of the wait time for
+ * signing (which is 120 as of now) */
+ .default_value = "60",
.description = "default time duration for which an object waits "
"before it is signed",
},
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.h b/xlators/features/bit-rot/src/bitd/bit-rot.h
index 66515e3213c..bbaf86fa65f 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.h
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.h
@@ -152,6 +152,8 @@ struct br_object {
be signed */
br_child_t *child; /* object's subvolume */
+ int sign_info;
+
struct list_head list; /* hook to add to the queue once the
object is expired from timer wheel */
void *data;
@@ -175,4 +177,8 @@ br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
gf_boolean_t
bitd_is_bad_file (xlator_t *, br_child_t *, loc_t *, fd_t *);
+void
+br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
+ loc_t *loc, dict_t *xdata);
+
#endif /* __BIT_ROT_H__ */
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-common.h b/xlators/features/bit-rot/src/stub/bit-rot-common.h
index 699323170d3..7fd584e5970 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-common.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-common.h
@@ -33,6 +33,13 @@ typedef enum br_vxattr_state {
BR_VXATTR_STATUS_INVALID = 3,
} br_vxattr_status_t;
+typedef enum br_sign_state {
+ BR_SIGN_INVALID = -1,
+ BR_SIGN_NORMAL = 0,
+ BR_SIGN_REOPEN_WAIT = 1,
+ BR_SIGN_QUICK = 2,
+} br_sign_state_t;
+
static inline br_vxattr_status_t
br_version_xattr_state (dict_t *xattr,
br_version_t **obuf, br_signature_t **sbuf)
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
index 46271407219..9f6da89032f 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h
@@ -28,7 +28,8 @@ enum br_mem_types {
gf_br_mt_br_tbf_opspec_t,
gf_br_mt_br_scrubber_t,
gf_br_mt_br_fsscan_entry_t,
- gf_br_stub_mt_end
+ gf_br_stub_mt_br_stub_fd_t,
+ gf_br_stub_mt_end,
};
#endif
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.c b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
index f9c3886948a..93db072f671 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.c
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.c
@@ -198,14 +198,15 @@ br_stub_init_inode_versions (xlator_t *this, fd_t *fd, inode_t *inode,
if (!ctx)
goto error_return;
+ INIT_LIST_HEAD (&ctx->fd_list);
(markdirty) ? __br_stub_mark_inode_dirty (ctx)
: __br_stub_mark_inode_synced (ctx);
__br_stub_set_ongoing_version (ctx, version);
- __br_stub_reset_release_counters (ctx);
if (fd) {
- br_stub_require_release_call (this, fd);
- __br_stub_track_openfd (fd, ctx);
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret)
+ goto free_ctx;
}
ret = br_stub_set_inode_ctx (this, inode, ctx);
if (ret)
@@ -238,7 +239,6 @@ br_stub_mod_inode_versions (xlator_t *this,
__br_stub_mark_inode_synced (ctx);
}
- __br_stub_track_openfd (fd, ctx);
ret = 0;
}
unblock:
@@ -250,19 +250,16 @@ br_stub_mod_inode_versions (xlator_t *this,
static inline void
br_stub_fill_local (br_stub_local_t *local,
call_stub_t *stub, fd_t *fd, inode_t *inode, uuid_t gfid,
- int versioningtype, unsigned long memversion, int dirty)
+ int versioningtype, unsigned long memversion)
{
local->fopstub = stub;
local->versioningtype = versioningtype;
local->u.context.version = memversion;
- if (fd)
+ if (fd && !local->u.context.fd)
local->u.context.fd = fd_ref (fd);
if (inode)
local->u.context.inode = inode_ref (inode);
gf_uuid_copy (local->u.context.gfid, gfid);
-
- /* mark inode dirty/fresh according to durability */
- local->u.context.markdirty = (dirty) ? _gf_true : _gf_false;
}
static inline void
@@ -279,57 +276,13 @@ br_stub_cleanup_local (br_stub_local_t *local)
inode_unref (local->u.context.inode);
local->u.context.inode = NULL;
}
- local->u.context.markdirty = _gf_true;
memset (local->u.context.gfid, '\0', sizeof (uuid_t));
}
/**
- * callback for inode/fd full versioning
+ * callback for inode/fd versioning
*/
int
-br_stub_inode_fullversioning_cbk (call_frame_t *frame,
- void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- fd_t *fd = NULL;
- inode_t *inode = NULL;
- unsigned long version = 0;
- gf_boolean_t dirty = _gf_true;
- br_stub_local_t *local = NULL;
-
- local = (br_stub_local_t *)frame->local;
-
- /* be graceful to EEXIST */
- if ((op_ret < 0) && (op_errno == EEXIST)) {
- op_ret = 0;
- goto done;
- }
-
- if (op_ret < 0)
- goto done;
-
- fd = local->u.context.fd;
- inode = local->u.context.inode;
- version = local->u.context.version;
- dirty = local->u.context.markdirty;
-
- op_ret = br_stub_init_inode_versions (this, fd, inode, version, dirty);
- if (op_ret < 0)
- op_errno = EINVAL;
-
- done:
- frame->local = NULL;
- if (op_ret < 0)
- call_unwind_error (local->fopstub, op_ret, op_errno);
- else
- call_resume (local->fopstub);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
-
- return 0;
-}
-
-int
br_stub_fd_incversioning_cbk (call_frame_t *frame,
void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
@@ -351,14 +304,14 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame,
op_errno = EINVAL;
done:
- frame->local = NULL;
- if (op_ret < 0)
+ if (op_ret < 0) {
+ frame->local = NULL;
call_unwind_error (local->fopstub, -1, op_errno);
- else
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ } else {
call_resume (local->fopstub);
- br_stub_cleanup_local (local);
- br_stub_dealloc_local (local);
-
+ }
return 0;
}
@@ -366,28 +319,27 @@ br_stub_fd_incversioning_cbk (call_frame_t *frame,
* Initial object versioning
*
* Version persists two (2) extended attributes as explained below:
- * 1. Current (ongoing) version: This is incremented on an open()
- * or creat() and is the running version for an object.
+ * 1. Current (ongoing) version: This is incremented on a writev ()
+ * or truncate () and is the running version for an object.
* 2. Signing version: This is the version against which an object
* was signed (checksummed).
*
* During initial versioning, both ongoing and signing versions are
- * set of one and zero respectively. An open() call increments the
+ * set to one and zero respectively. A write() call increments the
* ongoing version as an indication of modification to the object.
* Additionally this needs to be persisted on disk and needs to be
* durable: fsync().. :-/
- * As an optimization only the first open() synchronizes the ongoing
- * version to disk, subsequent open()s before the *last* release()
+ * As an optimization only the first write() synchronizes the ongoing
+ * version to disk, subsequent write()s before the *last* release()
* are no-op's.
*
* create(), just like lookup() initializes the object versions to
- * the default, but persists the version to disk. As an optimization
- * this is not a durable operation: in case of a crash, hard reboot
- * etc.. absence of versioning xattrs is ignored in scrubber along
- * with the one time crawler explicitly triggering signing for such
- * objects.
+ * the default. As an optimization this is not a durable operation:
+ * in case of a crash, hard reboot etc.. absence of versioning xattrs
+ * is ignored in scrubber along with the one time crawler explicitly
+ * triggering signing for such objects.
*
- * c.f. br_stub_open_cbk() / br_stub_create_cbk()
+ * c.f. br_stub_writev() / br_stub_truncate()
*/
/**
@@ -400,7 +352,7 @@ int
br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
call_stub_t *stub, dict_t *dict, fd_t *fd,
br_stub_version_cbk *callback, unsigned long memversion,
- int versioningtype, int durable, int dirty)
+ int versioningtype, int durable)
{
int32_t ret = -1;
int flags = 0;
@@ -421,18 +373,11 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
goto dealloc_xdata;
}
- local = br_stub_alloc_local (this);
- if (!local) {
- ret = -1;
- goto dealloc_xdata;
- }
-
- if (versioningtype == BR_STUB_FULL_VERSIONING)
- flags |= XATTR_CREATE;
+ local = frame->local;
br_stub_fill_local (local, stub, fd,
fd->inode, fd->inode->gfid,
- versioningtype, memversion, dirty);
+ versioningtype, memversion);
frame->local = local;
STACK_WIND (frame, callback,
@@ -448,82 +393,21 @@ br_stub_fd_versioning (xlator_t *this, call_frame_t *frame,
}
static inline int
-br_stub_perform_fullversioning (xlator_t *this, call_frame_t *frame,
- call_stub_t *stub, fd_t *fd)
-{
- int32_t ret = -1;
- dict_t *dict = NULL;
- br_version_t *obuf = NULL;
- int op_errno = 0;
-
- op_errno = ENOMEM;
- dict = dict_new ();
- if (!dict)
- goto done;
- ret = br_stub_alloc_versions (&obuf, NULL, 0);
- if (ret)
- goto dealloc_dict;
-
- op_errno = EINVAL;
- ret = br_stub_prepare_version_request (this, dict, obuf,
- BITROT_DEFAULT_CURRENT_VERSION);
- if (ret)
- goto dealloc_versions;
-
- /**
- * Version extended attributes need not be durable at this point of
- * time. If the objects (inode) data gets persisted on disk but the
- * version extended attributes are lost due to a crash/power failure,
- * a subsequent lookup marks the objects signature as stale. This way,
- * dentry operation times do not shoot up.
- */
- ret = br_stub_fd_versioning (this, frame, stub, dict, fd,
- br_stub_inode_fullversioning_cbk,
- BITROT_DEFAULT_CURRENT_VERSION,
- BR_STUB_FULL_VERSIONING, !WRITEBACK_DURABLE, 0);
-
- dealloc_versions:
- br_stub_dealloc_versions (obuf);
- dealloc_dict:
- dict_unref (dict);
- done:
- if (ret)
- call_unwind_error (stub, -1, op_errno);
- return ret;
-}
-
-static inline int
br_stub_perform_incversioning (xlator_t *this,
call_frame_t *frame, call_stub_t *stub,
fd_t *fd, br_stub_inode_ctx_t *ctx)
{
- int32_t ret = -1;
- dict_t *dict = NULL;
- inode_t *inode = NULL;
- br_version_t *obuf = NULL;
- unsigned long writeback_version = 0;
- int op_errno = 0;
-
- inode = fd->inode;
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ br_version_t *obuf = NULL;
+ unsigned long writeback_version = 0;
+ int op_errno = 0;
+ br_stub_local_t *local = NULL;
op_errno = EINVAL;
- ret = br_stub_require_release_call (this, fd);
- if (ret)
- goto done;
-
- LOCK (&inode->lock);
- {
- if (__br_stub_is_inode_dirty (ctx))
- writeback_version = __br_stub_writeback_version (ctx);
- else
- __br_stub_track_openfd (fd, ctx);
- }
- UNLOCK (&inode->lock);
+ local = frame->local;
- if (!writeback_version) {
- ret = 0;
- goto done;
- }
+ writeback_version = __br_stub_writeback_version (ctx);
/* inode requires writeback to disk */
op_errno = ENOMEM;
@@ -541,17 +425,23 @@ br_stub_perform_incversioning (xlator_t *this,
ret = br_stub_fd_versioning
(this, frame, stub, dict,
fd, br_stub_fd_incversioning_cbk, writeback_version,
- BR_STUB_INCREMENTAL_VERSIONING, WRITEBACK_DURABLE, 0);
+ BR_STUB_INCREMENTAL_VERSIONING, !WRITEBACK_DURABLE);
dealloc_versions:
br_stub_dealloc_versions (obuf);
dealloc_dict:
dict_unref (dict);
done:
- if (!ret && !writeback_version)
- call_resume (stub);
- if (ret)
+ if (ret) {
+ if (local)
+ frame->local = NULL;
call_unwind_error (stub, -1, op_errno);
+ if (local) {
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ }
+ }
+
return ret;
}
@@ -560,6 +450,44 @@ br_stub_perform_incversioning (xlator_t *this,
/* fsetxattr() */
static inline int
+br_stub_compare_sign_version (xlator_t *this, inode_t *inode,
+ br_signature_t *sbuf, dict_t *dict)
+{
+ int32_t ret = -1;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t tmp_ctx = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO (this->name, sbuf, out);
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+ ret = br_stub_get_inode_ctx (this, inode, &tmp_ctx);
+ if (ret) {
+ dict_del (dict, BITROT_SIGNING_VERSION_KEY);
+ goto out;
+ }
+
+ ret = -1;
+ ctx = (br_stub_inode_ctx_t *)(long)tmp_ctx;
+
+ LOCK (&inode->lock);
+ {
+ if (ctx->currentversion == sbuf->signedversion)
+ ret = 0;
+ else
+ gf_log (this->name, GF_LOG_WARNING, "current version "
+ "%lu and version of the signature %lu are not "
+ "same", ctx->currentversion,
+ sbuf->signedversion);
+ }
+ UNLOCK (&inode->lock);
+
+out:
+ return ret;
+}
+
+static inline int
br_stub_prepare_signature (xlator_t *this, dict_t *dict,
inode_t *inode, br_isignature_t *sign)
{
@@ -577,6 +505,11 @@ br_stub_prepare_signature (xlator_t *this, dict_t *dict,
ret = br_stub_prepare_signing_request (dict, sbuf, sign, signaturelen);
if (ret)
goto dealloc_versions;
+
+ ret = br_stub_compare_sign_version (this, inode, sbuf, dict);
+ if (ret)
+ goto dealloc_versions;
+
return 0;
dealloc_versions:
@@ -620,6 +553,8 @@ br_stub_fsetxattr (call_frame_t *frame, xlator_t *this,
if (ret)
goto unwind;
+ gf_log (this->name, GF_LOG_DEBUG, "SIGNED VERSION: %lu",
+ sign->signedversion);
wind:
STACK_WIND (frame, default_setxattr_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr, fd,
@@ -865,77 +800,598 @@ br_stub_fgetxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-/** }}} */
+/**
+ * The first write response on the first fd in the list of fds will set
+ * the flag to indicate that the inode is modified. The subsequent write
+ * responses coming on either the first fd or some other fd will not change
+ * the flag. The inode-modified flag is unset only upon release of all the
+ * fds.
+ */
+int32_t
+br_stub_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ frame->local = NULL;
+ }
+ if (op_ret < 0)
+ goto unwind;
-/** {{{ */
+ ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+ &ctx_addr);
+ if (ret < 0)
+ goto unwind;
-/* open() */
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-int
-br_stub_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+ /* Mark the flag to indicate the inode has been modified */
+ LOCK (&local->u.context.fd->inode->lock);
+ {
+ if (!__br_stub_is_inode_modified (ctx))
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+/**
+ * Ongoing version is increased only for the first modify operation.
+ * First modify version means the first write or truncate call coming on the
+ * first fd in the list of inodes.
+ * For anonymous fds open would not have come, so check if it is the first write
+ * by doing both inode dirty check and ensuring list of fds is empty
+ */
+static inline gf_boolean_t
+br_stub_inc_version (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- call_stub_t *stub = NULL;
+ gf_boolean_t inc_version = _gf_false;
+
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, ctx, out);
+
+ LOCK (&fd->inode->lock);
+ {
+ if (__br_stub_is_inode_dirty (ctx))
+ inc_version = _gf_true;
+ }
+ UNLOCK (&fd->inode->lock);
+
+out:
+ return inc_version;
+}
+
+/**
+ * Since NFS does not do open, writes from NFS are sent over an anonymous
+ * fd. It means each write fop might come on a different anonymous fd and
+ * will lead to very large number of notifications being sent. It might
+ * affect the performance, as there will be too many sign requests.
+ * To avoid that whenever the last fd released from an inode (logical release)
+ * is an anonymous fd the release notification is sent with a flag being set
+ * __br_stub_anon_release (ctx);
+ * BitD checks for the flag and if set, it will send a dummy write request
+ * (again on an anonymous fd) instead of triggering sign.
+ * Bit-rot-stub should identify such dummy writes and should send success to
+ * them instead of winding them downwards.
+ */
+gf_boolean_t
+br_stub_dummy_write (call_frame_t *frame)
+{
+ return (frame->root->pid == GF_CLIENT_PID_BITD)
+ ? _gf_true : _gf_false;
+}
+
+int32_t
+br_stub_anon_fd_ctx (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ br_stub_fd = br_stub_fd_ctx_get (this, fd);
+ if (!br_stub_fd) {
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "add fd to the inode (gfid: %s)",
+ uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+br_stub_writev_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ if (frame->root->pid == GF_CLIENT_PID_BITD)
+ br_stub_writev_cbk (frame, NULL, this, vector->iov_len, 0,
+ NULL, NULL, NULL);
+ else
+ STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ offset, flags, iobref, xdata);
+ return 0;
+}
+
+/**
+ TODO: If possible add a pictorial representation of the comment below.
+
+ Before sending writev on the ANONYMOUS FD, increase the ongoing
+ version first. This brings anonymous fd write closer to the regular
+ fd write by having the ongoing version increased before doing the
+ write (In regular fd, after open the ongoing version is incremented).
+ Do following steps to handle writes on anonymous fds:
+ 1) Increase the on-disk ongoing version
+ 2) Once versioning is successfully done send write operation. If versioning
+ fails, then fail the write fop.
+ 3) In writev_cbk do below things:
+ a) Increase in-memory version
+ b) set the fd context (so that br_stub_release is invoked)
+ c) add the fd to the list of fds maintained in the inode context of
+ bitrot-stub.
+ d) Mark inode as non dirty
+ e) Mark inode as modified (in the inode context)
+**/
+int32_t
+br_stub_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->u.context.fd = fd_ref (fd);
+ frame->local = local;
+
+ ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ if (fd_is_anonymous (fd)) {
+ ret = br_stub_anon_fd_ctx (this, fd, ctx);
+ if (ret)
+ goto unwind;
+ }
+
+ /* TODO: Better to do a dummy fsetxattr instead of write. Keep write
+ simple */
+ if (br_stub_dummy_write (frame)) {
+ LOCK (&fd->inode->lock);
+ {
+ (void) __br_stub_inode_sign_state
+ (ctx, GF_FOP_WRITE, fd);
+ }
+ UNLOCK (&fd->inode->lock);
+
+ if (xdata && dict_get (xdata, "br-fd-reopen")) {
+ op_ret = vector->iov_len;
+ op_errno = 0;
+ goto unwind;
+ }
+ }
+
+ /**
+ * Check whether this is the first write on this inode since the last
+ * sign notification has been sent. If so, do versioning. Otherwise
+ * go ahead with the fop.
+ */
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
+
+ /* Create the stub for the write fop */
+ stub = fop_writev_stub (frame, br_stub_writev_resume, fd, vector, count,
+ offset, flags, iobref, xdata);
+
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "write fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count, offset,
+ flags, iobref, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_local_t *local = NULL;
+
+ if (frame->local) {
+ local = frame->local;
+ frame->local = NULL;
+ }
if (op_ret < 0)
goto unwind;
- if (cookie != (void *) BR_STUB_REQUEST_COOKIE)
+
+ ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+ &ctx_addr);
+ if (ret < 0)
goto unwind;
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+
+ /* Mark the flag to indicate the inode has been modified */
+ LOCK (&local->u.context.fd->inode->lock);
+ {
+ if (!__br_stub_is_inode_modified (ctx))
+ __br_stub_set_inode_modified (ctx);
+ }
+ UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate_resume (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+}
+
+int32_t
+br_stub_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ local->u.context.fd = fd_ref (fd);
+ frame->local = local;
+
ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
- if (ret < 0)
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ if (fd_is_anonymous (fd)) {
+ ret = br_stub_anon_fd_ctx (this, fd, ctx);
+ if (ret)
+ goto unwind;
+ }
- stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata);
+ /**
+ * c.f. br_stub_writev()
+ */
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
+
+ /* Create the stub for the ftruncate fop */
+ stub = fop_ftruncate_stub (frame, br_stub_ftruncate_resume, fd, offset,
+ xdata);
if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "ftruncate fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ return 0;
+}
+
+/**
+ * Callback for the truncate fop wound from br_stub_truncate().
+ *
+ * On success, the inode reached through the fd stashed in the frame's
+ * local is flagged as modified (under the inode lock). The fop is then
+ * unwound to the parent with the child's result, and the local (if any)
+ * is cleaned up and released.
+ */
+int32_t
+br_stub_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                      struct iatt *postbuf, dict_t *xdata)
+{
+        int32_t              ret      = 0;
+        uint64_t             ctx_addr = 0;
+        br_stub_inode_ctx_t *ctx      = NULL;
+        br_stub_local_t     *local    = NULL;
+
+        if (frame->local) {
+                local = frame->local;
+                frame->local = NULL;
+        }
+
+        /* nothing to mark if the fop failed, or if there is no local
+         * through which the fd (and hence the inode) can be reached */
+        if ((op_ret < 0) || !local)
+                goto unwind;
+
+        ret = br_stub_get_inode_ctx (this, local->u.context.fd->inode,
+                                     &ctx_addr);
+        if (ret < 0)
+                goto unwind;
+
+        ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+
+        /* Mark the flag to indicate the inode has been modified */
+        LOCK (&local->u.context.fd->inode->lock);
+        {
+                if (!__br_stub_is_inode_modified (ctx))
+                        __br_stub_set_inode_modified (ctx);
+        }
+        UNLOCK (&local->u.context.fd->inode->lock);
+
+
+unwind:
+        STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf,
+                             xdata);
+        /* local may legitimately be NULL (see the check at the top) */
+        if (local) {
+                br_stub_cleanup_local (local);
+                br_stub_dealloc_local (local);
+        }
+        return 0;
+}
+
+/**
+ * Resume function of the truncate call stub: winds the truncate fop to
+ * the first child. The reply is handled by br_stub_truncate_cbk(), so
+ * that it is unwound as a truncate (not an ftruncate) reply.
+ */
+int32_t
+br_stub_truncate_resume (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                         off_t offset, dict_t *xdata)
+{
+        STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+        return 0;
+}
+
+/**
+ * Bit-rot-stub depends heavily on the fd based operations for doing
+ * versioning and sending notification. It starts tracking the operation
+ * upon getting first fd based modify operation by doing versioning and
+ * sends notification when last fd using which the inode was modified is
+ * released.
+ * But for truncate there is no fd and hence it becomes difficult to do
+ * the versioning and send notification. It is handled by doing versioning
+ * on an anonymous fd. The fd will be valid till the completion of the
+ * truncate call. It guarantees that release on this anonymous fd will happen
+ * after the truncate call and notification is sent after the truncate call.
+ */
+int32_t
+br_stub_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ br_stub_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ gf_boolean_t inc_version = _gf_false;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+ int32_t ret = -1;
+ fd_t *fd = NULL;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, frame, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind);
+
+ /* truncate carries no fd: versioning is tracked on an anonymous fd */
+ fd = fd_anonymous (loc->inode);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to create anonymous "
+ "fd for the inode %s", uuid_utoa (loc->inode->gfid));
+ goto unwind;
+ }
+
+ local = br_stub_alloc_local (this);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "local allocation failed "
+ "(gfid: %s)", uuid_utoa (loc->inode->gfid));
+ /* the fd has not been handed over to a local yet: drop our
+ * reference here, nothing else will */
+ fd_unref (fd);
op_ret = -1;
- op_errno = EINVAL;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ /* from here on the fd reference is held by the local */
+ local->u.context.fd = fd;
+ frame->local = local;
+
+ ret = br_stub_get_inode_ctx (this, loc->inode, &ctx_addr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the inode %s",
+ uuid_utoa (fd->inode->gfid));
goto unwind;
}
+ ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
+ ret = br_stub_anon_fd_ctx (this, local->u.context.fd, ctx);
+ if (ret)
+ goto unwind;
+
/**
- * Ongoing version needs to be incremented. If the inode is not dirty,
- * things are simple: increment the ongoing version safely and be done.
- * If inode is dirty, a writeback to disk is required. This is tricky in
- * case of multiple open()'s as ongoing version needs to be incremented
- * on a successful writeback. It's probably safe to remember the ongoing
- * version before writeback and *assigning* it in the callback, but that
- * may lead to a trustable checksum to be treated as stale by scrubber
- * (the case where the in-memory ongoing version is lesser than the
- * on-disk version). Therefore, *all* open() calls (which might have
- * come in parallel) try to synchronize the next ongoing version to
- * disk. In the callback path, the winner marks the inode as synced
- * therby loosing open() calls become no-op's.
+ * c.f. br_stub_writev()
*/
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
- return br_stub_perform_incversioning (this, frame, stub, fd, ctx);
+ inc_version = br_stub_inc_version (this, fd, ctx);
+ if (!inc_version)
+ goto wind;
- unwind:
- STACK_UNWIND_STRICT (open, frame,
- op_ret, op_errno, fd, xdata);
+ /* Create the stub for the truncate fop */
+ stub = fop_truncate_stub (frame, br_stub_truncate_resume, loc, offset,
+ xdata);
+ if (!stub) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to allocate stub for "
+ "truncate fop (gfid: %s), unwinding",
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ /* Perform Versioning */
+ return br_stub_perform_incversioning (this, frame, stub,
+ local->u.context.fd, ctx);
+
+wind:
+ STACK_WIND (frame, br_stub_truncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, NULL, NULL,
+ NULL);
+ /* local is still NULL when we bail out before allocating it */
+ if (local) {
+ br_stub_cleanup_local (local);
+ br_stub_dealloc_local (local);
+ }
return 0;
}
+/** }}} */
+
+
+/** {{{ */
+
+/* open() */
+
+/**
+ * It's probably worth mentioning a bit about why some of the housekeeping
+ * work is done in open() call path, rather than the callback path.
+ * Two (or more) open()'s in parallel can race and lead to a situation
+ * where a release() gets triggered (possibly after a series of write()
+ * calls) when *other* open()'s have still not reached callback path
+ * thereby having an active fd on an inode that is in process of getting
+ * signed with the current version.
+ *
+ * Maintaining fd list in the call path ensures that a release() would
+ * not be triggered if an open() call races ahead (followed by a close())
+ * thereby finding non-empty fd list.
+ */
+
int
br_stub_open (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
{
- void *cookie = NULL;
+ int32_t ret = -1;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t ctx_addr = 0;
+
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind);
- if (!flags)
- goto wind;
if (frame->root->pid == GF_CLIENT_PID_SCRUB)
goto wind;
- cookie = (void *) BR_STUB_REQUEST_COOKIE;
- wind:
- STACK_WIND_COOKIE (frame, br_stub_open_cbk, cookie,
- FIRST_CHILD (this), FIRST_CHILD (this)->fops->open,
- loc, flags, fd, xdata);
+ ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the inode "
+ "context for the file %s (gfid: %s)", loc->path,
+ uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+ ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+ if (flags == O_RDONLY)
+ goto wind;
+
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed add fd to the list "
+ "(gfid: %s)", uuid_utoa (fd->inode->gfid));
+ goto unwind;
+ }
+
+wind:
+ STACK_WIND (frame, default_open_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+unwind:
+ STACK_UNWIND_STRICT (open, frame, -1, EINVAL, NULL, NULL);
return 0;
}
@@ -946,39 +1402,60 @@ br_stub_open (call_frame_t *frame, xlator_t *this,
/* creat() */
+/**
+ * This routine registers a release callback for the given fd and adds the
+ * fd to the inode context fd tracking list.
+ *
+ * Returns 0 on success, or the non-zero result of
+ * br_stub_require_release_call() when the fd context could not be set
+ * (in which case the fd is not added to the list).
+ */
+int32_t
+br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx)
+{
+        int32_t       ret        = -1;
+        br_stub_fd_t *br_stub_fd = NULL;
+
+        ret = br_stub_require_release_call (this, fd, &br_stub_fd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to set the fd "
+                        "context for the file (gfid: %s)",
+                        uuid_utoa (fd->inode->gfid));
+                goto out;
+        }
+
+        LOCK (&fd->inode->lock);
+        {
+                /* list_add_tail (new, head): the fd entry is the new
+                 * node, the inode context's fd_list is the list head.
+                 * Passing them the other way round would re-link the
+                 * head and drop the fds already being tracked. */
+                list_add_tail (&br_stub_fd->list, &ctx->fd_list);
+        }
+        UNLOCK (&fd->inode->lock);
+
+        ret = 0;
+
+out:
+        return ret;
+}
+
int
br_stub_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, fd_t *fd, inode_t *inode,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int32_t ret = 0;
- uint64_t ctx_addr = 0;
- call_stub_t *stub = NULL;
- br_stub_inode_ctx_t *ctx = NULL;
+ int32_t ret = 0;
+ uint64_t ctx_addr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ unsigned long version = BITROT_DEFAULT_CURRENT_VERSION;
if (op_ret < 0)
goto unwind;
- stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode,
- stbuf, preparent, postparent, xdata);
- if (!stub) {
- op_ret = -1;
- op_errno = EINVAL;
- goto unwind;
- }
-
ret = br_stub_get_inode_ctx (this, fd->inode, &ctx_addr);
- if (ret < 0)
- ctx_addr = 0;
- ctx = (br_stub_inode_ctx_t *) (long) ctx_addr;
-
- /* see comment in br_stub_open_cbk().. */
- return (ctx)
- ? br_stub_perform_incversioning (this, frame, stub, fd, ctx)
- : br_stub_perform_fullversioning (this, frame, stub, fd);
+ if (ret < 0) {
+ ret = br_stub_init_inode_versions (this, fd, inode, version,
+ _gf_true);
+ } else {
+ ctx = (br_stub_inode_ctx_t *)(long)ctx_addr;
+ ret = br_stub_add_fd_to_inode (this, fd, ctx);
+ }
- unwind:
+unwind:
STACK_UNWIND_STRICT (create, frame, op_ret, op_errno,
fd, inode, stbuf, preparent, postparent, xdata);
return 0;
@@ -989,10 +1466,20 @@ br_stub_create (call_frame_t *frame,
xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
+ GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, loc->inode, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd->inode, unwind);
+
STACK_WIND (frame, br_stub_create_cbk, FIRST_CHILD (this),
FIRST_CHILD (this)->fops->create,
loc, flags, mode, umask, fd, xdata);
return 0;
+unwind:
+ STACK_UNWIND_STRICT (create, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
/** }}} */
@@ -1011,21 +1498,11 @@ br_stub_lookup_version (xlator_t *this,
* out the correct version to use in the inode context (start with
* the default version if unavailable). As of now versions are not
* persisted on-disk. The inode is marked dirty, so that the first
- * operation (such as open(), etc..) would trigger synchronization
- * to disk.
+ * operation (such as write(), etc..) triggers synchronization to
+ * disk.
*/
status = br_version_xattr_state (xattr, &obuf, &sbuf);
- /**
- * stub does not know how to handle presence of signature but not
- * the object version, therefore, in such cases, bail out..
- */
- if (status == BR_VXATTR_STATUS_INVALID) {
- gf_log (this->name, GF_LOG_ERROR, "Invalid versioning xattrs. "
- "Bailing out [GFID: %s]", uuid_utoa (gfid));
- return -1;
- }
-
version = ((status == BR_VXATTR_STATUS_FULL)
|| (status == BR_VXATTR_STATUS_UNSIGNED))
? obuf->ongoingversion : BITROT_DEFAULT_CURRENT_VERSION;
@@ -1259,8 +1736,8 @@ br_stub_noop (call_frame_t *frame, void *cookie, xlator_t *this,
}
static inline void
-br_stub_send_ipc_fop (xlator_t *this,
- fd_t *fd, unsigned long releaseversion, int32_t flags)
+br_stub_send_ipc_fop (xlator_t *this, fd_t *fd, unsigned long releaseversion,
+ int sign_info)
{
int32_t op = 0;
int32_t ret = 0;
@@ -1269,8 +1746,8 @@ br_stub_send_ipc_fop (xlator_t *this,
changelog_event_t ev = {0,};
ev.ev_type = CHANGELOG_OP_TYPE_BR_RELEASE;
- ev.u.releasebr.flags = flags;
ev.u.releasebr.version = releaseversion;
+ ev.u.releasebr.sign_info = sign_info;
gf_uuid_copy (ev.u.releasebr.gfid, fd->inode->gfid);
xdata = dict_new ();
@@ -1305,14 +1782,67 @@ br_stub_send_ipc_fop (xlator_t *this,
return;
}
+/**
+ * Sign-info state machine of an inode.
+ *
+ * States (kept in ctx->info_sign):
+ *   BR_SIGN_NORMAL      => the default state of the inode
+ *   BR_SIGN_REOPEN_WAIT => a release has been sent and a reopen is awaited
+ *   BR_SIGN_QUICK       => reopen has happened; this release should trigger
+ *                          signing
+ *
+ * Events handled (any other fop leaves the state untouched and yields
+ * BR_SIGN_INVALID):
+ *
+ *   GF_FOP_WRITE (actually a dummy write from BitD):
+ *       the state is set to BR_SIGN_QUICK, which is also the value returned.
+ *
+ *   GF_FOP_RELEASE (must not happen while in BR_SIGN_REOPEN_WAIT):
+ *       from BR_SIGN_NORMAL the state moves to BR_SIGN_REOPEN_WAIT and that
+ *       new state is returned; from any other state the state held so far
+ *       is returned and the inode goes back to BR_SIGN_NORMAL.
+ *
+ * The @fd argument is not used by this routine.
+ */
+br_sign_state_t
+__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx,
+                            glusterfs_fop_t fop, fd_t *fd)
+{
+        br_sign_state_t reported = BR_SIGN_INVALID;
+
+        if (fop == GF_FOP_WRITE) {
+                ctx->info_sign = BR_SIGN_QUICK;
+                reported = BR_SIGN_QUICK;
+        } else if (fop == GF_FOP_RELEASE) {
+                GF_ASSERT (ctx->info_sign != BR_SIGN_REOPEN_WAIT);
+
+                if (ctx->info_sign != BR_SIGN_NORMAL) {
+                        /* hand out the state held so far, then reset */
+                        reported = ctx->info_sign;
+                        ctx->info_sign = BR_SIGN_NORMAL;
+                } else {
+                        ctx->info_sign = BR_SIGN_REOPEN_WAIT;
+                        reported = BR_SIGN_REOPEN_WAIT;
+                }
+        }
+
+        return reported;
+}
+
int32_t
br_stub_release (xlator_t *this, fd_t *fd)
{
- int32_t ret = 0;
- int32_t flags = 0;
- inode_t *inode = NULL;
- unsigned long releaseversion = 0;
- br_stub_inode_ctx_t *ctx = NULL;
+ int32_t ret = 0;
+ int32_t flags = 0;
+ inode_t *inode = NULL;
+ unsigned long releaseversion = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ uint64_t tmp = 0;
+ br_stub_fd_t *br_stub_fd = NULL;
+ int32_t signinfo = 0;
inode = fd->inode;
@@ -1321,12 +1851,23 @@ br_stub_release (xlator_t *this, fd_t *fd)
ctx = __br_stub_get_ongoing_version_ctx (this, inode, NULL);
if (ctx == NULL)
goto unblock;
- __br_stub_track_release (ctx);
+ br_stub_fd = br_stub_fd_ctx_get (this, fd);
+ if (br_stub_fd) {
+ list_del_init (&br_stub_fd->list);
+ }
+
ret = __br_stub_can_trigger_release
- (inode, ctx, &releaseversion, &flags);
- if (ret) {
- GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0);
+ (inode, ctx, &releaseversion);
+ if (!ret)
+ goto unblock;
+
+ signinfo = __br_stub_inode_sign_state (ctx, GF_FOP_RELEASE, fd);
+ signinfo = htonl (signinfo);
+
+ /* inode back to initial state: mark dirty */
+ if (ctx->info_sign == BR_SIGN_NORMAL) {
__br_stub_mark_inode_dirty (ctx);
+ __br_stub_unset_inode_modified (ctx);
}
}
unblock:
@@ -1334,10 +1875,17 @@ br_stub_release (xlator_t *this, fd_t *fd)
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
- "releaseversion: %lu|flags: %d", releaseversion, flags);
- br_stub_send_ipc_fop (this, fd, releaseversion, flags);
+ "releaseversion: %lu | flags: %d | signinfo: %d",
+ (unsigned long) ntohl (releaseversion),
+ flags, ntohl(signinfo));
+ br_stub_send_ipc_fop (this, fd, releaseversion, signinfo);
}
+ ret = fd_ctx_del (fd, this, &tmp);
+ br_stub_fd = (br_stub_fd_t *)(long)tmp;
+
+ GF_FREE (br_stub_fd);
+
return 0;
}
@@ -1351,11 +1899,12 @@ void
br_stub_ictxmerge (xlator_t *this, fd_t *fd,
inode_t *inode, inode_t *linked_inode)
{
- int32_t ret = 0;
- uint64_t ctxaddr = 0;
- uint64_t lctxaddr = 0;
- br_stub_inode_ctx_t *ctx = NULL;
- br_stub_inode_ctx_t *lctx = NULL;
+ int32_t ret = 0;
+ uint64_t ctxaddr = 0;
+ uint64_t lctxaddr = 0;
+ br_stub_inode_ctx_t *ctx = NULL;
+ br_stub_inode_ctx_t *lctx = NULL;
+ br_stub_fd_t *br_stub_fd = NULL;
ret = br_stub_get_inode_ctx (this, inode, &ctxaddr);
if (ret < 0)
@@ -1369,29 +1918,15 @@ br_stub_ictxmerge (xlator_t *this, fd_t *fd,
goto unblock;
lctx = (br_stub_inode_ctx_t *) lctxaddr;
- if (__br_stub_is_inode_dirty (lctx)) {
- /**
- * RACY code: An inode can end up in this situation
- * after a lookup() or after a create() followed by
- * a release(). Even if we distinguish b/w the two,
- * there needs to be more infrastructure built up
- * in stub to handle these races. Note, that it's
- * probably OK to ignore the race iff the version
- * was initialized on the very first lookup(), i.e.,
- * [ongoingversion: default].
- *
- * FIXME: fixup races [create(1..n)/lookup(1..n)].
- */
- GF_ASSERT (lctx->currentversion
- == BITROT_DEFAULT_CURRENT_VERSION);
- __br_stub_track_openfd (fd, lctx);
- __br_stub_mark_inode_synced (lctx);
- } else {
- GF_ASSERT (ctx->currentversion <= lctx->currentversion);
- __br_stub_track_openfd (fd, lctx);
+ GF_ASSERT (list_is_singular (&ctx->fd_list));
+ br_stub_fd = list_first_entry (&ctx->fd_list, br_stub_fd_t,
+ list);
+ if (br_stub_fd) {
+ GF_ASSERT (br_stub_fd->fd == fd);
+ list_move_tail (&br_stub_fd->list, &lctx->fd_list);
}
}
- unblock:
+unblock:
UNLOCK (&linked_inode->lock);
done:
@@ -1409,6 +1944,9 @@ struct xlator_fops fops = {
.getxattr = br_stub_getxattr,
.fgetxattr = br_stub_fgetxattr,
.fsetxattr = br_stub_fsetxattr,
+ .writev = br_stub_writev,
+ .truncate = br_stub_truncate,
+ .ftruncate = br_stub_ftruncate,
};
struct xlator_cbks cbks = {
diff --git a/xlators/features/bit-rot/src/stub/bit-rot-stub.h b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
index 86090bfb877..69e212bb81f 100644
--- a/xlators/features/bit-rot/src/stub/bit-rot-stub.h
+++ b/xlators/features/bit-rot/src/stub/bit-rot-stub.h
@@ -21,6 +21,7 @@
#include "xlator.h"
#include "defaults.h"
#include "call-stub.h"
+#include "bit-rot-stub-mem-types.h"
#include "bit-rot-common.h"
@@ -32,17 +33,18 @@ typedef struct br_stub_inode_ctx {
a writeback to disk? */
unsigned long currentversion; /* ongoing version */
- struct release {
- int32_t ordflags;
- unsigned long opencount; /* number of open()s before
- final release() */
- unsigned long releasecount; /* number of release()s */
- } releasectx;
-#define BR_STUB_REQUIRE_RELEASE_CBK 0x0E0EA0E
+ int info_sign;
+ struct list_head fd_list; /* list of open fds or fds participating in
+ write operations */
} br_stub_inode_ctx_t;
+typedef struct br_stub_fd {
+ fd_t *fd;
+ struct list_head list;
+} br_stub_fd_t;
#define I_DIRTY (1<<0) /* inode needs writeback */
+#define I_MODIFIED (1<<1)
#define WRITEBACK_DURABLE 1 /* writeback is durable */
/**
@@ -60,12 +62,10 @@ typedef struct br_stub_local {
uuid_t gfid;
inode_t *inode;
unsigned long version;
- gf_boolean_t markdirty;
} context;
} u;
} br_stub_local_t;
-#define BR_STUB_FULL_VERSIONING (1<<0)
#define BR_STUB_INCREMENTAL_VERSIONING (1<<1)
typedef struct br_stub_private {
@@ -96,16 +96,131 @@ __br_stub_is_inode_dirty (br_stub_inode_ctx_t *ctx)
return (ctx->need_writeback & I_DIRTY);
}
+/* inode modification markers */
+/* Raise the I_MODIFIED bit in the inode context's need_writeback flags. */
+static inline void
+__br_stub_set_inode_modified (br_stub_inode_ctx_t *ctx)
+{
+        ctx->need_writeback = ctx->need_writeback | I_MODIFIED;
+}
+
+/* Clear the I_MODIFIED bit, leaving the other need_writeback flags alone. */
+static inline void
+__br_stub_unset_inode_modified (br_stub_inode_ctx_t *ctx)
+{
+        ctx->need_writeback = ctx->need_writeback & ~I_MODIFIED;
+}
+
+/*
+ * Test the I_MODIFIED bit. The result is the masked flag value itself
+ * (non-zero when set, 0 otherwise), not a normalized 0/1.
+ */
+static inline int
+__br_stub_is_inode_modified (br_stub_inode_ctx_t *ctx)
+{
+        int modified_bit = (ctx->need_writeback & I_MODIFIED);
+
+        return modified_bit;
+}
+
+/*
+ * Allocate a zero-filled br_stub_fd_t (accounted under
+ * gf_br_stub_mt_br_stub_fd_t). Returns whatever GF_CALLOC returns, i.e.
+ * NULL when the allocation fails.
+ */
+br_stub_fd_t *
+br_stub_fd_new (void)
+{
+        return GF_CALLOC (1, sizeof (br_stub_fd_t),
+                          gf_br_stub_mt_br_stub_fd_t);
+}
+
+/*
+ * Store @br_stub_fd as this xlator's context on @fd through
+ * __fd_ctx_set(); the double-underscore prefix follows the file's
+ * convention for variants that do no locking of their own.
+ *
+ * Returns the result of __fd_ctx_set(), or -1 when any argument is NULL.
+ */
+int
+__br_stub_fd_ctx_set (xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd)
+{
+        int status = -1;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+        GF_VALIDATE_OR_GOTO (this->name, br_stub_fd, out);
+
+        /* the context slot is a uint64_t: stash the pointer in it */
+        status = __fd_ctx_set (fd, this, (uint64_t)(long) br_stub_fd);
+
+out:
+        return status;
+}
+
+/*
+ * Fetch this xlator's context from @fd through __fd_ctx_get() and hand
+ * it back as a br_stub_fd_t pointer.
+ *
+ * Returns NULL when an argument is NULL or when __fd_ctx_get() reports
+ * a failure.
+ */
+br_stub_fd_t *
+__br_stub_fd_ctx_get (xlator_t *this, fd_t *fd)
+{
+        uint64_t      raw   = 0;
+        br_stub_fd_t *found = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        if (__fd_ctx_get (fd, this, &raw) == 0)
+                found = (br_stub_fd_t *) ((long) raw);
+
+out:
+        return found;
+}
+
+/*
+ * Variant of __br_stub_fd_ctx_get() that takes fd->lock around the
+ * lookup. Returns NULL when an argument is NULL or nothing is found.
+ */
+br_stub_fd_t *
+br_stub_fd_ctx_get (xlator_t *this, fd_t *fd)
+{
+        br_stub_fd_t *found = NULL;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+
+        LOCK (&fd->lock);
+        found = __br_stub_fd_ctx_get (this, fd);
+        UNLOCK (&fd->lock);
+
+out:
+        return found;
+}
+
+/*
+ * Variant of __br_stub_fd_ctx_set() that takes fd->lock around the
+ * update. Returns -1 when any argument is NULL, otherwise the result of
+ * __br_stub_fd_ctx_set().
+ */
+int32_t
+br_stub_fd_ctx_set (xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd)
+{
+        int32_t status = -1;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot-stub", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+        GF_VALIDATE_OR_GOTO (this->name, br_stub_fd, out);
+
+        LOCK (&fd->lock);
+        status = __br_stub_fd_ctx_set (this, fd, br_stub_fd);
+        UNLOCK (&fd->lock);
+
+out:
+        return status;
+}
+
+/**
+ * Allocate a br_stub_fd_t for @fd and store it as this xlator's fd
+ * context. On success the new context is handed back through @fd_ctx.
+ *
+ * Returns 0 on success, -1 when the allocation fails, or the non-zero
+ * result of br_stub_fd_ctx_set() when the context could not be set (the
+ * allocation is freed in that case and @fd_ctx is left untouched).
+ */
static inline int
-br_stub_require_release_call (xlator_t *this, fd_t *fd)
+br_stub_require_release_call (xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx)
{
int32_t ret = 0;
+ br_stub_fd_t *br_stub_fd = NULL;
+
+ br_stub_fd = br_stub_fd_new ();
+ if (!br_stub_fd)
+ return -1;
+
+ br_stub_fd->fd = fd;
+ INIT_LIST_HEAD (&br_stub_fd->list);
- ret = fd_ctx_set (fd, this,
- (uint64_t)(long)BR_STUB_REQUIRE_RELEASE_CBK);
+ ret = br_stub_fd_ctx_set (this, fd, br_stub_fd);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
"could not set fd context (for release callback");
+
+ if (ret) {
+ /* the fd never took hold of the context: free it here, as
+ * nobody else has a pointer to it */
+ GF_FREE (br_stub_fd);
+ } else {
+ *fd_ctx = br_stub_fd;
+ }
+
return ret;
}
@@ -122,7 +237,15 @@ static inline int
br_stub_get_inode_ctx (xlator_t *this,
inode_t *inode, uint64_t *ctx)
{
- return inode_ctx_get (inode, this, ctx);
+ int ret = -1;
+
+ LOCK (&inode->lock);
+ {
+ ret = __br_stub_get_inode_ctx (this, inode, ctx);
+ }
+ UNLOCK (&inode->lock);
+
+ return ret;
}
static inline int
@@ -144,55 +267,31 @@ __br_stub_writeback_version (br_stub_inode_ctx_t *ctx)
+/**
+ * Set the in-memory ongoing version of the inode context. The version
+ * only ever moves forward: a @version that is not greater than the one
+ * currently held is ignored and a warning with both values is logged.
+ */
static inline void
__br_stub_set_ongoing_version (br_stub_inode_ctx_t *ctx, unsigned long version)
{
- ctx->currentversion = version;
-}
-
-static inline void
-__br_stub_reset_release_counters (br_stub_inode_ctx_t *ctx)
-{
- ctx->releasectx.ordflags = 0;
- ctx->releasectx.opencount = 0;
- ctx->releasectx.releasecount = 0;
-}
-
-static inline void
-__br_stub_track_release (br_stub_inode_ctx_t *ctx)
-{
- ++ctx->releasectx.releasecount;
-}
-
-static inline void
-___br_stub_track_open (br_stub_inode_ctx_t *ctx)
-{
- ++ctx->releasectx.opencount;
-}
-
-static inline void
-___br_stub_track_open_flags (fd_t *fd, br_stub_inode_ctx_t *ctx)
-{
- ctx->releasectx.ordflags |= fd->flags;
-}
-
-static inline void
-__br_stub_track_openfd (fd_t *fd, br_stub_inode_ctx_t *ctx)
-{
- ___br_stub_track_open (ctx);
- ___br_stub_track_open_flags (fd, ctx);
+ if (ctx->currentversion < version)
+ ctx->currentversion = version;
+ else
+ gf_log ("bit-rot-stub", GF_LOG_WARNING, "current version: %lu, "
+ "new version: %lu", ctx->currentversion, version);
}
static inline int
__br_stub_can_trigger_release (inode_t *inode,
- br_stub_inode_ctx_t *ctx,
- unsigned long *version, int32_t *flags)
+ br_stub_inode_ctx_t *ctx, unsigned long *version)
{
- if (list_empty (&inode->fd_list)
- && (ctx->releasectx.releasecount == ctx->releasectx.opencount)) {
- if (flags)
- *flags = htonl (ctx->releasectx.ordflags);
+ /**
+ * A modified inode must already be synced (i.e. not dirty) by the
+ * time we get here: the inode is marked as non-dirty once the
+ * ongoing version has been increased, and it is marked as modified
+ * only when the modification call (write/truncate) which triggered
+ * the versioning is successful.
+ */
+ if (__br_stub_is_inode_modified (ctx)
+ && list_empty (&ctx->fd_list)
+ && (ctx->info_sign != BR_SIGN_REOPEN_WAIT)) {
+
+ GF_ASSERT (__br_stub_is_inode_dirty (ctx) == 0);
+
if (version)
*version = htonl (ctx->currentversion);
-
- __br_stub_reset_release_counters (ctx);
return 1;
}
@@ -261,11 +360,16 @@ static inline void
br_stub_remove_vxattrs (dict_t *xattr)
{
if (xattr) {
- dict_del (xattr, BITROT_OBJECT_BAD_KEY);
dict_del (xattr, BITROT_CURRENT_VERSION_KEY);
dict_del (xattr, BITROT_SIGNING_VERSION_KEY);
dict_del (xattr, BITROT_SIGNING_XATTR_SIZE_KEY);
}
}
+int32_t
+br_stub_add_fd_to_inode (xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx);
+
+br_sign_state_t
+__br_stub_inode_sign_state (br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop,
+ fd_t *fd);
#endif /* __BIT_ROT_STUB_H__ */