summaryrefslogtreecommitdiffstats
path: root/xlators/features/bit-rot/src/bitd/bit-rot.c
diff options
context:
space:
mode:
authorRaghavendra Bhat <raghavendra@redhat.com>2015-04-09 15:38:47 +0530
committerNiels de Vos <ndevos@redhat.com>2015-05-10 08:14:33 -0700
commitda48a6a596251c19a8ddb1bdfec3da9744a78b8f (patch)
treec2835e85440ca2c17f55641d2312a9dd345dad6a /xlators/features/bit-rot/src/bitd/bit-rot.c
parentd1d54d027fc616ccae5c329d5b5f02ee9aab1549 (diff)
features/bit-rot-stub: versioning of objects in write/truncate fop instead of open
* This patch brings in the changes where object versioning is done in write and truncate fops instead of tracking them in open and create fops. This model works for both regular and anonymous fds. It also removes the race associated with open calls, create and lookups. This patch follows the below method for object versioning and notifications: Before sending writev on the fd, increase the ongoing version first. This makes anonymous fd write similar to the regular fd write by having the ongoing version increased before doing the write. Do the following steps to do versioning: 1) For anonymous fds set the fd context (so that release is invoked) and add the fd context to the list maintained in the inode context. For regular fds the above thing would have been done in open itself. 2) Increase the on-disk ongoing version 3) Increase the in-memory ongoing version and mark inode as non-dirty 4) Once versioning is successfully done send write operation. If versioning fails, then fail the write fop. 5) In writev_cbk mark inode as modified. > Change-Id: I7104391bbe076d8fc49b68745d2ec29a6e92476c > BUG: 1207979 > Signed-off-by: Raghavendra Bhat <raghavendra@redhat.com> > Reviewed-on: http://review.gluster.org/10233 > Tested-by: Gluster Build System <jenkins@build.gluster.com> > Reviewed-by: Vijay Bellur <vbellur@redhat.com> Change-Id: I4bb86989b5fab02b9ed2950798b1a80e566f1024 BUG: 1220041 Signed-off-by: Raghavendra Bhat <raghavendra@redhat.com> Reviewed-on: http://review.gluster.org/10722 Reviewed-by: Gaurav Kumar Garg <ggarg@redhat.com> Tested-by: NetBSD Build System Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/features/bit-rot/src/bitd/bit-rot.c')
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c138
1 files changed, 122 insertions, 16 deletions
diff --git a/xlators/features/bit-rot/src/bitd/bit-rot.c b/xlators/features/bit-rot/src/bitd/bit-rot.c
index 0bb2f2ab8e6..b9adbd6647c 100644
--- a/xlators/features/bit-rot/src/bitd/bit-rot.c
+++ b/xlators/features/bit-rot/src/bitd/bit-rot.c
@@ -171,11 +171,11 @@ bitd_is_bad_file (xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd)
if (fd)
ret = syncop_fgetxattr (child->xl, fd, &xattr,
- BITROT_OBJECT_BAD_KEY, NULL,
+ "trusted.glusterfs.bad-file", NULL,
NULL);
else if (loc)
ret = syncop_getxattr (child->xl, loc, &xattr,
- BITROT_OBJECT_BAD_KEY, NULL,
+ "trusted.glusterfs.bad-file", NULL,
NULL);
if (!ret) {
@@ -484,6 +484,98 @@ br_log_object_path (xlator_t *this, char *op,
op, path, strerror (op_errno));
}
+/**
+ * Send a small dummy write on @fd to the brick subvolume of @child.
+ *
+ * The payload is the fixed string "GLUSTERFS". It is copied into an iobuf
+ * taken from this->ctx->iobuf_pool and handed to syncop_writev() together
+ * with @xdata. The routine is best-effort: it returns nothing, and a failed
+ * or empty write is only logged.
+ *
+ * @this:  xlator whose name is used for logging and whose ctx supplies the
+ *         iobuf pool
+ * @fd:    fd the write is issued on
+ * @child: brick descriptor; the write goes to child->xl
+ * @xdata: dictionary forwarded to the write; callers in this file may
+ *         pass NULL
+ */
+static void
+br_send_dummy_write (xlator_t *this, fd_t *fd, br_child_t *child,
+                     dict_t *xdata)
+{
+        struct iovec   iov    = {0, };
+        struct iobref *iobref = NULL;
+        struct iobuf  *iobuf  = NULL;
+        char          *msg    = NULL;
+        size_t         size   = 0;
+        int32_t        ret    = -1;
+
+        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
+        GF_VALIDATE_OR_GOTO (this->name, fd, out);
+        GF_VALIDATE_OR_GOTO (this->name, child, out);
+
+        msg = gf_strdup ("GLUSTERFS");
+        if (!msg)
+                goto out;
+
+        size = strlen (msg);
+
+        /* describe the heap copy of the payload so it can be unloaded */
+        iov.iov_base = msg;
+        iov.iov_len  = size;
+
+        iobref = iobref_new ();
+        if (!iobref)
+                goto free_msg;
+
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf)
+                goto free_iobref;
+
+        iobref_add (iobref, iobuf);
+
+        /*
+         * Copy the payload into the iobuf. NOTE(review): the original author
+         * tagged this call with FIXME without recording why -- confirm.
+         */
+        iov_unload (iobuf_ptr (iobuf), &iov, 1); /* FIXME!!! */
+
+        /* from here on the vector points at the iobuf, not at msg */
+        iov.iov_base = iobuf_ptr (iobuf);
+        iov.iov_len  = size;
+
+        ret = syncop_writev (child->xl, fd, &iov, 1, 0, iobref, 0, xdata, NULL);
+        if (ret < 0) {
+                /*
+                 * The syncop call reports failure through a negative return
+                 * value (this file logs -ret for the open failure too), so
+                 * use that instead of the thread's errno, which is not
+                 * guaranteed to describe this failure.
+                 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "dummy write failed (%s)", strerror (-ret));
+        } else if (ret == 0) {
+                /* nothing was written and there is no error code to report */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "dummy write failed (no data written)");
+        }
+
+        /*
+         * Success and failure share the same cleanup: drop the local iobuf
+         * reference, then the iobref, then the heap copy of the payload.
+         * NOTE(review): presumably iobref_add() holds its own reference on
+         * the iobuf, which iobref_unref() releases -- confirm.
+         */
+        iobuf_unref (iobuf);
+ free_iobref:
+        iobref_unref (iobref);
+ free_msg:
+        GF_FREE (msg);
+ out:
+        return;
+}
+
+
+/**
+ * Handle an object whose fd notified for a reopen.
+ *
+ * Builds a loc from @linked_inode (inode reference plus gfid) and calls
+ * br_trigger_sign() for object->child, passing along a dictionary that
+ * carries the "br-fd-reopen" key when one could be created.
+ */
+static void
+br_object_handle_reopen (xlator_t *this,
+                         br_object_t *object, inode_t *linked_inode)
+{
+        loc_t   reopen_loc = {0, };
+        dict_t *xdata      = NULL;
+
+        /**
+         * A failed allocation is deliberately tolerated here: the re-open
+         * must be sent in any case, since it is what tells the stub to
+         * properly mark the inode's status.
+         */
+        xdata = dict_new ();
+
+        /* TODO: Make it a #define */
+        if (xdata && dict_set_int32 (xdata, "br-fd-reopen", 1))
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Object reopen would trigger versioning.");
+
+        gf_uuid_copy (reopen_loc.gfid, linked_inode->gfid);
+        reopen_loc.inode = inode_ref (linked_inode);
+
+        br_trigger_sign (this, object->child, linked_inode, &reopen_loc,
+                         xdata);
+
+        if (xdata)
+                dict_unref (xdata);
+        loc_wipe (&reopen_loc);
+}
+
/**
* Sign a given object. This routine runs full throttle. There needs to be
* some form of priority scheduling and/or read burstness to avoid starving
@@ -497,6 +589,7 @@ static inline int32_t br_sign_object (br_object_t *object)
fd_t *fd = NULL;
struct iatt iatt = {0, };
pid_t pid = GF_CLIENT_PID_BITD;
+ br_sign_state_t sign_info = BR_SIGN_NORMAL;
GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
@@ -515,6 +608,20 @@ static inline int32_t br_sign_object (br_object_t *object)
goto out;
}
+ /* sanity check */
+ sign_info = ntohl (object->sign_info);
+ GF_ASSERT (sign_info != BR_SIGN_NORMAL);
+
+ /**
+ * For fd's that have notified for reopening, we send an explicit
+ * open() followed by a dummy write() call. This triggers the
+ * actual signing of the object.
+ */
+ if (sign_info == BR_SIGN_REOPEN_WAIT) {
+ br_object_handle_reopen (this, object, linked_inode);
+ goto unref_inode;
+ }
+
ret = br_object_open (this, object, linked_inode, &fd);
if (!fd) {
br_log_object (this, "open", object->gfid, -ret);
@@ -648,6 +755,7 @@ br_initialize_object (xlator_t *this, br_child_t *child, changelog_event_t *ev)
/* NOTE: it's BE, but no worry */
object->signedversion = ev->u.releasebr.version;
+ object->sign_info = ev->u.releasebr.sign_info;
out:
return object;
@@ -693,7 +801,6 @@ br_brick_callback (void *xl, char *brick,
xlator_t *this = NULL;
br_object_t *object = NULL;
br_child_t *child = NULL;
- int32_t flags = 0;
struct gf_tw_timer_list *timer = NULL;
this = xl;
@@ -710,14 +817,6 @@ br_brick_callback (void *xl, char *brick,
gf_log (this->name, GF_LOG_DEBUG,
"RELEASE EVENT [GFID %s]", uuid_utoa (gfid));
- flags = (int32_t)ntohl (ev->u.releasebr.flags);
- if (flags == O_RDONLY) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Read only fd [GFID: %s], ignoring signing..",
- uuid_utoa (gfid));
- goto out;
- }
-
child = br_get_child_from_brick_path (this, brick);
if (!child) {
gf_log (this->name, GF_LOG_ERROR, "failed to get the subvolume "
@@ -804,12 +903,15 @@ out:
return need_sign;
}
-static inline void
+void
br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
- loc_t *loc)
+ loc_t *loc, dict_t *xdata)
{
fd_t *fd = NULL;
int32_t ret = -1;
+ pid_t pid = GF_CLIENT_PID_BITD;
+
+ syncopctx_setfspid (&pid);
fd = fd_create (linked_inode, 0);
if (!fd) {
@@ -828,8 +930,10 @@ br_trigger_sign (xlator_t *this, br_child_t *child, inode_t *linked_inode,
fd_bind (fd);
}
- if (fd)
+ if (fd) {
+ br_send_dummy_write (this, fd, child, xdata);
syncop_close (fd);
+ }
out:
return;
@@ -972,7 +1076,7 @@ bitd_oneshot_crawl (xlator_t *subvol,
gf_log (this->name, GF_LOG_INFO,
"Triggering signing for %s [GFID: %s | Brick: %s]",
loc.path, uuid_utoa (linked_inode->gfid), child->brick_path);
- br_trigger_sign (this, child, linked_inode, &loc);
+ br_trigger_sign (this, child, linked_inode, &loc, NULL);
ret = 0;
@@ -1600,7 +1704,9 @@ struct xlator_cbks cbks;
struct volume_options options[] = {
{ .key = {"expiry-time"},
.type = GF_OPTION_TYPE_INT,
- .default_value = "120",
+ /* Let the default timer be half the value of the wait time for
+ * signing (which is 120 as of now) */
+ .default_value = "60",
.description = "default time duration for which an object waits "
"before it is signed",
},