diff options
author | Mohammed Junaid <junaid@redhat.com> | 2012-02-08 18:06:39 +0530 |
---|---|---|
committer | Vijay Bellur <vijay@gluster.com> | 2012-02-20 04:45:31 -0800 |
commit | f764516c2e526624ce0088963924ff2d88304553 (patch) | |
tree | 85262797baad440b12853a3a6ad41ab518d9f996 | |
parent | 4d1b040f00e7ec8de997d151b35fa035bba9cb25 (diff) |
protocol/client,server: fcntl lock self healing.
Currently (without this patch), on a disconnect the server cleans up
the transport, which in turn closes the fd's and releases the locks acquired on
those fd's by that client. On a reconnect, the client just reopens the fd's but
doesn't reacquire the locks. The application that had previously acquired
the locks is still under the assumption that it is the owner of those locks,
which the server might have granted to other clients (if they requested them),
leading to data corruption.
This patch allows the client to reacquire the fcntl locks (held on the fd's)
during client-server handshake.
* The server identifies the client via process-uuid-xl (a combination
of the uuid and the client-protocol name, assumed to be unique) and the
lk-version number.
* The server maintains a list of (process-uuid-xl, lk-version) pairs, one for
each accepted connection. On a connect, the server traverses the list for a
matching pair; if a matching pair is not found, the server returns
lk-version with value 0, else it returns the lk-version it has in store.
* On a disconnect, the server and client enter a grace period, and on the
completion of the grace period, the client bumps up its lk-version number
(which means it will reacquire the locks the next time) and the server will
destroy the connection. If reconnection happens within the grace period, the
server will find the matching (process-uuid-xl, lk-version) pair in its list,
which guarantees that the fd's and their corresponding locks are still valid
for this client.
Configurable options:
To set the grace timeout, use the following options:
option server.grace-timeout value
option client.grace-timeout value
To enable or disable lk-heal, use:
option lk-heal [on|off]
The gluster volume set command can be used to configure these options.
Change-Id: Id677ef1087b300d649f278b8b2aa0d94eae85ed2
BUG: 795386
Signed-off-by: Mohammed Junaid <junaid@redhat.com>
Reviewed-on: http://review.gluster.com/2766
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vijay@gluster.com>
25 files changed, 1484 insertions, 56 deletions
diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am index 34543f62215..bbe7a2cd739 100644 --- a/libglusterfs/src/Makefile.am +++ b/libglusterfs/src/Makefile.am @@ -23,7 +23,7 @@ libglusterfs_la_SOURCES = dict.c xlator.c logging.c \ $(CONTRIBDIR)/uuid/parse.c $(CONTRIBDIR)/uuid/unparse.c \ $(CONTRIBDIR)/uuid/uuid_time.c $(CONTRIBDIR)/uuid/compare.c \ $(CONTRIBDIR)/uuid/isnull.c $(CONTRIBDIR)/uuid/unpack.c syncop.c \ - graph-print.c trie.c run.c options.c + graph-print.c trie.c run.c options.c fd-lk.c nodist_libglusterfs_la_SOURCES = y.tab.c graph.lex.c @@ -37,7 +37,7 @@ noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h \ rbthash.h iatt.h latency.h mem-types.h $(CONTRIBDIR)/uuid/uuidd.h \ $(CONTRIBDIR)/uuid/uuid.h $(CONTRIBDIR)/uuid/uuidP.h \ $(CONTRIB_BUILDDIR)/uuid/uuid_types.h syncop.h graph-utils.h trie.h run.h \ - options.h lkowner.h + options.h lkowner.h fd-lk.h EXTRA_DIST = graph.l graph.y diff --git a/libglusterfs/src/fd-lk.c b/libglusterfs/src/fd-lk.c new file mode 100644 index 00000000000..8df43bb602f --- /dev/null +++ b/libglusterfs/src/fd-lk.c @@ -0,0 +1,458 @@ +/* + Copyright (c) 2011-2012 Gluster, Inc. <http://www.gluster.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include "fd-lk.h" +#include "common-utils.h" + + +int32_t +_fd_lk_delete_lock (fd_lk_ctx_node_t *lock) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("fd-lk", lock, out); + + list_del_init (&lock->next); + + ret = 0; +out: + return ret; +} + +int32_t +_fd_lk_destroy_lock (fd_lk_ctx_node_t *lock) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("fd-lk", lock, out); + + GF_FREE (lock); + + ret = 0; +out: + return ret; +} + +int +_fd_lk_destroy_lock_list (fd_lk_ctx_t *lk_ctx) +{ + int ret = -1; + fd_lk_ctx_node_t *lk = NULL; + fd_lk_ctx_node_t *tmp = NULL; + + GF_VALIDATE_OR_GOTO ("fd-lk", lk_ctx, out); + + list_for_each_entry_safe (lk, tmp, &lk_ctx->lk_list, next) { + _fd_lk_delete_lock (lk); + _fd_lk_destroy_lock (lk); + } + ret = 0; +out: + return ret; +} + +int +fd_lk_ctx_unref (fd_lk_ctx_t *lk_ctx) +{ + int ref = -1; + + GF_VALIDATE_OR_GOTO ("fd-lk", lk_ctx, err); + + LOCK (&lk_ctx->lock); + { + ref = --lk_ctx->ref; + if (ref < 0) + GF_ASSERT (!ref); + if (ref == 0) + _fd_lk_destroy_lock_list (lk_ctx); + } + UNLOCK (&lk_ctx->lock); + + if (ref == 0) { + LOCK_DESTROY (&lk_ctx->lock); + GF_FREE (lk_ctx); + } + + return 0; +err: + return -1; +} + +fd_lk_ctx_t * +_fd_lk_ctx_ref (fd_lk_ctx_t *lk_ctx) +{ + if (!lk_ctx) { + gf_log_callingfn ("fd", GF_LOG_WARNING, + "invalid argument"); + return NULL; + } + + ++lk_ctx->ref; + + return lk_ctx; +} + +fd_lk_ctx_t * +fd_lk_ctx_ref (fd_lk_ctx_t *lk_ctx) +{ + fd_lk_ctx_t *new_lk_ctx = NULL; + + if (!lk_ctx) { + gf_log_callingfn ("fd", GF_LOG_WARNING, + "invalid argument"); + return NULL; + } + + LOCK (&lk_ctx->lock); + { + new_lk_ctx = _fd_lk_ctx_ref (lk_ctx); + } + UNLOCK (&lk_ctx->lock); + + return new_lk_ctx; +} + +fd_lk_ctx_t * +fd_lk_ctx_create () +{ + fd_lk_ctx_t *fd_lk_ctx = NULL; + + fd_lk_ctx = GF_CALLOC (1, sizeof (fd_lk_ctx_t), + gf_common_mt_fd_lk_ctx_t); + if (!fd_lk_ctx) + goto out; + + INIT_LIST_HEAD (&fd_lk_ctx->lk_list); + + LOCK_INIT (&fd_lk_ctx->lock); + + fd_lk_ctx = fd_lk_ctx_ref (fd_lk_ctx); +out: 
+ return fd_lk_ctx; +} + +int +_fd_lk_insert_lock (fd_lk_ctx_t *lk_ctx, + fd_lk_ctx_node_t *lock) +{ + list_add_tail (&lock->next, &lk_ctx->lk_list); + return 0; +} + +static off_t +_fd_lk_get_lock_len (off_t start, off_t end) +{ + if (end == LLONG_MAX) + return 0; + else + return (end - start + 1); +} + +fd_lk_ctx_node_t * +fd_lk_ctx_node_new (int32_t cmd, struct gf_flock *flock) +{ + fd_lk_ctx_node_t *new_lock = NULL; + + /* TODO: get from mem-pool */ + new_lock = GF_CALLOC (1, sizeof (fd_lk_ctx_node_t), + gf_common_mt_fd_lk_ctx_node_t); + if (!new_lock) + goto out; + + new_lock->cmd = cmd; + + if (flock) { + new_lock->fl_type = flock->l_type; + new_lock->fl_start = flock->l_start; + + if (flock->l_len == 0) + new_lock->fl_end = LLONG_MAX; + else + new_lock->fl_end = flock->l_start + flock->l_len - 1; + + memcpy (&new_lock->user_flock, flock, + sizeof (struct gf_flock)); + } + + INIT_LIST_HEAD (&new_lock->next); +out: + return new_lock; +} + +int32_t +_fd_lk_delete_unlck_locks (fd_lk_ctx_t *lk_ctx) +{ + int32_t ret = -1; + fd_lk_ctx_node_t *tmp = NULL; + fd_lk_ctx_node_t *lk = NULL; + + GF_VALIDATE_OR_GOTO ("fd-lk", lk_ctx, out); + + list_for_each_entry_safe (lk, tmp, &lk_ctx->lk_list, next) { + if (lk->fl_type == F_UNLCK) { + _fd_lk_delete_lock (lk); + _fd_lk_destroy_lock (lk); + } + } +out: + return ret; +} + +int +fd_lk_overlap (fd_lk_ctx_node_t *l1, + fd_lk_ctx_node_t *l2) +{ + if (l1->fl_end >= l2->fl_start && + l2->fl_end >= l1->fl_start) + return 1; + + return 0; +} + +fd_lk_ctx_node_t * +_fd_lk_add_locks (fd_lk_ctx_node_t *l1, + fd_lk_ctx_node_t *l2) +{ + fd_lk_ctx_node_t *sum = NULL; + + sum = fd_lk_ctx_node_new (0, NULL); + if (!sum) + goto out; + + sum->fl_start = min (l1->fl_start, l2->fl_start); + sum->fl_end = max (l1->fl_end, l2->fl_end); + + sum->user_flock.l_start = sum->fl_start; + sum->user_flock.l_len = _fd_lk_get_lock_len (sum->fl_start, + sum->fl_end); +out: + return sum; +} + +/* Subtract two locks */ +struct _values { + fd_lk_ctx_node_t 
*locks[3];
+};
+
+/*
+ * Carve 'small' out of 'big' and store the resulting pieces in v->locks[].
+ *
+ * 'small' is expected to lie entirely inside 'big'. Depending on how the
+ * edges line up, one, two or three freshly allocated nodes are produced;
+ * unused slots of v->locks[] are left untouched. The pieces are memcpy()
+ * copies of 'big'/'small', so their 'next' linkage is a copy too and must
+ * be re-initialised by the caller before insertion.
+ *
+ * Returns 0 on success and -1 if a node could not be allocated. On failure
+ * the nodes allocated by this call are freed and their slots reset to NULL.
+ */
+int32_t
+_fd_lk_sub_locks (struct _values *v,
+                  fd_lk_ctx_node_t *big,
+                  fd_lk_ctx_node_t *small)
+{
+        int32_t ret = -1;
+        int32_t i   = 0;
+
+        if ((big->fl_start == small->fl_start) &&
+            (big->fl_end == small->fl_end)) {
+                /* both edges coincide with big */
+                v->locks[0] = fd_lk_ctx_node_new (small->cmd, NULL);
+                if (!v->locks[0])
+                        goto out;
+
+                memcpy (v->locks[0], big, sizeof (fd_lk_ctx_node_t));
+
+                v->locks[0]->fl_type = small->fl_type;
+                v->locks[0]->user_flock.l_type = small->fl_type;
+        } else if ((small->fl_start > big->fl_start) &&
+                   (small->fl_end < big->fl_end)) {
+                /* small lock is completely inside big lock,
+                   break it down into 3 different locks. */
+                v->locks[0] = fd_lk_ctx_node_new (big->cmd, NULL);
+                if (!v->locks[0])
+                        goto out;
+
+                v->locks[1] = fd_lk_ctx_node_new (small->cmd, NULL);
+                if (!v->locks[1])
+                        goto out;
+
+                v->locks[2] = fd_lk_ctx_node_new (big->cmd, NULL);
+                if (!v->locks[2])
+                        goto out;
+
+                memcpy (v->locks[0], big, sizeof (fd_lk_ctx_node_t));
+                v->locks[0]->fl_end = small->fl_start - 1;
+                v->locks[0]->user_flock.l_len =
+                        _fd_lk_get_lock_len (v->locks[0]->fl_start,
+                                             v->locks[0]->fl_end);
+
+                memcpy (v->locks[1], small, sizeof (fd_lk_ctx_node_t));
+
+                memcpy (v->locks[2], big, sizeof (fd_lk_ctx_node_t));
+                v->locks[2]->fl_start = small->fl_end + 1;
+                /* keep the user-visible flock in step with fl_start,
+                   as the "starts coincide" case below already does */
+                v->locks[2]->user_flock.l_start = small->fl_end + 1;
+                v->locks[2]->user_flock.l_len =
+                        _fd_lk_get_lock_len (v->locks[2]->fl_start,
+                                             v->locks[2]->fl_end);
+        } else if (small->fl_start == big->fl_start) {
+                /* One of the ends co-incide, break the
+                   locks into two seperate parts */
+                v->locks[0] = fd_lk_ctx_node_new (small->cmd, NULL);
+                if (!v->locks[0])
+                        goto out;
+
+                v->locks[1] = fd_lk_ctx_node_new (big->cmd, NULL);
+                if (!v->locks[1])
+                        goto out;
+
+                memcpy (v->locks[0], small, sizeof (fd_lk_ctx_node_t));
+
+                memcpy (v->locks[1], big, sizeof (fd_lk_ctx_node_t));
+                v->locks[1]->fl_start = small->fl_end + 1;
+                v->locks[1]->user_flock.l_start = small->fl_end + 1;
+                /* the range shrank from the front, so the length that
+                   was copied from 'big' is no longer valid */
+                v->locks[1]->user_flock.l_len =
+                        _fd_lk_get_lock_len (v->locks[1]->fl_start,
+                                             v->locks[1]->fl_end);
+        } else if (small->fl_end == big->fl_end) {
+                /* One of the ends co-incide, break the
+                   locks into two seperate parts */
+                v->locks[0] = fd_lk_ctx_node_new (small->cmd, NULL);
+                if (!v->locks[0])
+                        goto out;
+
+                v->locks[1] = fd_lk_ctx_node_new (big->cmd, NULL);
+                if (!v->locks[1])
+                        goto out;
+
+                memcpy (v->locks[0], big, sizeof (fd_lk_ctx_node_t));
+                v->locks[0]->fl_end = small->fl_start - 1;
+                v->locks[0]->user_flock.l_len =
+                        _fd_lk_get_lock_len (v->locks[0]->fl_start,
+                                             v->locks[0]->fl_end);
+
+                memcpy (v->locks[1], small, sizeof (fd_lk_ctx_node_t));
+        } else {
+                /* We should never come to this case */
+                GF_ASSERT (!"Invalid case");
+        }
+        ret = 0;
+out:
+        if (ret) {
+                /* Slots are filled in order starting at 0 and the slot
+                   whose allocation failed is NULL, so this frees exactly
+                   the nodes allocated above. */
+                for (i = 0; i < 3 && v->locks[i]; i++) {
+                        _fd_lk_destroy_lock (v->locks[i]);
+                        v->locks[i] = NULL;
+                }
+        }
+        return ret;
+}
+
+/*
+ * Merge 'lock' into the lock list of 'lk_ctx'.
+ *
+ * This function takes ownership of 'lock': it is either linked into the
+ * list, replaced by merged/split nodes, or freed. Callers must not touch
+ * 'lock' afterwards.
+ *
+ * - An overlapping entry of the same type is replaced by the union of the
+ *   two ranges, which is then merged again recursively.
+ * - An overlapping entry of a different type is split around 'lock' and
+ *   every piece is merged again recursively; leftover F_UNLCK nodes are
+ *   then purged from the list.
+ * - With no overlap, a real lock is appended and an unlock request is
+ *   simply dropped.
+ *
+ * NOTE(review): nodes built by _fd_lk_add_locks() appear to carry only the
+ * range and type (cmd and the rest of user_flock stay zeroed) - confirm
+ * that this is sufficient for the consumers of the list.
+ */
+static void
+_fd_lk_insert_and_merge (fd_lk_ctx_t *lk_ctx,
+                         fd_lk_ctx_node_t *lock)
+{
+        int32_t           ret   = -1;
+        int32_t           i     = 0;
+        fd_lk_ctx_node_t *entry = NULL;
+        fd_lk_ctx_node_t *t     = NULL;
+        fd_lk_ctx_node_t *sum   = NULL;
+        struct _values    v     = {.locks = {0, 0, 0 }};
+
+        list_for_each_entry_safe (entry, t, &lk_ctx->lk_list, next) {
+                if (!fd_lk_overlap (entry, lock))
+                        continue;
+
+                if (entry->fl_type == lock->fl_type) {
+                        sum = _fd_lk_add_locks (entry, lock);
+                        if (!sum) {
+                                /* out of memory: leave the list as it is
+                                   and drop the request */
+                                _fd_lk_destroy_lock (lock);
+                                return;
+                        }
+                        sum->fl_type = entry->fl_type;
+                        sum->user_flock.l_type = entry->fl_type;
+                        _fd_lk_delete_lock (entry);
+                        _fd_lk_destroy_lock (entry);
+                        _fd_lk_destroy_lock (lock);
+                        _fd_lk_insert_and_merge (lk_ctx, sum);
+                        return;
+                } else {
+                        sum = _fd_lk_add_locks (entry, lock);
+                        if (!sum) {
+                                _fd_lk_destroy_lock (lock);
+                                return;
+                        }
+                        sum->fl_type = entry->fl_type;
+                        sum->user_flock.l_type = entry->fl_type;
+                        ret = _fd_lk_sub_locks (&v, sum, lock);
+                        /* the pieces in 'v' are copies, 'sum' itself is
+                           never linked anywhere */
+                        _fd_lk_destroy_lock (sum);
+                        if (ret) {
+                                _fd_lk_destroy_lock (lock);
+                                return;
+                        }
+                        _fd_lk_delete_lock (entry);
+                        _fd_lk_destroy_lock (entry);
+
+                        _fd_lk_delete_lock (lock);
+                        _fd_lk_destroy_lock (lock);
+
+                        for (i = 0; i < 3; i++) {
+                                if (!v.locks[i])
+                                        continue;
+
+                                INIT_LIST_HEAD (&v.locks[i]->next);
+                                _fd_lk_insert_and_merge (lk_ctx, v.locks[i]);
+                        }
+                        _fd_lk_delete_unlck_locks (lk_ctx);
+                        return;
+                }
+        }
+
+        /* no conflicts, so just insert */
+        if (lock->fl_type != F_UNLCK) {
+                _fd_lk_insert_lock (lk_ctx, lock);
+        } else {
+                /* an unlock that matches nothing only needs its own node
+                   freed; the locks already held must stay in the list */
+                _fd_lk_destroy_lock (lock);
+        }
+}
+
+/* Dump every node of the lock list of 'lk_ctx' at DEBUG level. */
+static void
+print_lock_list (fd_lk_ctx_t *lk_ctx)
+{
+        fd_lk_ctx_node_t    *lk     = NULL;
+
+        /* same level as the entries below, so header and body show up
+           together */
+        gf_log ("fd-lk", GF_LOG_DEBUG, "lock list:");
+
+        list_for_each_entry (lk, &lk_ctx->lk_list, next)
+                gf_log ("fd-lk", GF_LOG_DEBUG, "owner = %s, "
+                        "cmd = %s fl_type = %s, fs_start = %"PRId64", "
+                        "fs_end = %"PRId64", user_flock: l_type = %s, "
+                        "l_start = %"PRId64", l_len = %"PRId64", ",
+                        lkowner_utoa (&lk->user_flock.l_owner),
+                        get_lk_cmd (lk->cmd), get_lk_type (lk->fl_type),
+                        lk->fl_start, lk->fl_end,
+                        get_lk_type (lk->user_flock.l_type),
+                        lk->user_flock.l_start,
+                        lk->user_flock.l_len);
+}
+
+/*
+ * Record the lock request (cmd, flock) in the lock context of 'fd',
+ * merging it with the locks already recorded there.
+ *
+ * Returns 0 on success, -1 if 'fd' or 'flock' is NULL, if 'fd' has no lock
+ * context, or if the bookkeeping node cannot be allocated.
+ */
+int
+fd_lk_insert_and_merge (fd_t *fd, int32_t cmd,
+                        struct gf_flock *flock)
+{
+        int32_t              ret    = -1;
+        fd_lk_ctx_t         *lk_ctx = NULL;
+        fd_lk_ctx_node_t    *lk     = NULL;
+
+        GF_VALIDATE_OR_GOTO ("fd-lk", fd, out);
+        GF_VALIDATE_OR_GOTO ("fd-lk", flock, out);
+
+        /* fd_lk_ctx_ref() logs and returns NULL for a NULL context */
+        lk_ctx = fd_lk_ctx_ref (fd->lk_ctx);
+        if (!lk_ctx)
+                goto out;
+
+        lk = fd_lk_ctx_node_new (cmd, flock);
+        if (!lk) {
+                fd_lk_ctx_unref (lk_ctx);
+                goto out;
+        }
+
+        gf_log ("fd-lk", GF_LOG_DEBUG,
+                "new lock requrest: owner = %s, fl_type = %s, "
+                "fs_start = %"PRId64", fs_end = %"PRId64", "
+                "user_flock: l_type = %s, l_start = %"PRId64", "
+                "l_len = %"PRId64, lkowner_utoa (&flock->l_owner),
+                get_lk_type (lk->fl_type), lk->fl_start,
+                lk->fl_end, get_lk_type (lk->user_flock.l_type),
+                lk->user_flock.l_start,
+                lk->user_flock.l_len);
+
+        LOCK (&lk_ctx->lock);
+        {
+                /* ownership of 'lk' passes to the merge routine */
+                _fd_lk_insert_and_merge (lk_ctx, lk);
+                print_lock_list (lk_ctx);
+        }
+        UNLOCK (&lk_ctx->lock);
+
+        fd_lk_ctx_unref (lk_ctx);
+
+        ret = 0;
+out:
+        return ret;
+}
diff --git a/libglusterfs/src/fd-lk.h b/libglusterfs/src/fd-lk.h
new file mode 100644
index 00000000000..3e419e14377
--- /dev/null
+++ b/libglusterfs/src/fd-lk.h
@@ -0,0 +1,72 @@
+/*
+  Copyright (c) 2011-2012 Gluster, Inc. <http://www.gluster.com>
+  This file is part of GlusterFS.
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _FD_LK_H +#define _FD_LK_H + +#include "fd.h" +#include "locking.h" +#include "list.h" +#include "logging.h" +#include "mem-pool.h" +#include "mem-types.h" +#include "glusterfs.h" + +#define get_lk_type(type) \ + type == F_UNLCK ? "F_UNLCK" : (type == F_RDLCK ? "F_RDLCK" : "F_WRLCK") + +#define get_lk_cmd(cmd) \ + cmd == F_SETLKW ? "F_SETLKW" : (cmd == F_SETLK ? 
"F_SETLK" : "F_GETLK") + +struct _fd; + +struct fd_lk_ctx { + struct list_head lk_list; + int ref; + gf_lock_t lock; +}; +typedef struct fd_lk_ctx fd_lk_ctx_t; + +struct fd_lk_ctx_node { + int32_t cmd; + struct gf_flock user_flock; + off_t fl_start; + off_t fl_end; + short fl_type; + struct list_head next; +}; +typedef struct fd_lk_ctx_node fd_lk_ctx_node_t; + +fd_lk_ctx_t * +_fd_lk_ctx_ref (fd_lk_ctx_t *lk_ctx); + +fd_lk_ctx_t * +fd_lk_ctx_ref (fd_lk_ctx_t *lk_ctx); + +fd_lk_ctx_t * +fd_lk_ctx_create (); + +int +fd_lk_insert_and_merge (struct _fd *lk_ctx, int32_t cmd, + struct gf_flock *flock); + +int +fd_lk_ctx_unref (fd_lk_ctx_t *lk_ctx); + +#endif /* _FD_LK_H */ diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c index 62a70c457e7..d4cc3464a93 100644 --- a/libglusterfs/src/fd.c +++ b/libglusterfs/src/fd.c @@ -429,6 +429,7 @@ fd_destroy (fd_t *fd) GF_FREE (fd->_ctx); inode_unref (fd->inode); fd->inode = (inode_t *)0xaaaaaaaa; + fd_lk_ctx_unref (fd->lk_ctx); mem_put (fd); out: return; @@ -505,11 +506,12 @@ __fd_create (inode_t *inode, pid_t pid) fd->_ctx = GF_CALLOC (1, (sizeof (struct _fd_ctx) * fd->xl_count), gf_common_mt_fd_ctx); - if (!fd->_ctx) { - mem_put (fd); - fd = NULL; - goto out; - } + if (!fd->_ctx) + goto free_fd; + + fd->lk_ctx = fd_lk_ctx_create (); + if (!fd->lk_ctx) + goto free_fd_ctx; fd->inode = inode_ref (inode); fd->pid = pid; @@ -518,6 +520,13 @@ __fd_create (inode_t *inode, pid_t pid) LOCK_INIT (&fd->lock); out: return fd; + +free_fd_ctx: + GF_FREE (fd->_ctx); +free_fd: + mem_put (fd); + + return NULL; } diff --git a/libglusterfs/src/fd.h b/libglusterfs/src/fd.h index d4cd9bd0662..6b0ed891ff0 100644 --- a/libglusterfs/src/fd.h +++ b/libglusterfs/src/fd.h @@ -30,9 +30,11 @@ #include <unistd.h> #include "glusterfs.h" #include "locking.h" +#include "fd-lk.h" struct _inode; struct _dict; +struct fd_lk_ctx; struct _fd_ctx { union { @@ -59,6 +61,7 @@ struct _fd { 'struct _fd_ctx' array (_ctx).*/ struct _fd_ctx *_ctx; int xl_count; /* 
Number of xl referred in this fd */ + struct fd_lk_ctx *lk_ctx; }; typedef struct _fd fd_t; diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index b8c61d6897f..1ebf4d36008 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -106,6 +106,8 @@ enum gf_common_mem_types_ { gf_common_mt_trie_end = 81, gf_common_mt_run_argv = 82, gf_common_mt_run_logbuf = 83, - gf_common_mt_end = 84 + gf_common_mt_fd_lk_ctx_t = 84, + gf_common_mt_fd_lk_ctx_node_t = 85, + gf_common_mt_end = 86, }; #endif diff --git a/rpc/rpc-lib/src/protocol-common.h b/rpc/rpc-lib/src/protocol-common.h index 874f46e0b70..827201e2dad 100644 --- a/rpc/rpc-lib/src/protocol-common.h +++ b/rpc/rpc-lib/src/protocol-common.h @@ -73,6 +73,7 @@ enum gf_handshake_procnum { GF_HNDSK_SETVOLUME, GF_HNDSK_GETSPEC, GF_HNDSK_PING, + GF_HNDSK_SET_LK_VER, GF_HNDSK_MAXVALUE, }; diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c index 540c72c82d6..adf6fd87620 100644 --- a/rpc/rpc-lib/src/rpc-clnt.c +++ b/rpc/rpc-lib/src/rpc-clnt.c @@ -838,7 +838,6 @@ out: return; } - int rpc_clnt_notify (rpc_transport_t *trans, void *mydata, rpc_transport_event_t event, void *data, ...) 
diff --git a/rpc/xdr/src/glusterfs3-xdr.c b/rpc/xdr/src/glusterfs3-xdr.c index 8008a747026..47d7328db28 100644 --- a/rpc/xdr/src/glusterfs3-xdr.c +++ b/rpc/xdr/src/glusterfs3-xdr.c @@ -1821,3 +1821,31 @@ xdr_gfs3_readdirp_rsp (XDR *xdrs, gfs3_readdirp_rsp *objp) return FALSE; return TRUE; } + +bool_t +xdr_gf_set_lk_ver_rsp (XDR *xdrs, gf_set_lk_ver_rsp *objp) +{ + register int32_t *buf; + buf = NULL; + + if (!xdr_int (xdrs, &objp->op_ret)) + return FALSE; + if (!xdr_int (xdrs, &objp->op_errno)) + return FALSE; + if (!xdr_int (xdrs, &objp->lk_ver)) + return FALSE; + return TRUE; +} + +bool_t +xdr_gf_set_lk_ver_req (XDR *xdrs, gf_set_lk_ver_req *objp) +{ + register int32_t *buf; + buf = NULL; + + if (!xdr_string (xdrs, &objp->uid, ~0)) + return FALSE; + if (!xdr_int (xdrs, &objp->lk_ver)) + return FALSE; + return TRUE; +} diff --git a/rpc/xdr/src/glusterfs3-xdr.h b/rpc/xdr/src/glusterfs3-xdr.h index 2b8129a3354..49e9d6cc0b4 100644 --- a/rpc/xdr/src/glusterfs3-xdr.h +++ b/rpc/xdr/src/glusterfs3-xdr.h @@ -1088,6 +1088,19 @@ struct gfs3_readdirp_rsp { }; typedef struct gfs3_readdirp_rsp gfs3_readdirp_rsp; +struct gf_set_lk_ver_rsp { + int op_ret; + int op_errno; + int lk_ver; +}; +typedef struct gf_set_lk_ver_rsp gf_set_lk_ver_rsp; + +struct gf_set_lk_ver_req { + char *uid; + int lk_ver; +}; +typedef struct gf_set_lk_ver_req gf_set_lk_ver_req; + /* the xdr functions */ #if defined(__STDC__) || defined(__cplusplus) @@ -1177,6 +1190,8 @@ extern bool_t xdr_gfs3_dirlist (XDR *, gfs3_dirlist*); extern bool_t xdr_gfs3_readdir_rsp (XDR *, gfs3_readdir_rsp*); extern bool_t xdr_gfs3_dirplist (XDR *, gfs3_dirplist*); extern bool_t xdr_gfs3_readdirp_rsp (XDR *, gfs3_readdirp_rsp*); +extern bool_t xdr_gf_set_lk_ver_rsp (XDR *, gf_set_lk_ver_rsp*); +extern bool_t xdr_gf_set_lk_ver_req (XDR *, gf_set_lk_ver_req*); #else /* K&R C */ extern bool_t xdr_gf_statfs (); @@ -1265,6 +1280,8 @@ extern bool_t xdr_gfs3_dirlist (); extern bool_t xdr_gfs3_readdir_rsp (); extern bool_t 
xdr_gfs3_dirplist (); extern bool_t xdr_gfs3_readdirp_rsp (); +extern bool_t xdr_gf_set_lk_ver_rsp (); +extern bool_t xdr_gf_set_lk_ver_req (); #endif /* K&R C */ diff --git a/rpc/xdr/src/glusterfs3-xdr.x b/rpc/xdr/src/glusterfs3-xdr.x index 710a9037807..f35820b57cb 100644 --- a/rpc/xdr/src/glusterfs3-xdr.x +++ b/rpc/xdr/src/glusterfs3-xdr.x @@ -675,3 +675,13 @@ struct gfs3_readdirp_rsp { opaque xdata<>; /* Extra data */ }; +struct gf_set_lk_ver_rsp { + int op_ret; + int op_errno; + int lk_ver; +}; + +struct gf_set_lk_ver_req { + string uid<>; + int lk_ver; +}; diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 923271f0a2e..ac5378b1acb 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -203,6 +203,9 @@ static struct volopt_map_entry glusterd_volopt_map[] = { {VKEY_FEATURES_LIMIT_USAGE, "features/quota", "limit-set", NULL, NO_DOC, 0}, {"features.quota-timeout", "features/quota", "timeout", "0", DOC, 0}, {"server.statedump-path", "protocol/server", "statedump-path", NULL, NO_DOC, 0}, + {"client.lk-heal", "protocol/client", "lk-heal", NULL, DOC, 0}, + {"client.grace-timeout", "protocol/client", "grace-timeout", NULL, DOC, 0}, + {"server.grace-timeout", "protocol/server", "grace-timeout", NULL, DOC, 0}, {NULL, } }; diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c index e644290e42d..e44aca1d0c1 100644 --- a/xlators/mount/fuse/src/fuse-bridge.c +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -3066,13 +3066,19 @@ static int fuse_setlk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct gf_flock *lock) { + uint32_t op = 0; fuse_state_t *state = NULL; state = frame->root->state; + op = state->finh->opcode; if (op_ret == 0) { gf_log ("glusterfs-fuse", GF_LOG_TRACE, "%"PRIu64": ERR => 0", frame->root->unique); + fd_lk_insert_and_merge (state->fd, + (op == FUSE_SETLK) ? 
F_SETLK : F_SETLKW, + &state->lk_lock); + send_fuse_err (this, state->finh, 0); } else { if (op_errno == ENOSYS) { diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c index 91cda6d0c45..1896e6b6391 100644 --- a/xlators/protocol/client/src/client-handshake.c +++ b/xlators/protocol/client/src/client-handshake.c @@ -22,6 +22,7 @@ #include "config.h" #endif +#include "fd-lk.h" #include "client.h" #include "xlator.h" #include "defaults.h" @@ -39,6 +40,18 @@ extern rpc_clnt_prog_t clnt_pmap_prog; int client_ping_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe); +int client_set_lk_version_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe); + +int client_set_lk_version (xlator_t *this); + +typedef struct client_fd_lk_local { + int ref; + gf_boolean_t error; + gf_lock_t lock; + clnt_fd_ctx_t *fdctx; +}clnt_fd_lk_local_t; + /* Handshake */ void @@ -391,6 +404,411 @@ client_notify_parents_child_up (xlator_t *this) } int +client_set_lk_version_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int32_t ret = -1; + call_frame_t *fr = NULL; + gf_set_lk_ver_rsp rsp = {0,}; + + fr = (call_frame_t *) myframe; + GF_VALIDATE_OR_GOTO ("client", fr, out); + + if (req->rpc_status == -1) { + gf_log (fr->this->name, GF_LOG_WARNING, + "received RPC status error"); + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_set_lk_ver_rsp); + if (ret < 0) + gf_log (fr->this->name, GF_LOG_WARNING, + "xdr decoding failed"); + else + gf_log (fr->this->name, GF_LOG_DEBUG, + "Server lk version = %d", rsp.lk_ver); + + ret = 0; +out: + if (fr) + STACK_DESTROY (fr->root); + + return ret; +} + +int +client_set_lk_version (xlator_t *this) +{ + int ret = -1; + clnt_conf_t *conf = NULL; + call_frame_t *frame = NULL; + gf_set_lk_ver_req req = {0, }; + + conf = (clnt_conf_t *) this->private; + + req.lk_ver = client_get_lk_ver (conf); + req.uid = 
this->ctx->process_uuid; + + gf_log (this->name, GF_LOG_DEBUG, "Sending SET_LK_VERSION"); + + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; + + ret = client_submit_request (this, &req, frame, + conf->handshake, + GF_HNDSK_SET_LK_VER, + client_set_lk_version_cbk, + NULL, NULL, 0, NULL, 0, NULL, + (xdrproc_t)xdr_gf_set_lk_ver_req); +out: + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Failed to send SET_LK_VERSION to server"); + + return ret; +} + +int +client_fd_lk_list_empty (fd_lk_ctx_t *lk_ctx) +{ + int ret = 1; + + GF_VALIDATE_OR_GOTO ("client", lk_ctx, out); + + LOCK (&lk_ctx->lock); + { + ret = list_empty (&lk_ctx->lk_list); + } + UNLOCK (&lk_ctx->lock); +out: + return ret; +} + +int +client_fd_lk_count (fd_lk_ctx_t *lk_ctx) +{ + int count = 0; + fd_lk_ctx_node_t *fd_lk = NULL; + + GF_VALIDATE_OR_GOTO ("client", lk_ctx, err); + + LOCK (&lk_ctx->lock); + { + list_for_each_entry (fd_lk, &lk_ctx->lk_list, next) + count++; + } + UNLOCK (&lk_ctx->lock); + + return count; +err: + return -1; +} + +clnt_fd_lk_local_t * +clnt_fd_lk_local_ref (xlator_t *this, clnt_fd_lk_local_t *local) +{ + GF_VALIDATE_OR_GOTO (this->name, local, out); + + LOCK (&local->lock); + { + local->ref++; + } + UNLOCK (&local->lock); +out: + return local; +} + +int +clnt_fd_lk_local_unref (xlator_t *this, clnt_fd_lk_local_t *local) +{ + int ref = -1; + + GF_VALIDATE_OR_GOTO (this->name, local, out); + + LOCK (&local->lock); + { + ref = --local->ref; + } + UNLOCK (&local->lock); + + if (ref == 0) { + LOCK_DESTROY (&local->lock); + GF_FREE (local); + } + ref = 0; +out: + return ref; +} + +clnt_fd_lk_local_t * +clnt_fd_lk_local_create (clnt_fd_ctx_t *fdctx) +{ + clnt_fd_lk_local_t *local = NULL; + + local = GF_CALLOC (1, sizeof (clnt_fd_lk_local_t), + gf_client_mt_clnt_fd_lk_local_t); + if (!local) + goto out; + + local->ref = 1; + local->error = _gf_false; + local->fdctx = fdctx; + + LOCK_INIT (&local->lock); +out: + return local; +} + +void +clnt_mark_fd_bad 
(clnt_conf_t *conf, clnt_fd_ctx_t *fdctx)
+{
+        /* remote_fd is reset to -1 under conf->lock */
+        pthread_mutex_lock (&conf->lock);
+        {
+                fdctx->remote_fd = -1;
+        }
+        pthread_mutex_unlock (&conf->lock);
+}
+
+// call decrement_reopen_fd_count
+/*
+ * Completion callback for the RELEASE sent by clnt_release_reopen_fd().
+ * Whatever the outcome of the RPC, the fd is marked bad and the
+ * reopen-fd count is decremented, then the frame is destroyed.
+ * Always returns 0.
+ */
+int
+clnt_release_reopen_fd_cbk (struct rpc_req *req, struct iovec *iov,
+                            int count, void *myframe)
+{
+        xlator_t       *this  = NULL;
+        call_frame_t   *frame = NULL;
+        clnt_conf_t    *conf  = NULL;
+        clnt_fd_ctx_t  *fdctx = NULL;
+
+        frame = myframe;
+        this  = frame->this;
+        fdctx = (clnt_fd_ctx_t *) frame->local;
+        conf  = (clnt_conf_t *) this->private;
+
+        clnt_mark_fd_bad (conf, fdctx);
+
+        decrement_reopen_fd_count (this, conf);
+
+        /* fdctx is not owned by the frame; detach it before teardown */
+        frame->local = NULL;
+        STACK_DESTROY (frame->root);
+
+        return 0;
+}
+
+/*
+ * Send a RELEASE for the remote fd of 'fdctx'.
+ *
+ * If the request cannot be created or submitted, the work of the callback
+ * is done right here (decrement the reopen-fd count, mark the fd bad) and
+ * the frame, if any, is destroyed. Always returns 0.
+ */
+int
+clnt_release_reopen_fd (xlator_t *this, clnt_fd_ctx_t *fdctx)
+{
+        int               ret     = -1;
+        clnt_conf_t      *conf    = NULL;
+        call_frame_t     *frame   = NULL;
+        gfs3_release_req  req     = {{0,},};
+
+        conf = (clnt_conf_t *) this->private;
+
+        frame = create_frame (THIS, THIS->ctx->pool);
+        if (!frame)
+                goto out;
+
+        frame->local = (void *) fdctx;
+        req.fd       = fdctx->remote_fd;
+
+        /* encode with the XDR routine that matches gfs3_release_req */
+        ret = client_submit_request (this, &req, frame, conf->fops,
+                                     GFS3_OP_RELEASE,
+                                     clnt_release_reopen_fd_cbk, NULL,
+                                     NULL, 0, NULL, 0, NULL,
+                                     (xdrproc_t)xdr_gfs3_release_req);
+out:
+        if (ret) {
+                decrement_reopen_fd_count (this, conf);
+                clnt_mark_fd_bad (conf, fdctx);
+                if (frame) {
+                        frame->local = NULL;
+                        STACK_DESTROY (frame->root);
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * Flag 'local' as failed and, the first time this happens, release the
+ * fd it refers to.
+ *
+ * The previous value of local->error is sampled under local->lock so that
+ * exactly one caller observes the false -> true transition; only that
+ * caller issues the release. Later callers find the flag already set and
+ * do nothing, so the release (and the reopen-fd count decrement it leads
+ * to) is not repeated. Always returns 0.
+ */
+int
+clnt_fd_lk_local_mark_error (xlator_t *this,
+                             clnt_fd_lk_local_t *local)
+{
+        gf_boolean_t error = _gf_false;
+
+        LOCK (&local->lock);
+        {
+                error        = local->error;
+                local->error = _gf_true;
+        }
+        UNLOCK (&local->lock);
+
+        if (!error)
+                clnt_release_reopen_fd (this, local->fdctx);
+
+        return 0;
+}
+
+// Also, I think in reopen_cbk, the fdctx is added to
+// saved_fd list..
avoid that, may cause a problem +// Reason: While the locks on the fd are reacquired, a release +// fop may be received by the client-protocol translator +// which will free the fdctx datastructure. +int +client_reacquire_lock_cbk (struct rpc_req *req, struct iovec *iov, + int count, void *myframe) +{ + int32_t ret = -1; + xlator_t *this = NULL; + gf_common_rsp rsp = {0,}; + call_frame_t *frame = NULL; + clnt_fd_lk_local_t *local = NULL; + + frame = (call_frame_t *) myframe; + this = frame->this; + local = (clnt_fd_lk_local_t *) frame->local; + + if (req->rpc_status == -1) { + gf_log ("client", GF_LOG_WARNING, + "request failed at rpc"); + goto out; + } + + ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_common_rsp); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed"); + goto out; + } + + if (rsp.op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "lock request failed"); + ret = -1; + goto out; + } + + // TODO: Add more info to log. + gf_log (this->name, GF_LOG_DEBUG, "Reacquired lock"); + + ret = 0; +out: + if (ret < 0) + clnt_fd_lk_local_mark_error (this, local); + + (void) clnt_fd_lk_local_unref (this, local); + frame->local = NULL; + STACK_DESTROY (frame->root); + + return ret; +} + +int +_client_reacquire_lock (xlator_t *this, clnt_fd_ctx_t *fdctx) +{ + int32_t ret = -1; + int32_t gf_cmd = 0; + int32_t gf_type = 0; + gfs3_lk_req req = {{0,},}; + struct gf_flock flock = {0,}; + fd_lk_ctx_t *lk_ctx = NULL; + clnt_fd_lk_local_t *local = NULL; + fd_lk_ctx_node_t *fd_lk = NULL; + call_frame_t *frame = NULL; + clnt_conf_t *conf = NULL; + + conf = (clnt_conf_t *) this->private; + lk_ctx = fdctx->lk_ctx; + + local = clnt_fd_lk_local_create (fdctx); + if (!local) { + clnt_release_reopen_fd (this, fdctx); + goto out; + } + + list_for_each_entry (fd_lk, &lk_ctx->lk_list, next) { + memcpy (&flock, &fd_lk->user_flock, + sizeof (struct gf_flock)); + + ret = client_cmd_to_gf_cmd (fd_lk->cmd, &gf_cmd); + if (ret) { + gf_log (this->name, 
GF_LOG_WARNING, + "client_cmd_to_gf_cmd failed, " + "aborting reacquiring of locks"); + break; + } + + gf_type = client_type_to_gf_type (flock.l_type); + req.fd = fdctx->remote_fd; + req.cmd = gf_cmd; + req.type = gf_type; + (void) gf_proto_flock_from_flock (&req.flock, + &flock); + + memcpy (req.gfid, fdctx->inode->gfid, 16); + + frame = create_frame (THIS, THIS->ctx->pool); + if (!frame) { + ret = -1; + break; + } + + frame->local = clnt_fd_lk_local_ref (this, local); + frame->root->lk_owner = fd_lk->user_flock.l_owner; + + ret = client_submit_request (this, &req, frame, + conf->fops, GFS3_OP_LK, + client_reacquire_lock_cbk, + NULL, NULL, 0, NULL, 0, NULL, + (xdrproc_t)xdr_gfs3_lk_req); + if (ret) + break; + + ret = 0; + frame = NULL; + } + + if (ret) { + clnt_fd_lk_local_mark_error (this, local); + + if (frame) { + if (frame->local) { + clnt_fd_lk_local_unref (this, frame->local); + frame->local = NULL; + } + STACK_DESTROY (frame->root); + } + } + if (local) + (void) clnt_fd_lk_local_unref (this, local); +out: + return ret; +} + +int +client_reacquire_lock (xlator_t *this, clnt_fd_ctx_t *fdctx) +{ + int32_t ret = -1; + fd_lk_ctx_t *lk_ctx = NULL; + + if (client_fd_lk_list_empty (fdctx->lk_ctx)) { + gf_log (this->name, GF_LOG_WARNING, + "fd lock list is empty"); + decrement_reopen_fd_count (this, (clnt_conf_t *)this->private); + ret = 0; + goto out; + } + + lk_ctx = fdctx->lk_ctx; + + LOCK (&lk_ctx->lock); + { + ret = _client_reacquire_lock (this, fdctx); + } + UNLOCK (&lk_ctx->lock); +out: + return ret; +} + +int client3_1_reopen_cbk (struct rpc_req *req, struct iovec *iov, int count, void *myframe) { @@ -402,11 +820,13 @@ client3_1_reopen_cbk (struct rpc_req *req, struct iovec *iov, int count, clnt_conf_t *conf = NULL; clnt_fd_ctx_t *fdctx = NULL; call_frame_t *frame = NULL; + xlator_t *this = NULL; frame = myframe; if (!frame || !frame->this) goto out; + this = frame->this; local = frame->local; conf = frame->this->private; @@ -454,7 +874,7 @@ 
client3_1_reopen_cbk (struct rpc_req *req, struct iovec *iov, int count, fdctx->remote_fd = rsp.fd; if (!fdctx->released) { list_add_tail (&fdctx->sfd_pos, &conf->saved_fds); - if (!list_empty (&fdctx->lock_list)) + if (!client_fd_lk_list_empty (fdctx->lk_ctx)) attempt_lock_recovery = _gf_true; fdctx = NULL; } @@ -463,31 +883,27 @@ client3_1_reopen_cbk (struct rpc_req *req, struct iovec *iov, int count, ret = 0; - attempt_lock_recovery = _gf_false; /* temporarily */ - - if (attempt_lock_recovery) { - ret = client_attempt_lock_recovery (frame->this, local->fdctx); - if (ret < 0) { - gf_log (frame->this->name, GF_LOG_DEBUG, - "lock recovery not attempted on fd"); - } else { - gf_log (frame->this->name, GF_LOG_INFO, - "need to attempt lock recovery on %"PRIu64 - " open fds", fd_count); - } + if (conf->lk_heal && attempt_lock_recovery) { + /* Delay decrement the reopen fd count untill all the + locks corresponding to this fd are acquired.*/ + gf_log (frame->this->name, GF_LOG_WARNING, "acquiring locks on " + "%s", local->loc.path); + ret = client_reacquire_lock (frame->this, local->fdctx); } else { fd_count = decrement_reopen_fd_count (frame->this, conf); } out: if (fdctx) - client_fdctx_destroy (frame->this, fdctx); + client_fdctx_destroy (this, fdctx); if ((ret < 0) && frame && frame->this && conf) decrement_reopen_fd_count (frame->this, conf); - frame->local = NULL; - STACK_DESTROY (frame->root); + if (frame) { + frame->local = NULL; + STACK_DESTROY (frame->root); + } client_local_wipe (local); @@ -792,7 +1208,8 @@ client_post_handshake (call_frame_t *frame, xlator_t *this) } } else { gf_log (this->name, GF_LOG_DEBUG, - "no open fds - notifying all parents child up"); + "no fds to open - notifying all parents child up"); + client_set_lk_version (this); client_notify_parents_child_up (this); } out: @@ -814,6 +1231,7 @@ client_setvolume_cbk (struct rpc_req *req, struct iovec *iov, int count, void *m int32_t op_ret = 0; int32_t op_errno = 0; gf_boolean_t auth_fail = 
_gf_false; + uint32_t lk_ver = 0; frame = myframe; this = frame->this; @@ -895,6 +1313,15 @@ client_setvolume_cbk (struct rpc_req *req, struct iovec *iov, int count, void *m goto out; } + ret = dict_get_uint32 (reply, "clnt-lk-version", &lk_ver); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "failed to find key 'clnt-lk-version' in the options"); + goto out; + } + + gf_log (this->name, GF_LOG_INFO, "clnt-lk-version = %d, " + "server-lk-version = %d", client_get_lk_ver (conf), lk_ver); /* TODO: currently setpeer path is broken */ /* if (process_uuid && req->conn && @@ -930,8 +1357,15 @@ client_setvolume_cbk (struct rpc_req *req, struct iovec *iov, int count, void *m conf->need_different_port = 0; - /* TODO: more to test */ - client_post_handshake (frame, frame->this); + if (lk_ver != client_get_lk_ver (conf)) { + client_mark_fd_bad (this); + client_post_handshake (frame, frame->this); + } else { + /*TODO: Traverse the saved fd list, and send + release to the server on fd's that were closed + during grace period */ + ; + } out: if (auth_fail) { @@ -1043,6 +1477,14 @@ client_setvolume (xlator_t *this, struct rpc_clnt *rpc) "failed to set 'volfile-checksum'"); } + ret = dict_set_int16 (options, "clnt-lk-version", + client_get_lk_ver (conf)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set clnt-lk-version(%"PRIu32") in handshake msg", + client_get_lk_ver (conf)); + } + req.dict.dict_len = dict_serialized_length (options); if (req.dict.dict_len < 0) { gf_log (this->name, GF_LOG_ERROR, @@ -1366,6 +1808,7 @@ char *clnt_handshake_procs[GF_HNDSK_MAXVALUE] = { [GF_HNDSK_SETVOLUME] = "SETVOLUME", [GF_HNDSK_GETSPEC] = "GETSPEC", [GF_HNDSK_PING] = "PING", + [GF_HNDSK_SET_LK_VER] = "SET_LK_VER" }; rpc_clnt_prog_t clnt_handshake_prog = { diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c index 842e3ec5b62..e99fe774de6 100644 --- a/xlators/protocol/client/src/client-lk.c +++ 
b/xlators/protocol/client/src/client-lk.c @@ -608,6 +608,7 @@ decrement_reopen_fd_count (xlator_t *this, clnt_conf_t *conf) if (fd_count == 0) { gf_log (this->name, GF_LOG_INFO, "last fd open'd/lock-self-heal'd - notifying CHILD-UP"); + client_set_lk_version (this); client_notify_parents_child_up (this); } diff --git a/xlators/protocol/client/src/client-mem-types.h b/xlators/protocol/client/src/client-mem-types.h index c2aa690b1c5..6bc7daad271 100644 --- a/xlators/protocol/client/src/client-mem-types.h +++ b/xlators/protocol/client/src/client-mem-types.h @@ -29,6 +29,7 @@ enum gf_client_mem_types_ { gf_client_mt_clnt_req_buf_t, gf_client_mt_clnt_fdctx_t, gf_client_mt_clnt_lock_t, + gf_client_mt_clnt_fd_lk_local_t, gf_client_mt_end, }; #endif /* __CLIENT_MEM_TYPES_H__ */ diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index 229e0191725..8955e237dee 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -40,6 +40,81 @@ int client_handshake (xlator_t *this, struct rpc_clnt *rpc); void client_start_ping (void *data); int client_init_rpc (xlator_t *this); int client_destroy_rpc (xlator_t *this); +int client_mark_fd_bad (xlator_t *this); + +int32_t +client_type_to_gf_type (short l_type) +{ + int32_t gf_type; + + switch (l_type) { + case F_RDLCK: + gf_type = GF_LK_F_RDLCK; + break; + case F_WRLCK: + gf_type = GF_LK_F_WRLCK; + break; + case F_UNLCK: + gf_type = GF_LK_F_UNLCK; + break; + } + + return gf_type; +} + +uint32_t +client_get_lk_ver (clnt_conf_t *conf) +{ + uint32_t lk_ver = 0; + + GF_VALIDATE_OR_GOTO ("client", conf, out); + + pthread_mutex_lock (&conf->lock); + { + lk_ver = conf->lk_version; + } + pthread_mutex_unlock (&conf->lock); +out: + return lk_ver; +} + +void +client_grace_timeout (void *data) +{ + int ver = 0; + xlator_t *this = NULL; + struct clnt_conf *conf = NULL; + struct rpc_clnt *rpc = NULL; + + GF_VALIDATE_OR_GOTO ("client", data, out); + + this = THIS; + + rpc = 
(struct rpc_clnt *) data; + + conf = (struct clnt_conf *) this->private; + + pthread_mutex_lock (&conf->lock); + { + ver = ++conf->lk_version; + /* ver == 0 is a special value used by server + to notify client that this is a fresh connect.*/ + if (ver == 0) + ver = ++conf->lk_version; + + gf_timer_call_cancel (this->ctx, conf->grace_timer); + conf->grace_timer = NULL; + } + pthread_mutex_unlock (&conf->lock); + + gf_log (this->name, GF_LOG_WARNING, + "client grace timer expired, updating " + "the lk-version to %d", ver); + + client_mark_fd_bad (this); +out: + return; +} int client_submit_request (xlator_t *this, void *req, call_frame_t *frame, @@ -828,7 +903,6 @@ out: } - int32_t client_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) { @@ -1455,7 +1529,6 @@ out: return 0; } - int32_t client_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *lock) @@ -1841,7 +1914,7 @@ out: } - int +int client_mark_fd_bad (xlator_t *this) { clnt_conf_t *conf = NULL; @@ -1908,11 +1981,42 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, conf->last_sent_event = GF_EVENT_CHILD_UP; } } + + /* Cancel grace timer if set */ + pthread_mutex_lock (&conf->lock); + { + if (conf->grace_timer) { + gf_log (this->name, GF_LOG_WARNING, + "Cancelling the grace timer"); + + gf_timer_call_cancel (this->ctx, + conf->grace_timer); + conf->grace_timer = NULL; + } + } + pthread_mutex_unlock (&conf->lock); + break; } case RPC_CLNT_DISCONNECT: + /* client_mark_fd_bad (this); */ - client_mark_fd_bad (this); + pthread_mutex_lock (&conf->lock); + { + if (conf->grace_timer) { + gf_log (this->name, GF_LOG_DEBUG, + "Client grace timer is already set"); + } else { + gf_log (this->name, GF_LOG_WARNING, + "Registering a grace timer"); + conf->grace_timer = + gf_timer_call_after (this->ctx, + conf->grace_tv, + client_grace_timeout, + conf->rpc); + } + } + pthread_mutex_unlock (&conf->lock); if (!conf->skip_notify) { if (conf->connected) @@ -2107,6 
+2211,40 @@ out: int +client_init_grace_timer (xlator_t *this, dict_t *options, + clnt_conf_t *conf) +{ + char *lk_heal = NULL; + int32_t ret = -1; + int32_t grace_timeout = -1; + + GF_VALIDATE_OR_GOTO ("client", this, out); + GF_VALIDATE_OR_GOTO (this->name, options, out); + GF_VALIDATE_OR_GOTO (this->name, conf, out); + + conf->lk_heal = _gf_true; + + ret = dict_get_str (options, "lk-heal", &lk_heal); + if (!ret) + gf_string2boolean (lk_heal, &conf->lk_heal); + + ret = dict_get_int32 (options, "grace-timeout", &grace_timeout); + if (!ret) + conf->grace_tv.tv_sec = grace_timeout; + else + conf->grace_tv.tv_sec = 10; + + conf->grace_tv.tv_usec = 0; + + gf_log (this->name, GF_LOG_INFO, "lk-heal = %s", + (conf->lk_heal) ? "on" : "off"); + + ret = 0; +out: + return ret; +} + +int reconfigure (xlator_t *this, dict_t *options) { clnt_conf_t *conf = NULL; @@ -2153,6 +2291,10 @@ reconfigure (xlator_t *this, dict_t *options) } } + ret = client_init_grace_timer (this, options, conf); + if (ret) + goto out; + ret = 0; out: return ret; @@ -2186,6 +2328,14 @@ init (xlator_t *this) pthread_mutex_init (&conf->lock, NULL); INIT_LIST_HEAD (&conf->saved_fds); + /* Initialize parameters for lock self healing*/ + conf->lk_version = 1; + conf->grace_timer = NULL; + + ret = client_init_grace_timer (this, this->options, conf); + if (ret) + goto out; + LOCK_INIT (&conf->rec_lock); conf->last_sent_event = -1; /* To start with we don't have any events */ @@ -2207,7 +2357,6 @@ init (xlator_t *this) goto out; } - ret = client_init_rpc (this); out: if (ret) @@ -2409,5 +2558,11 @@ struct volume_options options[] = { { .key = {"client-bind-insecure"}, .type = GF_OPTION_TYPE_BOOL }, + { .key = {"lk-heal"}, + .type = GF_OPTION_TYPE_STR + }, + { .key = {"grace-timeout"}, + .type = GF_OPTION_TYPE_INT + }, { .key = {NULL} }, }; diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h index 2dda451c9cb..00addf34ceb 100644 --- a/xlators/protocol/client/src/client.h +++ 
b/xlators/protocol/client/src/client.h @@ -29,6 +29,7 @@ #include "client-mem-types.h" #include "protocol-common.h" #include "glusterfs3.h" +#include "fd-lk.h" /* FIXME: Needs to be defined in a common file */ #define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect" @@ -91,6 +92,12 @@ typedef struct clnt_conf { char need_different_port; /* flag used to change the portmap path in case of 'tcp,rdma' on server */ + gf_boolean_t lk_heal; + uint16_t lk_version; /* this variable is used to distinguish + client-server transaction while + performing lock healing */ + struct timeval grace_tv; + gf_timer_t *grace_timer; } clnt_conf_t; typedef struct _client_fd_ctx { @@ -105,7 +112,7 @@ typedef struct _client_fd_ctx { char released; int32_t flags; int32_t wbflags; - + fd_lk_ctx_t *lk_ctx; pthread_mutex_t mutex; struct list_head lock_list; /* List of all granted locks on this fd */ } clnt_fd_ctx_t; @@ -211,4 +218,11 @@ int32_t client_dump_locks (char *name, inode_t *inode, dict_t *dict); int client_fdctx_destroy (xlator_t *this, clnt_fd_ctx_t *fdctx); +uint32_t client_get_lk_ver (clnt_conf_t *conf); + +int32_t client_type_to_gf_type (short l_type); + +int client_mark_fd_bad (xlator_t *this); + +int client_set_lk_version (xlator_t *this); #endif /* !_CLIENT_H */ diff --git a/xlators/protocol/client/src/client3_1-fops.c b/xlators/protocol/client/src/client3_1-fops.c index 76d4fb0d691..4d6d57528f3 100644 --- a/xlators/protocol/client/src/client3_1-fops.c +++ b/xlators/protocol/client/src/client3_1-fops.c @@ -351,6 +351,7 @@ client3_1_open_cbk (struct rpc_req *req, struct iovec *iov, int count, fdctx->inode = inode_ref (fd->inode); fdctx->flags = local->flags; fdctx->wbflags = local->wbflags; + fdctx->lk_ctx = fd_lk_ctx_ref (fd->lk_ctx); INIT_LIST_HEAD (&fdctx->sfd_pos); INIT_LIST_HEAD (&fdctx->lock_list); @@ -2279,17 +2280,30 @@ client3_1_releasedir_cbk (struct rpc_req *req, struct iovec *iov, int count, int client_fdctx_destroy (xlator_t *this, clnt_fd_ctx_t *fdctx) { + 
clnt_conf_t *conf = NULL; call_frame_t *fr = NULL; int32_t ret = -1; + fd_lk_ctx_t *lk_ctx = NULL; if (!fdctx) goto out; + conf = (clnt_conf_t *) this->private; + if (fdctx->remote_fd == -1) { gf_log (this->name, GF_LOG_DEBUG, "not a valid fd"); goto out; } + pthread_mutex_lock (&conf->lock); + { + lk_ctx = fdctx->lk_ctx; + fdctx->lk_ctx = NULL; + } + pthread_mutex_unlock (&conf->lock); + + fd_lk_ctx_unref (lk_ctx); + fr = create_frame (this, this->ctx->pool); if (fdctx->is_dir) { @@ -4466,7 +4480,6 @@ unwind: return 0; } - int32_t client3_1_lk (call_frame_t *frame, xlator_t *this, void *data) @@ -4523,6 +4536,7 @@ client3_1_lk (call_frame_t *frame, xlator_t *this, req.cmd = gf_cmd; req.type = gf_type; gf_proto_flock_from_flock (&req.flock, args->flock); + memcpy (req.gfid, args->fd->inode->gfid, 16); ret = client_submit_request (this, &req, frame, conf->fops, GFS3_OP_LK, diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c index 2c8cf059be3..374f5a49ae7 100644 --- a/xlators/protocol/server/src/server-handshake.c +++ b/xlators/protocol/server/src/server-handshake.c @@ -354,6 +354,7 @@ server_setvolume (rpcsvc_request_t *req) int32_t op_errno = EINVAL; int32_t fop_version = 0; int32_t mgmt_version = 0; + uint32_t lk_version = 0; char *buf = NULL; params = dict_new (); @@ -408,8 +409,33 @@ server_setvolume (rpcsvc_request_t *req) goto fail; } + /*lk_verion :: [1..2^31-1]*/ + ret = dict_get_uint32 (params, "clnt-lk-version", &lk_version); + if (ret < 0) { + ret = dict_set_str (reply, "ERROR", + "lock state verison not supplied"); + if (ret < 0) + gf_log (this->name, GF_LOG_DEBUG, + "failed to set error msg"); + + op_ret = -1; + op_errno = EINVAL; + goto fail; + } conn = server_connection_get (this, process_uuid); + if (!conn) { + op_ret = -1; + op_errno = ENOMEM; + goto fail; + } + + server_cancel_conn_timer (this, conn); + if (conn->lk_version != 0 && + conn->lk_version != lk_version) { + (void) 
server_connection_cleanup (this, conn); + } + if (req->trans->xl_private != conn) req->trans->xl_private = conn; @@ -595,6 +621,12 @@ server_setvolume (rpcsvc_request_t *req) gf_log (this->name, GF_LOG_DEBUG, "failed to set 'process-uuid'"); + ret = dict_set_uint32 (reply, "clnt-lk-version", + conn->lk_version); + if (ret) + gf_log (this->name, GF_LOG_WARNING, + "failed to set 'clnt-lk-version'"); + ret = dict_set_uint64 (reply, "transport-ptr", ((uint64_t) (long) req->trans)); if (ret) @@ -663,12 +695,50 @@ server_ping (rpcsvc_request_t *req) return 0; } +int +server_set_lk_version (rpcsvc_request_t *req) +{ + int op_ret = -1; + int op_errno = EINVAL; + gf_set_lk_ver_req args = {0, }; + gf_set_lk_ver_rsp rsp = {0,}; + server_connection_t *conn = NULL; + xlator_t *this = NULL; + + this = req->svc->mydata; + //TODO: Decide on an appropriate errno for the error-path + //below + if (!this) + goto fail; + + if (!xdr_to_generic (req->msg[0], &args, + (xdrproc_t)xdr_gf_set_lk_ver_req)) { + //failed to decode msg; + req->rpc_err = GARBAGE_ARGS; + goto fail; + } + + conn = server_connection_get (this, args.uid); + conn->lk_version = args.lk_ver; + server_connection_put (this, conn); + + rsp.lk_ver = args.lk_ver; + + op_ret = 0; +fail: + rsp.op_ret = op_ret; + rsp.op_errno = op_errno; + server_submit_reply (NULL, req, &rsp, NULL, 0, NULL, + (xdrproc_t)xdr_gf_set_lk_ver_rsp); + return 0; +} rpcsvc_actor_t gluster_handshake_actors[] = { - [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, server_null, NULL, NULL, 0}, - [GF_HNDSK_SETVOLUME] = {"SETVOLUME", GF_HNDSK_SETVOLUME, server_setvolume, NULL, NULL, 0}, - [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, NULL, 0}, - [GF_HNDSK_PING] = {"PING", GF_HNDSK_PING, server_ping, NULL, NULL, 0}, + [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, server_null, NULL, NULL, 0}, + [GF_HNDSK_SETVOLUME] = {"SETVOLUME", GF_HNDSK_SETVOLUME, server_setvolume, NULL, NULL, 0}, + [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, 
server_getspec, NULL, NULL, 0}, + [GF_HNDSK_PING] = {"PING", GF_HNDSK_PING, server_ping, NULL, NULL, 0}, + [GF_HNDSK_SET_LK_VER] = {"SET_LK_VER", GF_HNDSK_SET_LK_VER, server_set_lk_version, NULL, NULL }, }; diff --git a/xlators/protocol/server/src/server-helpers.c b/xlators/protocol/server/src/server-helpers.c index 4980424d350..9de1082dc94 100644 --- a/xlators/protocol/server/src/server-helpers.c +++ b/xlators/protocol/server/src/server-helpers.c @@ -774,6 +774,7 @@ server_connection_t * server_connection_get (xlator_t *this, const char *id) { server_connection_t *conn = NULL; + server_connection_t *trav = NULL; server_conf_t *conf = NULL; GF_VALIDATE_OR_GOTO ("server", this, out); @@ -783,20 +784,29 @@ server_connection_get (xlator_t *this, const char *id) pthread_mutex_lock (&conf->mutex); { + list_for_each_entry (trav, &conf->conns, list) { + if (!strncmp (trav->id, id, strlen (id))) { + conn = trav; + conn->ref++; + goto unlock; + } + } + conn = (void *) GF_CALLOC (1, sizeof (*conn), gf_server_mt_conn_t); if (!conn) goto unlock; conn->id = gf_strdup (id); + /*'0' denotes uninitialised lock state*/ + conn->lk_version = 0; + conn->ref++; conn->fdtable = gf_fd_fdtable_alloc (); conn->ltable = gf_lock_table_new (); conn->this = this; pthread_mutex_init (&conn->lock, NULL); - list_add (&conn->list, &conf->conns); - conn->ref++; } unlock: pthread_mutex_unlock (&conf->mutex); @@ -982,6 +992,17 @@ out: return ret; } +void +put_server_conn_state (xlator_t *this, rpc_transport_t *xprt) +{ + GF_VALIDATE_OR_GOTO ("server", this, out); + GF_VALIDATE_OR_GOTO ("server", xprt, out); + + xprt->xl_private = NULL; +out: + return; +} + server_connection_t * get_server_conn_state (xlator_t *this, rpc_transport_t *xprt) { @@ -1497,3 +1518,26 @@ gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict) return 0; } + +void +server_cancel_conn_timer (xlator_t *this, server_connection_t *conn) +{ + if (!this || !conn) { + gf_log (THIS->name, GF_LOG_ERROR, "Invalid arguments to " 
+ "cancel connection timer"); + return; + } + + pthread_mutex_lock (&conn->lock); + { + if (!conn->timer) + goto unlock; + + gf_timer_call_cancel (this->ctx, conn->timer); + conn->timer = NULL; + } +unlock: + pthread_mutex_unlock (&conn->lock); + + return; +} diff --git a/xlators/protocol/server/src/server-helpers.h b/xlators/protocol/server/src/server-helpers.h index 844c98c27bf..99ba7e546b4 100644 --- a/xlators/protocol/server/src/server-helpers.h +++ b/xlators/protocol/server/src/server-helpers.h @@ -68,6 +68,12 @@ server_print_request (call_frame_t *frame); call_frame_t * get_frame_from_request (rpcsvc_request_t *req); +void +server_cancel_conn_timer (xlator_t *this, server_connection_t *conn); + +void +put_server_conn_state (xlator_t *this, rpc_transport_t *xprt); + server_connection_t * get_server_conn_state (xlator_t *this, rpc_transport_t *xptr); diff --git a/xlators/protocol/server/src/server-mem-types.h b/xlators/protocol/server/src/server-mem-types.h index 88bae8cb45d..5438ed6db1a 100644 --- a/xlators/protocol/server/src/server-mem-types.h +++ b/xlators/protocol/server/src/server-mem-types.h @@ -33,6 +33,7 @@ enum gf_server_mem_types_ { gf_server_mt_dirent_rsp_t, gf_server_mt_rsp_buf_t, gf_server_mt_volfile_ctx_t, + gf_server_mt_timer_data_t, gf_server_mt_end, }; #endif /* __SERVER_MEM_TYPES_H__ */ diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c index b0697bb7b9d..b45b77baae0 100644 --- a/xlators/protocol/server/src/server.c +++ b/xlators/protocol/server/src/server.c @@ -36,6 +36,26 @@ #include "authenticate.h" #include "rpcsvc.h" +void +grace_time_handler (void *data) +{ + server_connection_t *conn = NULL; + xlator_t *this = NULL; + + conn = data; + this = conn->this; + + GF_VALIDATE_OR_GOTO (THIS->name, conn, out); + GF_VALIDATE_OR_GOTO (THIS->name, this, out); + + gf_log (this->name, GF_LOG_INFO, "grace timer expired"); + + server_cancel_conn_timer (this, conn); + server_connection_put (this, conn); +out: + 
return; +} + struct iobuf * gfs_serialize_reply (rpcsvc_request_t *req, void *arg, struct iovec *outmsg, xdrproc_t xdrproc) @@ -554,11 +574,10 @@ int server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, void *data) { - xlator_t *this = NULL; - rpc_transport_t *xprt = NULL; - server_connection_t *conn = NULL; - server_conf_t *conf = NULL; - + xlator_t *this = NULL; + rpc_transport_t *xprt = NULL; + server_connection_t *conn = NULL; + server_conf_t *conf = NULL; if (!xl || !data) { gf_log_callingfn ("server", GF_LOG_WARNING, @@ -589,20 +608,37 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event, } case RPCSVC_EVENT_DISCONNECT: conn = get_server_conn_state (this, xprt); - if (conn) - server_connection_cleanup (this, conn); - - gf_log (this->name, GF_LOG_INFO, - "disconnected connection from %s", - xprt->peerinfo.identifier); + if (!conn) + break; + put_server_conn_state (this, xprt); + gf_log (this->name, GF_LOG_INFO, "disconnecting connection" + "from %s", xprt->peerinfo.identifier); list_del (&xprt->list); + pthread_mutex_lock (&conn->lock); + { + if (conn->timer) + goto unlock; + + gf_log (this->name, GF_LOG_INFO, "starting a grace " + "timer for %s", xprt->name); + + conn->timer = gf_timer_call_after (this->ctx, + conf->grace_tv, + grace_time_handler, + conn); + } + unlock: + pthread_mutex_unlock (&conn->lock); + break; case RPCSVC_EVENT_TRANSPORT_DESTROY: - conn = get_server_conn_state (this, xprt); - if (conn) - server_connection_put (this, conn); + /*- conn obj has been disassociated from xprt on first + * disconnect. + * conn cleanup and destruction is handed over to + * grace_time_handler or the subsequent handler that 'owns' + * the conn. Nothing left to be done here. 
*/ break; default: break; @@ -668,6 +704,30 @@ _copy_auth_opt (dict_t *unused, int +server_init_grace_timer (xlator_t *this, dict_t *options, + server_conf_t *conf) +{ + int32_t ret = -1; + int32_t grace_timeout = -1; + + GF_VALIDATE_OR_GOTO ("server", this, out); + GF_VALIDATE_OR_GOTO (this->name, options, out); + GF_VALIDATE_OR_GOTO (this->name, conf, out); + + ret = dict_get_int32 (options, "grace-timeout", &grace_timeout); + if (!ret) + conf->grace_tv.tv_sec = grace_timeout; + else + conf->grace_tv.tv_sec = 10; + + conf->grace_tv.tv_usec = 0; + + ret = 0; +out: + return ret; +} + +int reconfigure (xlator_t *this, dict_t *options) { @@ -761,6 +821,7 @@ reconfigure (xlator_t *this, dict_t *options) "Reconfigure not found for transport" ); } } + ret = server_init_grace_timer (this, options, conf); out: gf_log ("", GF_LOG_DEBUG, "returning %d", ret); @@ -797,6 +858,10 @@ init (xlator_t *this) INIT_LIST_HEAD (&conf->xprt_list); pthread_mutex_init (&conf->mutex, NULL); + ret = server_init_grace_timer (this, this->options, conf); + if (ret) + goto out; + ret = server_build_config (this, conf); if (ret) goto out; @@ -1032,5 +1097,8 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_PATH, .default_value = "/tmp" }, + {.key = {"grace-timeout"}, + .type = GF_OPTION_TYPE_INT, + }, { .key = {NULL} }, }; diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h index 92785c5a9d6..091a02ccba2 100644 --- a/xlators/protocol/server/src/server.h +++ b/xlators/protocol/server/src/server.h @@ -28,6 +28,7 @@ #include "protocol-common.h" #include "server-mem-types.h" #include "glusterfs3.h" +#include "timer.h" #define DEFAULT_BLOCK_SIZE 4194304 /* 4MB */ #define DEFAULT_VOLUME_FILE_PATH CONFDIR "/glusterfs.vol" @@ -60,8 +61,10 @@ struct _server_connection { pthread_mutex_t lock; fdtable_t *fdtable; struct _lock_table *ltable; + gf_timer_t *timer; xlator_t *bound_xl; xlator_t *this; + uint32_t lk_version; }; typedef struct _server_connection 
server_connection_t; @@ -92,7 +95,7 @@ struct server_conf { gf_boolean_t trace; char *conf_dir; struct _volfile_ctx *volfile; - + struct timeval grace_tv; dict_t *auth_modules; pthread_mutex_t mutex; struct list_head conns; |