diff options
author | Mohammed Junaid <junaid@redhat.com> | 2012-02-08 18:06:39 +0530 |
---|---|---|
committer | Vijay Bellur <vijay@gluster.com> | 2012-02-20 04:45:31 -0800 |
commit | f764516c2e526624ce0088963924ff2d88304553 (patch) | |
tree | 85262797baad440b12853a3a6ad41ab518d9f996 /libglusterfs | |
parent | 4d1b040f00e7ec8de997d151b35fa035bba9cb25 (diff) |
protocol/client,server: fcntl lock self healing.
Currently(with out this patch), on a disconnect the server cleans up
the transport which inturn closes the fd's and releases the locks acquired on
those fd's by that client. On a reconnect, client just reopens the fd's but
doesn't reacquire the locks. The application that had previously acquired
the locks still is under the assumption that it is the owner of those locks
which might have been granted to other clients(if they request) by the server
leading to data corruption.
This patch allows the client to reacquire the fcntl locks (held on the fd's)
during client-server handshake.
* The server identifies the client via process-uuid-xl (which is a combination
of uuid and client-protocol name, it is assumed to be unique) and lk-version
number.
* The client maintains a list of process-uuid-xl, lk-version pair for each
accepted connection. On a connect, the server traverses the list for a
matching pair, if a matching pair is not found the the server returns
lk-version with value 0, else it returns the lk-version it has in store.
* On a disconnect, the server and client enter grace period, and on the
completion of the grace period, the client bumps up its lk-version number
(which means, it will reacquire the locks the next time) and the server will
distroy the connection. If reconnection happens within the grace period, the
server will find the matching (process-uuid-xl, lk-version) pair in its list
which guarantees that the fd's and there corresponding locks are still valid
for this client.
Configurable options:
To set grace-timeout, the following options are
option server.grace-timeout value
option client.grace-timeout value
To enable or disable the lk-heal,
option lk-heal [on|off]
gluster volume set command can be used to configurable options
Change-Id: Id677ef1087b300d649f278b8b2aa0d94eae85ed2
BUG: 795386
Signed-off-by: Mohammed Junaid <junaid@redhat.com>
Reviewed-on: http://review.gluster.com/2766
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vijay@gluster.com>
Diffstat (limited to 'libglusterfs')
-rw-r--r-- | libglusterfs/src/Makefile.am | 4 | ||||
-rw-r--r-- | libglusterfs/src/fd-lk.c | 458 | ||||
-rw-r--r-- | libglusterfs/src/fd-lk.h | 72 | ||||
-rw-r--r-- | libglusterfs/src/fd.c | 19 | ||||
-rw-r--r-- | libglusterfs/src/fd.h | 3 | ||||
-rw-r--r-- | libglusterfs/src/mem-types.h | 4 |
6 files changed, 552 insertions, 8 deletions
diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am index 34543f62215..bbe7a2cd739 100644 --- a/libglusterfs/src/Makefile.am +++ b/libglusterfs/src/Makefile.am @@ -23,7 +23,7 @@ libglusterfs_la_SOURCES = dict.c xlator.c logging.c \ $(CONTRIBDIR)/uuid/parse.c $(CONTRIBDIR)/uuid/unparse.c \ $(CONTRIBDIR)/uuid/uuid_time.c $(CONTRIBDIR)/uuid/compare.c \ $(CONTRIBDIR)/uuid/isnull.c $(CONTRIBDIR)/uuid/unpack.c syncop.c \ - graph-print.c trie.c run.c options.c + graph-print.c trie.c run.c options.c fd-lk.c nodist_libglusterfs_la_SOURCES = y.tab.c graph.lex.c @@ -37,7 +37,7 @@ noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h \ rbthash.h iatt.h latency.h mem-types.h $(CONTRIBDIR)/uuid/uuidd.h \ $(CONTRIBDIR)/uuid/uuid.h $(CONTRIBDIR)/uuid/uuidP.h \ $(CONTRIB_BUILDDIR)/uuid/uuid_types.h syncop.h graph-utils.h trie.h run.h \ - options.h lkowner.h + options.h lkowner.h fd-lk.h EXTRA_DIST = graph.l graph.y diff --git a/libglusterfs/src/fd-lk.c b/libglusterfs/src/fd-lk.c new file mode 100644 index 00000000000..8df43bb602f --- /dev/null +++ b/libglusterfs/src/fd-lk.c @@ -0,0 +1,458 @@ +/* + Copyright (c) 2011-2012 Gluster, Inc. <http://www.gluster.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "fd-lk.h" +#include "common-utils.h" + + +int32_t +_fd_lk_delete_lock (fd_lk_ctx_node_t *lock) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("fd-lk", lock, out); + + list_del_init (&lock->next); + + ret = 0; +out: + return ret; +} + +int32_t +_fd_lk_destroy_lock (fd_lk_ctx_node_t *lock) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("fd-lk", lock, out); + + GF_FREE (lock); + + ret = 0; +out: + return ret; +} + +int +_fd_lk_destroy_lock_list (fd_lk_ctx_t *lk_ctx) +{ + int ret = -1; + fd_lk_ctx_node_t *lk = NULL; + fd_lk_ctx_node_t *tmp = NULL; + + GF_VALIDATE_OR_GOTO ("fd-lk", lk_ctx, out); + + list_for_each_entry_safe (lk, tmp, &lk_ctx->lk_list, next) { + _fd_lk_delete_lock (lk); + _fd_lk_destroy_lock (lk); + } + ret = 0; +out: + return ret; +} + +int +fd_lk_ctx_unref (fd_lk_ctx_t *lk_ctx) +{ + int ref = -1; + + GF_VALIDATE_OR_GOTO ("fd-lk", lk_ctx, err); + + LOCK (&lk_ctx->lock); + { + ref = --lk_ctx->ref; + if (ref < 0) + GF_ASSERT (!ref); + if (ref == 0) + _fd_lk_destroy_lock_list (lk_ctx); + } + UNLOCK (&lk_ctx->lock); + + if (ref == 0) { + LOCK_DESTROY (&lk_ctx->lock); + GF_FREE (lk_ctx); + } + + return 0; +err: + return -1; +} + +fd_lk_ctx_t * +_fd_lk_ctx_ref (fd_lk_ctx_t *lk_ctx) +{ + if (!lk_ctx) { + gf_log_callingfn ("fd", GF_LOG_WARNING, + "invalid argument"); + return NULL; + } + + ++lk_ctx->ref; + + return lk_ctx; +} + +fd_lk_ctx_t * +fd_lk_ctx_ref (fd_lk_ctx_t *lk_ctx) +{ + fd_lk_ctx_t *new_lk_ctx = NULL; + + if (!lk_ctx) { + gf_log_callingfn ("fd", GF_LOG_WARNING, + "invalid argument"); + return NULL; + } + + LOCK (&lk_ctx->lock); + { + new_lk_ctx = _fd_lk_ctx_ref (lk_ctx); + } + UNLOCK (&lk_ctx->lock); + + return new_lk_ctx; +} + +fd_lk_ctx_t * +fd_lk_ctx_create () +{ + fd_lk_ctx_t *fd_lk_ctx = NULL; + + fd_lk_ctx = GF_CALLOC (1, sizeof (fd_lk_ctx_t), + gf_common_mt_fd_lk_ctx_t); + if (!fd_lk_ctx) + goto out; + + INIT_LIST_HEAD (&fd_lk_ctx->lk_list); + + LOCK_INIT (&fd_lk_ctx->lock); + + fd_lk_ctx = fd_lk_ctx_ref (fd_lk_ctx); +out: + return fd_lk_ctx; +} + +int +_fd_lk_insert_lock (fd_lk_ctx_t *lk_ctx, + fd_lk_ctx_node_t *lock) +{ + list_add_tail (&lock->next, &lk_ctx->lk_list); + return 0; +} + +static off_t +_fd_lk_get_lock_len (off_t start, off_t end) +{ + if (end == LLONG_MAX) + return 0; + else + return (end - start + 1); +} + +fd_lk_ctx_node_t * +fd_lk_ctx_node_new (int32_t cmd, struct gf_flock *flock) +{ + fd_lk_ctx_node_t *new_lock = NULL; + + /* TODO: get from mem-pool */ + new_lock = GF_CALLOC (1, sizeof (fd_lk_ctx_node_t), + gf_common_mt_fd_lk_ctx_node_t); + if (!new_lock) + goto out; + + new_lock->cmd = cmd; + + if (flock) { + new_lock->fl_type = flock->l_type; + new_lock->fl_start = flock->l_start; + + if (flock->l_len == 0) + new_lock->fl_end = LLONG_MAX; + else + new_lock->fl_end = flock->l_start + flock->l_len - 1; + + memcpy (&new_lock->user_flock, flock, + sizeof (struct gf_flock)); + } + + INIT_LIST_HEAD (&new_lock->next); +out: + return new_lock; +} + +int32_t +_fd_lk_delete_unlck_locks (fd_lk_ctx_t *lk_ctx) +{ + int32_t ret = -1; + fd_lk_ctx_node_t *tmp = NULL; + fd_lk_ctx_node_t *lk = NULL; + + GF_VALIDATE_OR_GOTO ("fd-lk", lk_ctx, out); + + list_for_each_entry_safe (lk, tmp, &lk_ctx->lk_list, next) { + if (lk->fl_type == F_UNLCK) { + _fd_lk_delete_lock (lk); + _fd_lk_destroy_lock (lk); + } + } +out: + return ret; +} + +int +fd_lk_overlap (fd_lk_ctx_node_t *l1, + fd_lk_ctx_node_t *l2) +{ + if (l1->fl_end >= l2->fl_start && + l2->fl_end >= l1->fl_start) + return 1; + + return 0; +} + +fd_lk_ctx_node_t * +_fd_lk_add_locks (fd_lk_ctx_node_t *l1, + fd_lk_ctx_node_t *l2) +{ + fd_lk_ctx_node_t *sum = NULL; + + sum = fd_lk_ctx_node_new (0, NULL); + if (!sum) + goto out; + + sum->fl_start = min (l1->fl_start, l2->fl_start); + sum->fl_end = max (l1->fl_end, l2->fl_end); + + sum->user_flock.l_start = sum->fl_start; + sum->user_flock.l_len = _fd_lk_get_lock_len (sum->fl_start, + sum->fl_end); +out: + return sum; +} + +/* Subtract two locks */ +struct _values { + fd_lk_ctx_node_t *locks[3]; +}; + +int32_t +_fd_lk_sub_locks (struct _values *v, + fd_lk_ctx_node_t *big, + fd_lk_ctx_node_t *small) +{ + int32_t ret = -1; + + if ((big->fl_start == small->fl_start) && + (big->fl_end == small->fl_end)) { + /* both edges coincide with big */ + v->locks[0] = fd_lk_ctx_node_new (small->cmd, NULL); + if (!v->locks[0]) + goto out; + + memcpy (v->locks[0], big, sizeof (fd_lk_ctx_node_t)); + + v->locks[0]->fl_type = small->fl_type; + v->locks[0]->user_flock.l_type = small->fl_type; + } else if ((small->fl_start > big->fl_start) && + (small->fl_end < big->fl_end)) { + /* small lock is completely inside big lock, + break it down into 3 different locks. */ + v->locks[0] = fd_lk_ctx_node_new (big->cmd, NULL); + if (!v->locks[0]) + goto out; + + v->locks[1] = fd_lk_ctx_node_new (small->cmd, NULL); + if (!v->locks[1]) + goto out; + + v->locks[2] = fd_lk_ctx_node_new (big->cmd, NULL); + if (!v->locks[2]) + goto out; + + memcpy (v->locks[0], big, sizeof (fd_lk_ctx_node_t)); + v->locks[0]->fl_end = small->fl_start - 1; + v->locks[0]->user_flock.l_len = + _fd_lk_get_lock_len (v->locks[0]->fl_start, + v->locks[0]->fl_end); + + memcpy (v->locks[1], small, sizeof (fd_lk_ctx_node_t)); + + memcpy (v->locks[2], big, sizeof (fd_lk_ctx_node_t)); + v->locks[2]->fl_start = small->fl_end + 1; + v->locks[2]->user_flock.l_len = + _fd_lk_get_lock_len (v->locks[2]->fl_start, + v->locks[2]->fl_end); + } else if (small->fl_start == big->fl_start) { + /* One of the ends co-incide, break the + locks into two seperate parts */ + v->locks[0] = fd_lk_ctx_node_new (small->cmd, NULL); + if (!v->locks[0]) + goto out; + + v->locks[1] = fd_lk_ctx_node_new (big->cmd, NULL); + if (!v->locks[1]) + goto out; + + memcpy (v->locks[0], small, sizeof (fd_lk_ctx_node_t)); + + memcpy (v->locks[1], big, sizeof (fd_lk_ctx_node_t)); + v->locks[1]->fl_start = small->fl_end + 1; + v->locks[1]->user_flock.l_start = small->fl_end + 1; + } else if (small->fl_end == big->fl_end) { + /* One of the ends co-incide, break the + locks into two seperate parts */ + v->locks[0] = fd_lk_ctx_node_new (small->cmd, NULL); + if (!v->locks[0]) + goto out; + + v->locks[1] = fd_lk_ctx_node_new (big->cmd, NULL); + if (!v->locks[1]) + goto out; + + memcpy (v->locks[0], big, sizeof (fd_lk_ctx_node_t)); + v->locks[0]->fl_end = small->fl_start - 1; + v->locks[0]->user_flock.l_len = + _fd_lk_get_lock_len (v->locks[0]->fl_start, + v->locks[0]->fl_end); + + memcpy (v->locks[1], small, sizeof (fd_lk_ctx_node_t)); + } else { + /* We should never come to this case */ + GF_ASSERT (!"Invalid case"); + } + ret = 0; +out: + return ret; +} + +static void +_fd_lk_insert_and_merge (fd_lk_ctx_t *lk_ctx, + fd_lk_ctx_node_t *lock) +{ + int32_t ret = -1; + int32_t i = 0; + fd_lk_ctx_node_t *entry = NULL; + fd_lk_ctx_node_t *t = NULL; + fd_lk_ctx_node_t *sum = NULL; + struct _values v = {.locks = {0, 0, 0 }}; + + list_for_each_entry_safe (entry, t, &lk_ctx->lk_list, next) { + if (!fd_lk_overlap (entry, lock)) + continue; + + if (entry->fl_type == lock->fl_type) { + sum = _fd_lk_add_locks (entry, lock); + if (sum) + return; + sum->fl_type = entry->fl_type; + sum->user_flock.l_type = entry->fl_type; + _fd_lk_delete_lock (entry); + _fd_lk_destroy_lock (entry); + _fd_lk_destroy_lock (lock); + _fd_lk_insert_and_merge (lk_ctx, sum); + return; + } else { + sum = _fd_lk_add_locks (entry, lock); + sum->fl_type = entry->fl_type; + sum->user_flock.l_type = entry->fl_type; + ret = _fd_lk_sub_locks (&v, sum, lock); + if (ret) + return; + _fd_lk_delete_lock (entry); + _fd_lk_destroy_lock (entry); + + _fd_lk_delete_lock (lock); + _fd_lk_destroy_lock (lock); + + for (i = 0; i < 3; i++) { + if (!v.locks[i]) + continue; + + INIT_LIST_HEAD (&v.locks[i]->next); + _fd_lk_insert_and_merge (lk_ctx, v.locks[i]); + } + _fd_lk_delete_unlck_locks (lk_ctx); + return; + } + } + + /* no conflicts, so just insert */ + if (lock->fl_type != F_UNLCK) { + _fd_lk_insert_lock (lk_ctx, lock); + } else { + _fd_lk_destroy_lock_list (lk_ctx); + } +} + +static void +print_lock_list (fd_lk_ctx_t *lk_ctx) +{ + fd_lk_ctx_node_t *lk = NULL; + + gf_log ("fd-lk", GF_LOG_WARNING, "lock list:"); + + list_for_each_entry (lk, &lk_ctx->lk_list, next) + gf_log ("fd-lk", GF_LOG_DEBUG, "owner = %s, " + "cmd = %s fl_type = %s, fs_start = %"PRId64", " + "fs_end = %"PRId64", user_flock: l_type = %s, " + "l_start = %"PRId64", l_len = %"PRId64", ", + lkowner_utoa (&lk->user_flock.l_owner), + get_lk_cmd (lk->cmd), get_lk_type (lk->fl_type), + lk->fl_start, lk->fl_end, + get_lk_type (lk->user_flock.l_type), + lk->user_flock.l_start, + lk->user_flock.l_len); +} + +int +fd_lk_insert_and_merge (fd_t *fd, int32_t cmd, + struct gf_flock *flock) +{ + int32_t ret = -1; + fd_lk_ctx_t *lk_ctx = NULL; + fd_lk_ctx_node_t *lk = NULL; + + GF_VALIDATE_OR_GOTO ("fd-lk", fd, out); + GF_VALIDATE_OR_GOTO ("fd-lk", flock, out); + + lk_ctx = fd_lk_ctx_ref (fd->lk_ctx); + lk = fd_lk_ctx_node_new (cmd, flock); + + gf_log ("fd-lk", GF_LOG_DEBUG, + "new lock requrest: owner = %s, fl_type = %s, " + "fs_start = %"PRId64", fs_end = %"PRId64", " + "user_flock: l_type = %s, l_start = %"PRId64", " + "l_len = %"PRId64, lkowner_utoa (&flock->l_owner), + get_lk_type (lk->fl_type), lk->fl_start, + lk->fl_end, get_lk_type (lk->user_flock.l_type), + lk->user_flock.l_start, + lk->user_flock.l_len); + + LOCK (&lk_ctx->lock); + { + _fd_lk_insert_and_merge (lk_ctx, lk); + print_lock_list (lk_ctx); + } + UNLOCK (&lk_ctx->lock); + + fd_lk_ctx_unref (lk_ctx); + + ret = 0; +out: + return ret; +} diff --git a/libglusterfs/src/fd-lk.h b/libglusterfs/src/fd-lk.h new file mode 100644 index 00000000000..3e419e14377 --- /dev/null +++ b/libglusterfs/src/fd-lk.h @@ -0,0 +1,72 @@ +/* + Copyright (c) 2011-2012 Gluster, Inc. <http://www.gluster.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _FD_LK_H +#define _FD_LK_H + +#include "fd.h" +#include "locking.h" +#include "list.h" +#include "logging.h" +#include "mem-pool.h" +#include "mem-types.h" +#include "glusterfs.h" + +#define get_lk_type(type) \ + type == F_UNLCK ? "F_UNLCK" : (type == F_RDLCK ? "F_RDLCK" : "F_WRLCK") + +#define get_lk_cmd(cmd) \ + cmd == F_SETLKW ? "F_SETLKW" : (cmd == F_SETLK ? "F_SETLK" : "F_GETLK") + +struct _fd; + +struct fd_lk_ctx { + struct list_head lk_list; + int ref; + gf_lock_t lock; +}; +typedef struct fd_lk_ctx fd_lk_ctx_t; + +struct fd_lk_ctx_node { + int32_t cmd; + struct gf_flock user_flock; + off_t fl_start; + off_t fl_end; + short fl_type; + struct list_head next; +}; +typedef struct fd_lk_ctx_node fd_lk_ctx_node_t; + +fd_lk_ctx_t * +_fd_lk_ctx_ref (fd_lk_ctx_t *lk_ctx); + +fd_lk_ctx_t * +fd_lk_ctx_ref (fd_lk_ctx_t *lk_ctx); + +fd_lk_ctx_t * +fd_lk_ctx_create (); + +int +fd_lk_insert_and_merge (struct _fd *lk_ctx, int32_t cmd, + struct gf_flock *flock); + +int +fd_lk_ctx_unref (fd_lk_ctx_t *lk_ctx); + +#endif /* _FD_LK_H */ diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c index 62a70c457e7..d4cc3464a93 100644 --- a/libglusterfs/src/fd.c +++ b/libglusterfs/src/fd.c @@ -429,6 +429,7 @@ fd_destroy (fd_t *fd) GF_FREE (fd->_ctx); inode_unref (fd->inode); fd->inode = (inode_t *)0xaaaaaaaa; + fd_lk_ctx_unref (fd->lk_ctx); mem_put (fd); out: return; @@ -505,11 +506,12 @@ __fd_create (inode_t *inode, pid_t pid) fd->_ctx = GF_CALLOC (1, (sizeof (struct _fd_ctx) * fd->xl_count), gf_common_mt_fd_ctx); - if (!fd->_ctx) { - mem_put (fd); - fd = NULL; - goto out; - } + if (!fd->_ctx) + goto free_fd; + + fd->lk_ctx = fd_lk_ctx_create (); + if (!fd->lk_ctx) + goto free_fd_ctx; fd->inode = inode_ref (inode); fd->pid = pid; @@ -518,6 +520,13 @@ __fd_create (inode_t *inode, pid_t pid) LOCK_INIT (&fd->lock); out: return fd; + +free_fd_ctx: + GF_FREE (fd->_ctx); +free_fd: + mem_put (fd); + + return NULL; } diff --git a/libglusterfs/src/fd.h b/libglusterfs/src/fd.h index d4cd9bd0662..6b0ed891ff0 100644 --- a/libglusterfs/src/fd.h +++ b/libglusterfs/src/fd.h @@ -30,9 +30,11 @@ #include <unistd.h> #include "glusterfs.h" #include "locking.h" +#include "fd-lk.h" struct _inode; struct _dict; +struct fd_lk_ctx; struct _fd_ctx { union { @@ -59,6 +61,7 @@ struct _fd { 'struct _fd_ctx' array (_ctx).*/ struct _fd_ctx *_ctx; int xl_count; /* Number of xl referred in this fd */ + struct fd_lk_ctx *lk_ctx; }; typedef struct _fd fd_t; diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index b8c61d6897f..1ebf4d36008 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -106,6 +106,8 @@ enum gf_common_mem_types_ { gf_common_mt_trie_end = 81, gf_common_mt_run_argv = 82, gf_common_mt_run_logbuf = 83, - gf_common_mt_end = 84 + gf_common_mt_fd_lk_ctx_t = 84, + gf_common_mt_fd_lk_ctx_node_t = 85, + gf_common_mt_end = 86, }; #endif |