From d6c1468b2779b6247e44b75276436021a3469a59 Mon Sep 17 00:00:00 2001 From: Krishnan Parthasarathi Date: Tue, 21 Jan 2014 23:41:07 +0530 Subject: rpc: transport may be destroyed while rpc isn't rpc_clnt object is destroyed after the corresponding transport object is destroyed. But rpc_clnt_reconnect, a timer driven function, refers to the transport object beyond its 'life'. Instead, using the embedded connection object prevents use after free problem wrt transport object. Also, access transport object under conn->lock. Change-Id: Iae28e8a657d02689963c510114ad7cb7e6764e62 BUG: 962619 Signed-off-by: Krishnan Parthasarathi Reviewed-on: http://review.gluster.org/6751 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- xlators/mgmt/glusterd/src/glusterd-rebalance.c | 4 ++-- xlators/protocol/client/src/client-handshake.c | 2 +- xlators/protocol/client/src/client.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'xlators') diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c index b274e3367..bdedf4c04 100644 --- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c +++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c @@ -126,7 +126,7 @@ __glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata, UNLOCK (&defrag->lock); gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_CONNECT", - rpc->conn.trans->name); + rpc->conn.name); break; } @@ -161,7 +161,7 @@ __glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata, GF_FREE (defrag); gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_DISCONNECT", - rpc->conn.trans->name); + rpc->conn.name); break; } case RPC_CLNT_DESTROY: diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c index 7c8be42ed..85b0f757b 100644 --- a/xlators/protocol/client/src/client-handshake.c +++ b/xlators/protocol/client/src/client-handshake.c @@ -1458,7 +1458,7 @@ client_setvolume_cbk (struct rpc_req *req, struct iovec *iov, int count, 
void *m gf_log (this->name, GF_LOG_INFO, "Connected to %s, attached to remote volume '%s'.", - conf->rpc->conn.trans->peerinfo.identifier, + conf->rpc->conn.name, remote_subvol); rpc_clnt_set_connected (&conf->rpc->conn); diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c index b0a71d3f9..306e555ef 100644 --- a/xlators/protocol/client/src/client.c +++ b/xlators/protocol/client/src/client.c @@ -2205,7 +2205,7 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, "will keep trying to connect to " "glusterd until brick's port is " "available", - conf->rpc->conn.trans->peerinfo.identifier); + conf->rpc->conn.name); if (conf->portmap_err_logged) conf->disconnect_err_logged = 1; -- cgit From c817c214033481fe59f9f44c325a9092dc337d07 Mon Sep 17 00:00:00 2001 From: Luis Pabon Date: Thu, 20 Feb 2014 13:50:19 -0500 Subject: build: GlusterFS Unit Test Framework This patch will allow for developers to create unit tests for their code. Documentation has been added to the patch and is available here: doc/hacker-guide/en-US/markdown/unittest.md Also, unit tests are run when RPM is created. 
BUG: 1067059 Change-Id: I95cf8bb0354d4ca4ed4476a0f2385436a17d2369 Signed-off-by: Vijay Bellur Signed-off-by: Luis Pabon Reviewed-on: http://review.gluster.org/7145 Tested-by: Gluster Build System Reviewed-by: Rajesh Joseph Reviewed-by: Justin Clift Tested-by: Justin Clift --- xlators/cluster/dht/src/Makefile.am | 15 +++ xlators/cluster/dht/src/dht-layout.c | 20 ++++ xlators/cluster/dht/src/unittest/dht_layout_mock.c | 63 +++++++++++ .../cluster/dht/src/unittest/dht_layout_unittest.c | 124 +++++++++++++++++++++ xlators/storage/posix/src/posix-helpers.c | 10 ++ 5 files changed, 232 insertions(+) create mode 100644 xlators/cluster/dht/src/unittest/dht_layout_mock.c create mode 100644 xlators/cluster/dht/src/unittest/dht_layout_unittest.c (limited to 'xlators') diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index 174bea841..3032705b5 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -36,3 +36,18 @@ uninstall-local: install-data-hook: ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so + +#### UNIT TESTS ##### +CLEANFILES += *.gcda *.gcno *_xunit.xml +noinst_PROGRAMS = +TESTS = + +dht_layout_unittest_CPPFLAGS = $(UNITTEST_CPPFLAGS) $(AM_CPPFLAGS) +dht_layout_unittest_SOURCES = unittest/dht_layout_unittest.c \ + unittest/dht_layout_mock.c \ + dht-layout.c +dht_layout_unittest_CFLAGS = $(UNITTEST_CFLAGS) +dht_layout_unittest_LDADD = $(UNITTEST_LDADD) +dht_layout_unittest_LDFLAGS = $(UNITTEST_LDFLAGS) +noinst_PROGRAMS += dht_layout_unittest +TESTS += dht_layout_unittest diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 31d85a506..deaa493f9 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -25,6 +25,19 @@ #define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size)) +#include +#include + +// Change GF_CALLOC and GF_FREE to use +// cmockery2 memory allocation versions +#ifdef UNIT_TESTING 
+#undef GF_CALLOC +#define GF_CALLOC(n, s, t) test_calloc(n, s) +#undef GF_FREE +#define GF_FREE test_free +#endif + + dht_layout_t * dht_layout_new (xlator_t *this, int cnt) @@ -32,6 +45,8 @@ dht_layout_new (xlator_t *this, int cnt) dht_layout_t *layout = NULL; dht_conf_t *conf = NULL; + REQUIRE(NULL != this); + REQUIRE(cnt >= 0); conf = this->private; @@ -50,6 +65,11 @@ dht_layout_new (xlator_t *this, int cnt) } layout->ref = 1; + + ENSURE(NULL != layout); + ENSURE(layout->type == DHT_HASH_TYPE_DM); + ENSURE(layout->cnt == cnt); + ENSURE(layout->ref == 1); out: return layout; } diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c new file mode 100644 index 000000000..aa19ddc57 --- /dev/null +++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c @@ -0,0 +1,63 @@ +/* + Copyright (c) 2014 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "byte-order.h" + +int +dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) +{ + return 0; +} + +int +dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout) +{ + return 0; +} + +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + return 0; +} + +int +dict_get_ptr (dict_t *this, char *key, void **ptr) +{ + return 0; +} + +int +dict_get_ptr_and_len (dict_t *this, char *key, void **ptr, int *len) +{ + return 0; +} + +int _gf_log (const char *domain, const char *file, + const char *function, int32_t line, gf_loglevel_t level, + const char *fmt, ...) 
+{ + return 0; +} + +int _gf_log_callingfn (const char *domain, const char *file, + const char *function, int32_t line, gf_loglevel_t level, + const char *fmt, ...) +{ + return 0; +} diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c new file mode 100644 index 000000000..b5233d235 --- /dev/null +++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c @@ -0,0 +1,124 @@ +/* + Copyright (c) 2008-2014 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "dht-common.h" +#include "logging.h" +#include "xlator.h" + +#include +#include +#include +#include +#include +#include + +/* + * Helper functions + */ + +static xlator_t * +helper_xlator_init(uint32_t num_types) +{ + xlator_t *xl; + int i, ret; + + REQUIRE(num_types > 0); + + xl = test_calloc(1, sizeof(xlator_t)); + assert_non_null(xl); + xl->mem_acct.num_types = num_types; + xl->mem_acct.rec = test_calloc(num_types, sizeof(struct mem_acct_rec)); + assert_non_null(xl->mem_acct.rec); + + xl->ctx = test_calloc(1, sizeof(glusterfs_ctx_t)); + assert_non_null(xl->ctx); + + for (i = 0; i < num_types; i++) { + ret = LOCK_INIT(&(xl->mem_acct.rec[i].lock)); + assert_false(ret); + } + + ENSURE(num_types == xl->mem_acct.num_types); + ENSURE(NULL != xl); + + return xl; +} + +static int +helper_xlator_destroy(xlator_t *xl) +{ + int i, ret; + + for (i = 0; i < xl->mem_acct.num_types; i++) { + ret = LOCK_DESTROY(&(xl->mem_acct.rec[i].lock)); + assert_int_equal(ret, 0); + } + + free(xl->mem_acct.rec); + free(xl->ctx); + free(xl); + return 0; +} + +/* + * Unit tests + */ +static void +test_dht_layout_new(void **state) +{ + xlator_t *xl; + dht_layout_t *layout; + dht_conf_t *conf; + int cnt; 
+ + expect_assert_failure(dht_layout_new(NULL, 0)); + expect_assert_failure(dht_layout_new((xlator_t *)0x12345, -1)); + xl = helper_xlator_init(10); + + // xl->private is NULL + assert_null(xl->private); + cnt = 100; + layout = dht_layout_new(xl, cnt); + assert_non_null(layout); + assert_int_equal(layout->type, DHT_HASH_TYPE_DM); + assert_int_equal(layout->cnt, cnt); + assert_int_equal(layout->ref, 1); + assert_int_equal(layout->gen, 0); + assert_int_equal(layout->spread_cnt, 0); + free(layout); + + // xl->private is not NULL + cnt = 110; + conf = (dht_conf_t *)test_calloc(1, sizeof(dht_conf_t)); + assert_non_null(conf); + conf->dir_spread_cnt = 12345; + conf->gen = -123; + xl->private = conf; + + layout = dht_layout_new(xl, cnt); + assert_non_null(layout); + assert_int_equal(layout->type, DHT_HASH_TYPE_DM); + assert_int_equal(layout->cnt, cnt); + assert_int_equal(layout->ref, 1); + assert_int_equal(layout->gen, conf->gen); + assert_int_equal(layout->spread_cnt, conf->dir_spread_cnt); + free(layout); + + free(conf); + helper_xlator_destroy(xl); +} + +int main(void) { + const UnitTest tests[] = { + unit_test(test_dht_layout_new), + }; + + return run_tests(tests, "xlator_dht_layout"); +} diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c index 2cf46669e..5725cad7d 100644 --- a/xlators/storage/posix/src/posix-helpers.c +++ b/xlators/storage/posix/src/posix-helpers.c @@ -266,6 +266,16 @@ _posix_xattr_get_set (dict_t *xattr_req, goto err; } + /* + * There could be a situation where the ia_size is + * zero. GF_CALLOC will return a pointer to the + * memory initialized by gf_mem_set_acct_info. + * This function adds a header and a footer to + * the allocated memory. The returned pointer + * points to the memory just after the header, but + * when size is zero, there is no space for user + * data. The memory can be freed by calling GF_FREE. 
+ */ databuf = GF_CALLOC (1, filler->stbuf->ia_size, gf_posix_mt_char); if (!databuf) { -- cgit From f1c4c9e6d47b637939b62b473178e1c3095651fc Mon Sep 17 00:00:00 2001 From: Satheesaran Date: Thu, 6 Mar 2014 15:40:31 +0530 Subject: glusterd: Fixed typo in console message during volume create While creating a volume, if the brick is created on the root partition, an error message is shown. This error message contained the word "is" twice. Removed one of the two. Change-Id: I0d83f0feccda34989f7e2b97041d1f15ec9e2f00 BUG: 1065551 Signed-off-by: Satheesaran Reviewed-on: http://review.gluster.org/7198 Tested-by: Gluster Build System Reviewed-by: Krishnan Parthasarathi Reviewed-by: Vijay Bellur --- xlators/mgmt/glusterd/src/glusterd-utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'xlators') diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index ea26c4e47..3eefe36ed 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -956,7 +956,7 @@ glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo, goto out; } else if (parent_st.st_dev == root_st.st_dev) { - snprintf (msg, sizeof (msg), "The brick %s:%s is " + snprintf (msg, sizeof (msg), "The brick %s:%s " "is being created in the root partition. It " "is recommended that you don't use the " "system's root partition for storage backend." -- cgit From dec7950d4b0944697e4bb8788cc02de2ac4d8708 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Wed, 5 Mar 2014 04:46:50 +0000 Subject: glusterd: send/receive volinfo->caps during peer probe. Problem: volinfo->caps was not sent over to newly probed peers, resulting in a 'Peer Rejected' state due to volinfo checksum mismatch. Fix: send/receive volinfo capability when peer probing. 
Change-Id: I2508d3fc7a6e4aeac9c22dd7fb2d3b362f4c21ff BUG: 1072720 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/7186 Tested-by: Gluster Build System Reviewed-by: Kaushal M Reviewed-by: Vijay Bellur --- xlators/mgmt/glusterd/src/glusterd-utils.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'xlators') diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 3eefe36ed..6393c554e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -2145,6 +2145,13 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "volume%d.client-op-version", count); ret = dict_set_int32 (dict, key, volinfo->client_op_version); + if (ret) + goto out; + + /*Add volume Capability (BD Xlator) to dict*/ + memset (key, 0 ,sizeof (key)); + snprintf (key, sizeof (key), "volume%d.caps", count); + ret = dict_set_int32 (dict, key, volinfo->caps); out: GF_FREE (volume_id_str); @@ -3303,6 +3310,11 @@ glusterd_import_volinfo (dict_t *vols, int count, new_volinfo->client_op_version = 1; } + memset (key, 0 ,sizeof (key)); + snprintf (key, sizeof (key), "volume%d.caps", count); + /*This is not present in older glusterfs versions, so ignore ret value*/ + ret = dict_get_int32 (vols, key, &new_volinfo->caps); + ret = glusterd_import_bricks (vols, count, new_volinfo); if (ret) goto out; -- cgit From 61f071e15572b12d12cf9764cac6456fc2df5ff3 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Sat, 8 Mar 2014 12:50:47 -0800 Subject: locks: fix unconditional op_ret success of entrylk Bug introduced in recent refactoring. op_ret of entrylk() was always getting set to 0 even though second locker wouldn't have gotten a lock. This was resulting in multiple contenders to get locks granted at the same time. 
Change-Id: I99c187a9285fb80cc500b38f468f2ebda7048cab Signed-off-by: Anand Avati BUG: 849630 Reviewed-on: http://review.gluster.org/7224 Reviewed-by: Pranith Kumar Karampuri Tested-by: Gluster Build System Reviewed-by: Kaleb KEITHLEY --- xlators/features/locks/src/entrylk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'xlators') diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c index c176306fe..ea6995627 100644 --- a/xlators/features/locks/src/entrylk.c +++ b/xlators/features/locks/src/entrylk.c @@ -607,6 +607,9 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, if (unlocked) { list_del_init (&unlocked->client_list); __pl_entrylk_unref (unlocked); + op_ret = 0; + } else { + op_errno = EINVAL; } __pl_entrylk_unref (reqlock); } @@ -624,8 +627,6 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this, "a bug report at http://bugs.gluster.com", cmd); goto out; } - - op_ret = 0; out: pl_update_refkeeper (this, inode); -- cgit From 0ddd69a60e49f1335ed29a8225e31d24c836083a Mon Sep 17 00:00:00 2001 From: Varun Shastry Date: Wed, 12 Mar 2014 15:22:14 +0530 Subject: features/quota: fix the dict leak when quota is off Change-Id: Iafe0c5104e38a1e34de1f2c2a19682178eb60e11 BUG: 1075506 Signed-off-by: Varun Shastry Reviewed-on: http://review.gluster.org/7227 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/features/quota/src/quota.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'xlators') diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c index bb5dc7aba..4beaae341 100644 --- a/xlators/features/quota/src/quota.c +++ b/xlators/features/quota/src/quota.c @@ -1075,12 +1075,12 @@ quota_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, priv = this->private; + WIND_IF_QUOTAOFF (priv->is_quota_on, off); + xattr_req = xattr_req ? 
dict_ref(xattr_req) : dict_new(); if (!xattr_req) goto err; - WIND_IF_QUOTAOFF (priv->is_quota_on, off); - local = quota_local_new (); if (local == NULL) { goto err; -- cgit From 40b0bf5fc01a17e9a1628cd9ff537b7e15353958 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Wed, 12 Mar 2014 19:56:08 +0530 Subject: storage/bd: Fix allocations/deallocations Change-Id: I39c9eb083fc1c144fe6f011dd983b877fbbff0f7 BUG: 1075717 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.org/7230 Reviewed-by: Anand Avati Tested-by: Anand Avati --- xlators/storage/bd/src/Makefile.am | 2 +- xlators/storage/bd/src/bd-aio.c | 5 +++-- xlators/storage/bd/src/bd-helper.c | 9 +++++---- xlators/storage/bd/src/bd-mem-types.h | 27 +++++++++++++++++++++++++++ xlators/storage/bd/src/bd.c | 18 ++++++++++-------- xlators/storage/bd/src/bd.h | 7 ------- 6 files changed, 46 insertions(+), 22 deletions(-) create mode 100644 xlators/storage/bd/src/bd-mem-types.h (limited to 'xlators') diff --git a/xlators/storage/bd/src/Makefile.am b/xlators/storage/bd/src/Makefile.am index 3d93f7442..60ceff31b 100644 --- a/xlators/storage/bd/src/Makefile.am +++ b/xlators/storage/bd/src/Makefile.am @@ -7,7 +7,7 @@ LIBBD = -llvm2app -lrt bd_la_SOURCES = bd.c bd-helper.c bd-aio.c bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) $(LIBAIO) -noinst_HEADERS = bd.h bd-aio.h +noinst_HEADERS = bd.h bd-aio.h bd-mem-types.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/rpc/xdr/src \ diff --git a/xlators/storage/bd/src/bd-aio.c b/xlators/storage/bd/src/bd-aio.c index 62d4590f7..9dc13b3ec 100644 --- a/xlators/storage/bd/src/bd-aio.c +++ b/xlators/storage/bd/src/bd-aio.c @@ -29,6 +29,7 @@ #ifdef HAVE_LIBAIO #include +#include "bd-mem-types.h" struct bd_aio_cb { struct iocb iocb; @@ -187,7 +188,7 @@ bd_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, goto err; } - paiocb = CALLOC (1, sizeof (*paiocb)); + paiocb = GF_CALLOC (1, sizeof (*paiocb), 
gf_bd_aio_cb); if (!paiocb) { op_errno = ENOMEM; goto err; @@ -314,7 +315,7 @@ bd_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, _fd = bd_fd->fd; - paiocb = CALLOC (1, sizeof (*paiocb)); + paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_bd_aio_cb); if (!paiocb) { op_errno = ENOMEM; goto err; diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c index 63e26d8a3..4bd1d6111 100644 --- a/xlators/storage/bd/src/bd-helper.c +++ b/xlators/storage/bd/src/bd-helper.c @@ -9,6 +9,7 @@ #include #include #include "bd.h" +#include "bd-mem-types.h" #include "run.h" int @@ -242,7 +243,7 @@ __bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p) } uuid_utoa_r (fd->inode->gfid, gfid); - asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + gf_asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); if (!devpath) goto out; @@ -268,7 +269,7 @@ __bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p) ret = 0; out: - FREE (devpath); + GF_FREE (devpath); if (ret) { close (_fd); GF_FREE (bdfd); @@ -701,8 +702,8 @@ out: if (fd2 != -1) close (fd2); - FREE (spath); - FREE (dpath); + GF_FREE (spath); + GF_FREE (dpath); return ret; } diff --git a/xlators/storage/bd/src/bd-mem-types.h b/xlators/storage/bd/src/bd-mem-types.h new file mode 100644 index 000000000..58b448342 --- /dev/null +++ b/xlators/storage/bd/src/bd-mem-types.h @@ -0,0 +1,27 @@ +/* + Copyright (c) 2008-2014 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + + +#ifndef __BD_MEM_TYPES_H__ +#define __BD_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_bd_mem_types_ { + gf_bd_private = gf_common_mt_end + 1, + gf_bd_attr, + gf_bd_fd, + gf_bd_loc_t, + gf_bd_int32_t, + gf_bd_aio_cb, + gf_bd_mt_end +}; + +#endif diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c index 4c3a7e14e..0954b553d 100644 --- a/xlators/storage/bd/src/bd.c +++ b/xlators/storage/bd/src/bd.c @@ -32,6 +32,7 @@ #include "bd.h" #include "bd-aio.h" +#include "bd-mem-types.h" #include "defaults.h" #include "glusterfs3-xdr.h" #include "run.h" @@ -213,7 +214,7 @@ bd_forget (xlator_t *this, inode_t *inode) ret = bd_inode_ctx_get (inode, this, &bdatt); if (!ret) { inode_ctx_del (inode, this, &ctx); - FREE (bdatt); + GF_FREE (bdatt); } return 0; } @@ -236,7 +237,7 @@ bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, entry->d_stat.ia_gfid, &type, &size)) { entry->d_stat.ia_size = size; entry->d_stat.ia_blocks = size / 512; - FREE (type); + GF_FREE (type); } } @@ -653,7 +654,7 @@ bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, goto posix; uuid_utoa_r (fd->inode->gfid, gfid); - asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + gf_asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); BD_VALIDATE_MEM_ALLOC (devpath, ret, out); _fd = open (devpath, flags | O_LARGEFILE, 0); @@ -688,7 +689,7 @@ posix: out: BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL); - FREE (devpath); + GF_FREE (devpath); if (ret) { close (_fd); GF_FREE (bd_fd); @@ -1213,7 +1214,7 @@ bd_offload_dest_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - local->bdatt = CALLOC (1, sizeof (bd_attr_t)); + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); STACK_WIND (frame, bd_offload_getx_cbk, FIRST_CHILD(this), @@ -1303,7 +1304,7 @@ bd_offload (call_frame_t *frame, xlator_t *this, loc_t *loc, local->dict = dict_new (); 
BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); - local->dloc = CALLOC (1, sizeof (loc_t)); + local->dloc = GF_CALLOC (1, sizeof (loc_t), gf_bd_loc_t); BD_VALIDATE_MEM_ALLOC (local->dloc, op_errno, out); strncpy (param, local->data->data, local->data->len); @@ -1923,7 +1924,7 @@ bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt)); out: - FREE (valid); + GF_FREE (valid); BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, postbuf, xdata); return 0; @@ -1948,7 +1949,7 @@ bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, local = bd_local_init (frame, this); BD_VALIDATE_MEM_ALLOC (local, op_errno, out); - ck_valid = CALLOC (1, sizeof (valid)); + ck_valid = GF_CALLOC (1, sizeof (valid), gf_bd_int32_t); BD_VALIDATE_MEM_ALLOC (ck_valid, op_errno, out); local->inode = inode_ref (loc->inode); @@ -2268,6 +2269,7 @@ mem_acct_init (xlator_t *this) if (ret != 0) gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" "failed"); + return ret; } diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h index f59bc6a09..5307ca407 100644 --- a/xlators/storage/bd/src/bd.h +++ b/xlators/storage/bd/src/bd.h @@ -92,13 +92,6 @@ typedef char bd_gfid_t[GF_UUID_BUF_SIZE]; -enum gf_bd_mem_types_ { - gf_bd_private = gf_common_mt_end + 1, - gf_bd_attr, - gf_bd_fd, - gf_bd_mt_end -}; - /** * bd_fd - internal structure */ -- cgit From 6224e878cdf780360b49760c4b0c20584bbc0b6f Mon Sep 17 00:00:00 2001 From: Luis Pabon Date: Sun, 16 Mar 2014 23:07:19 -0400 Subject: build: Remove cmockery2 from repo While we wait for cmockery2 to be available from Fedora, we can remove cmockery2 from the repo. 
BUG: 1077011 Change-Id: I75d462c607cd376a5d838ea83f4d12eb59757e73 Signed-off-by: Luis Pabon Reviewed-on: http://review.gluster.org/7281 Reviewed-by: Justin Clift Tested-by: Gluster Build System Reviewed-by: Harshavardhana Reviewed-by: Niels de Vos Reviewed-by: Anand Avati --- xlators/cluster/dht/src/Makefile.am | 15 --------------- xlators/cluster/dht/src/dht-layout.c | 21 --------------------- 2 files changed, 36 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index 3032705b5..174bea841 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -36,18 +36,3 @@ uninstall-local: install-data-hook: ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so - -#### UNIT TESTS ##### -CLEANFILES += *.gcda *.gcno *_xunit.xml -noinst_PROGRAMS = -TESTS = - -dht_layout_unittest_CPPFLAGS = $(UNITTEST_CPPFLAGS) $(AM_CPPFLAGS) -dht_layout_unittest_SOURCES = unittest/dht_layout_unittest.c \ - unittest/dht_layout_mock.c \ - dht-layout.c -dht_layout_unittest_CFLAGS = $(UNITTEST_CFLAGS) -dht_layout_unittest_LDADD = $(UNITTEST_LDADD) -dht_layout_unittest_LDFLAGS = $(UNITTEST_LDFLAGS) -noinst_PROGRAMS += dht_layout_unittest -TESTS += dht_layout_unittest diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index deaa493f9..e1a37b77c 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -25,29 +25,12 @@ #define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size)) -#include -#include - -// Change GF_CALLOC and GF_FREE to use -// cmockery2 memory allocation versions -#ifdef UNIT_TESTING -#undef GF_CALLOC -#define GF_CALLOC(n, s, t) test_calloc(n, s) -#undef GF_FREE -#define GF_FREE test_free -#endif - - - dht_layout_t * dht_layout_new (xlator_t *this, int cnt) { dht_layout_t *layout = NULL; dht_conf_t *conf = NULL; - REQUIRE(NULL != this); - REQUIRE(cnt >= 0); - conf = this->private; layout = 
GF_CALLOC (1, layout_size (cnt), @@ -66,10 +49,6 @@ dht_layout_new (xlator_t *this, int cnt) layout->ref = 1; - ENSURE(NULL != layout); - ENSURE(layout->type == DHT_HASH_TYPE_DM); - ENSURE(layout->cnt == cnt); - ENSURE(layout->ref == 1); out: return layout; } -- cgit From 128863af2d7f37571583fe37424e76b46f8525d4 Mon Sep 17 00:00:00 2001 From: Kotresh H R Date: Tue, 18 Mar 2014 14:45:42 +0530 Subject: geo-rep/glusterd: Fix geo-rep status on introduction of volume lock Getting op context in 'glusterd_op_gsync_set' is no longer valid as it is expected that 'rsp_dict' sent from caller is filled. It was fine till now as no one was setting the op context. The introduction of volume locks sets it, consequently breaking geo-rep status command. Hence the code that gets dict from op context if present is removed. Also corrected some indentation issues in 'glusterd_op_gsync_set' Signed-off-by: Kotresh H R Change-Id: Ieacd6e6c9be3c92159f849caca2acf5aabca1e32 BUG: 1077697 Signed-off-by: Kotresh H R Reviewed-on: http://review.gluster.org/7289 Reviewed-by: Avra Sengupta Tested-by: Gluster Build System --- xlators/mgmt/glusterd/src/glusterd-geo-rep.c | 31 +++++++++++++--------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'xlators') diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c index 9208ece2d..9433a128e 100644 --- a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c +++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c @@ -3900,8 +3900,6 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) { int32_t ret = -1; int32_t type = -1; - dict_t *ctx = NULL; - dict_t *resp_dict = NULL; char *host_uuid = NULL; char *slave = NULL; char *slave_ip = NULL; @@ -3919,6 +3917,7 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) GF_ASSERT (THIS->private); GF_ASSERT (dict); GF_ASSERT (op_errstr); + GF_ASSERT (rsp_dict); priv = THIS->private; @@ -3930,12 +3929,8 @@ 
glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) if (ret < 0) goto out; - ctx = glusterd_op_get_ctx (); - resp_dict = ctx ? ctx : rsp_dict; - GF_ASSERT (resp_dict); - if (type == GF_GSYNC_OPTION_TYPE_STATUS) { - ret = glusterd_get_gsync_status (dict, op_errstr, resp_dict); + ret = glusterd_get_gsync_status (dict, op_errstr, rsp_dict); goto out; } @@ -3965,8 +3960,8 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) if (dict_get_str (dict, "master", &volname) == 0) { ret = glusterd_volinfo_find (volname, &volinfo); if (ret) { - gf_log ("", GF_LOG_WARNING, "Volinfo for %s (master) not found", - volname); + gf_log ("", GF_LOG_WARNING, "Volinfo for %s (master)" + " not found", volname); goto out; } @@ -3975,9 +3970,9 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) if (type == GF_GSYNC_OPTION_TYPE_CONFIG) { ret = glusterd_gsync_configure (volinfo, slave, path_list, - dict, resp_dict, op_errstr); + dict, rsp_dict, op_errstr); if (!ret) { - ret = dict_set_str (resp_dict, "conf_path", conf_path); + ret = dict_set_str (rsp_dict, "conf_path", conf_path); if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to store conf_file_path."); @@ -3994,7 +3989,7 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) ret = glusterd_gsync_delete (volinfo, slave, slave_ip, slave_vol, path_list, dict, - resp_dict, op_errstr); + rsp_dict, op_errstr); goto out; } @@ -4009,8 +4004,9 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) ret = glusterd_set_gsync_confs (volinfo); if (ret != 0) { - gf_log ("", GF_LOG_WARNING, "marker/changelog start failed"); - *op_errstr = gf_strdup ("failed to initialize indexing"); + gf_log ("", GF_LOG_WARNING, "marker/changelog" + " start failed"); + *op_errstr = gf_strdup ("Index initialization failed"); ret = -1; goto out; } @@ -4031,9 +4027,10 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) goto out; } - ret = 
stop_gsync (volname, slave, &status_msg, conf_path, is_force); + ret = stop_gsync (volname, slave, &status_msg, + conf_path, is_force); if (ret == 0 && status_msg) - ret = dict_set_str (resp_dict, "gsync-status", + ret = dict_set_str (rsp_dict, "gsync-status", status_msg); if (ret != 0 && !is_force && path_list) *op_errstr = gf_strdup ("internal error"); @@ -4041,7 +4038,7 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict) if (!ret) { ret = glusterd_create_status_file (volinfo->volname, slave, slave_ip, - slave_vol, "Stopped"); + slave_vol,"Stopped"); if (ret) { gf_log ("", GF_LOG_ERROR, "Unable to update" "state_file. Error : %s", -- cgit From eb87c96f49b3dd2c7460e58c54ce909c706cd475 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Wed, 19 Mar 2014 18:03:54 +0100 Subject: build: do not create versioned .so files There has been a misspelled option in the Makefile.am files. The option is called -avoid-version, and not -avoidversion. It is not trivial to provide a test-case for this. 
One way would be to check generated RPMs with a command like this (output should be empty): $ rpm -qlp *.rpm | grep -E '/xlator/.+.so.0' Change-Id: I2a6cc557eada4d098b73af5a254f8c75707543da BUG: 1078365 Signed-off-by: Niels de Vos Reviewed-on: http://review.gluster.org/7299 Reviewed-by: Lalatendu Mohanty Reviewed-by: Kaleb KEITHLEY Tested-by: Gluster Build System --- xlators/encryption/crypt/src/Makefile.am | 2 +- xlators/features/changelog/src/Makefile.am | 2 +- xlators/features/compress/src/Makefile.am | 2 +- xlators/features/protect/src/Makefile.am | 6 +++--- xlators/performance/readdir-ahead/src/Makefile.am | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'xlators') diff --git a/xlators/encryption/crypt/src/Makefile.am b/xlators/encryption/crypt/src/Makefile.am index faadd117f..b13f65043 100644 --- a/xlators/encryption/crypt/src/Makefile.am +++ b/xlators/encryption/crypt/src/Makefile.am @@ -3,7 +3,7 @@ if ENABLE_CRYPT_XLATOR xlator_LTLIBRARIES = crypt.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/encryption -crypt_la_LDFLAGS = -module -avoidversion -lssl -lcrypto +crypt_la_LDFLAGS = -module -avoid-version -lssl -lcrypto crypt_la_SOURCES = keys.c data.c metadata.c atom.c crypt.c crypt_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am index e85031ad4..54c21ac21 100644 --- a/xlators/features/changelog/src/Makefile.am +++ b/xlators/features/changelog/src/Makefile.am @@ -5,7 +5,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \ changelog-misc.h changelog-encoders.h changelog-notifier.h -changelog_la_LDFLAGS = -module -avoidversion +changelog_la_LDFLAGS = -module -avoid-version changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \ changelog-encoders.c changelog-notifier.c diff --git 
a/xlators/features/compress/src/Makefile.am b/xlators/features/compress/src/Makefile.am index 4a64b52a9..0bf757c06 100644 --- a/xlators/features/compress/src/Makefile.am +++ b/xlators/features/compress/src/Makefile.am @@ -4,7 +4,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features noinst_HEADERS = cdc.h cdc-mem-types.h -cdc_la_LDFLAGS = -module -avoidversion $(LIBZ_LIBS) +cdc_la_LDFLAGS = -module -avoid-version $(LIBZ_LIBS) cdc_la_SOURCES = cdc.c cdc-helper.c cdc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la diff --git a/xlators/features/protect/src/Makefile.am b/xlators/features/protect/src/Makefile.am index 7eb93f32e..968e88c45 100644 --- a/xlators/features/protect/src/Makefile.am +++ b/xlators/features/protect/src/Makefile.am @@ -2,15 +2,15 @@ xlator_LTLIBRARIES = prot_dht.la prot_client.la prot_server.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features -prot_dht_la_LDFLAGS = -module -avoidversion +prot_dht_la_LDFLAGS = -module -avoid-version prot_dht_la_SOURCES = prot_dht.c prot_dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -prot_client_la_LDFLAGS = -module -avoidversion +prot_client_la_LDFLAGS = -module -avoid-version prot_client_la_SOURCES = prot_client.c prot_client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -prot_server_la_LDFLAGS = -module -avoidversion +prot_server_la_LDFLAGS = -module -avoid-version prot_server_la_SOURCES = prot_server.c prot_server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am index cdabd1428..539d6ede4 100644 --- a/xlators/performance/readdir-ahead/src/Makefile.am +++ b/xlators/performance/readdir-ahead/src/Makefile.am @@ -1,7 +1,7 @@ xlator_LTLIBRARIES = readdir-ahead.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance -readdir_ahead_la_LDFLAGS = -module -avoidversion +readdir_ahead_la_LDFLAGS = 
-module -avoid-version readdir_ahead_la_SOURCES = readdir-ahead.c readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -- cgit From 6d3739292b7b51d2ddbab75b5f884fb38925b943 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Thu, 16 Jan 2014 16:14:36 -0800 Subject: cluster/afr: refactor - Remove client side self-healing completely (opendir, openfd, lookup) - Re-work readdir-failover to work reliably in case of NFS - Remove unused/dead lock recovery code - Consistently use xdata in both calls and callbacks in all FOPs - Per-inode event generation, used to force inode ctx refresh - Implement dirty flag support (in place of pending counts) - Eliminate inode ctx structure, use read subvol bits + event_generation - Implement inode ctx refreshing based on event generation - Provide backward compatibility in transactions - remove unused variables and functions - make code more consistent in style and pattern - regularize and clean up inode-write transaction code - regularize and clean up dir-write transaction code - regularize and clean up common FOPs - reorganize transaction framework code - skip setting xattrs in pending dict if nothing is pending - re-write self-healing code using syncops - re-write simpler self-heal-daemon Change-Id: I1e4080c9796c8a2815c2dab4be3073f389d614a8 BUG: 1021686 Signed-off-by: Anand Avati Reviewed-on: http://review.gluster.org/6010 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/afr/src/Makefile.am | 21 +- xlators/cluster/afr/src/afr-common.c | 4351 ++++++++------------- xlators/cluster/afr/src/afr-dir-read.c | 629 ++- xlators/cluster/afr/src/afr-dir-write.c | 1649 +++----- xlators/cluster/afr/src/afr-inode-read.c | 1055 ++--- xlators/cluster/afr/src/afr-inode-write.c | 2625 ++++--------- xlators/cluster/afr/src/afr-lk-common.c | 509 +-- xlators/cluster/afr/src/afr-mem-types.h | 6 +- xlators/cluster/afr/src/afr-open.c | 245 +- xlators/cluster/afr/src/afr-read-txn.c | 239 ++ 
xlators/cluster/afr/src/afr-self-heal-algorithm.c | 837 ---- xlators/cluster/afr/src/afr-self-heal-algorithm.h | 32 - xlators/cluster/afr/src/afr-self-heal-common.c | 3287 ++++------------ xlators/cluster/afr/src/afr-self-heal-common.h | 144 - xlators/cluster/afr/src/afr-self-heal-data.c | 2094 +++------- xlators/cluster/afr/src/afr-self-heal-entry.c | 2787 +++---------- xlators/cluster/afr/src/afr-self-heal-metadata.c | 969 ++--- xlators/cluster/afr/src/afr-self-heal-name.c | 457 +++ xlators/cluster/afr/src/afr-self-heal.h | 162 +- xlators/cluster/afr/src/afr-self-heald.c | 2643 +++++-------- xlators/cluster/afr/src/afr-self-heald.h | 95 +- xlators/cluster/afr/src/afr-transaction.c | 1457 +++---- xlators/cluster/afr/src/afr-transaction.h | 26 +- xlators/cluster/afr/src/afr.c | 144 +- xlators/cluster/afr/src/afr.h | 823 ++-- xlators/cluster/afr/src/pump.c | 602 +-- xlators/cluster/afr/src/pump.h | 3 + xlators/cluster/dht/src/dht-common.c | 2 +- xlators/cluster/stripe/src/stripe.c | 2 +- xlators/features/index/src/index.c | 398 +- xlators/features/index/src/index.h | 14 - 31 files changed, 8896 insertions(+), 19411 deletions(-) create mode 100644 xlators/cluster/afr/src/afr-read-txn.c delete mode 100644 xlators/cluster/afr/src/afr-self-heal-algorithm.c delete mode 100644 xlators/cluster/afr/src/afr-self-heal-algorithm.h delete mode 100644 xlators/cluster/afr/src/afr-self-heal-common.h create mode 100644 xlators/cluster/afr/src/afr-self-heal-name.c (limited to 'xlators') diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index 35d18a6c0..ea5a90abb 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -2,24 +2,26 @@ xlator_LTLIBRARIES = afr.la pump.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ - afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \ - afr-self-heal-common.c 
afr-self-heal-metadata.c afr-self-heal-entry.c \ - afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ + afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \ + afr-read-txn.c \ $(top_builddir)/xlators/lib/src/libxlator.c +AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \ + afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \ + afr-self-heal-name.c + afr_la_LDFLAGS = -module -avoid-version -afr_la_SOURCES = $(afr_common_source) afr.c +afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la pump_la_LDFLAGS = -module -avoid-version -pump_la_SOURCES = $(afr_common_source) pump.c +pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \ - afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \ - afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \ - afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \ - $(top_builddir)/glusterfsd/src/glusterfsd.h + afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \ + afr-common.c afr-self-heald.h pump.h \ + $(top_builddir)/xlators/lib/src/libxlator.h AM_CPPFLAGS = $(GF_CPPFLAGS) \ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \ @@ -31,7 +33,6 @@ CLEANFILES = uninstall-local: rm -f $(DESTDIR)$(xlatordir)/replicate.so - rm -f $(DESTDIR)$(xlatordir)/pump.so install-data-hook: ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 224d30546..2bab0f853 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -45,787 +45,797 @@ #include "afr-dir-write.h" #include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" #include 
"afr-self-heald.h" -#include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL -#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL -#define AFR_STATISTICS_HISTORY_SIZE 50 -int -afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) - dst[i] = src[i]; -} -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path) +call_frame_t * +afr_copy_frame (call_frame_t *base) { - int i = 0; - afr_private_t *priv = NULL; - int ret = 0; + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; - priv = this->private; + frame = copy_frame (base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + AFR_STACK_DESTROY (frame); + return NULL; + } - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, priv->pending_key[i], - 3 * sizeof(int32_t)); - if (ret < 0) - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - path, priv->pending_key[i]); - /* 3 = data+metadata+entry */ - } - ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless " - "lookup", path); - } + return frame; } +/* + * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: + * + * |<---------- 64bit ------------>| + * 63 32 31 16 15 0 + * | EVENT_GEN | DATA | METADATA | + * + * + * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which + * metadata can be attempted to be read. + * + * bit-0 => priv->subvolumes[0] + * bit-1 => priv->subvolumes[1] + * ... etc. till bit-15 + * + * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data + * can be attempted to be read. + * + * bit-16 => priv->subvolumes[0] + * bit-17 => priv->subvolumes[1] + * ... etc. 
till bit-31 + * + * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation) + * when DATA and METADATA was last updated. + * + * If EVENT_GEN is < priv->event_generation, + * or is 0, it means afr_inode_refresh() needs + * to be called to recalculate the bitmaps. + */ + int -afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, - dict_t *xattr_req, loc_t *loc, void **gfid_req) +__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int ret = -ENOMEM; + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; - GF_ASSERT (gfid_req); + priv = this->private; - *gfid_req = NULL; - local->xattr_req = dict_new (); - if (!local->xattr_req) - goto out; - if (xattr_req) - dict_copy (xattr_req, local->xattr_req); + ret = __inode_ctx_get (inode, this, &val); + if (ret < 0) + return ret; - afr_xattr_req_prepare (this, local->xattr_req, loc->path); - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; - ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_PARENT_ENTRYLK); - } + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } - ret = dict_get_ptr 
(local->xattr_req, "gfid-req", gfid_req); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: failed to get the gfid from dict", loc->path); - *gfid_req = NULL; - } else { - if (loc->parent != NULL) - dict_del (local->xattr_req, "gfid-req"); - } - ret = 0; -out: - return ret; + if (event_p) + *event_p = event; + return ret; } -void -afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc) -{ - inode_t *inode = NULL; - - inode = loc->inode; - if (inode && !uuid_is_null (inode->gfid)) - uuid_copy (dst, inode->gfid); - else if (!uuid_is_null (loc->gfid)) - uuid_copy (dst, loc->gfid); - else if (new && !uuid_is_null (new)) - uuid_copy (dst, new); -} int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno) -{ - int i = 0; - int errno_count = 0; - int child = 0; +__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) +{ + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - if (child_errno[child] == op_errno) - errno_count++; - } - return errno_count; -} + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid) -{ - int ret = 0; - uuid_t *pgfid = NULL; + return __inode_ctx_set (inode, this, &val); +} - GF_ASSERT (gfid); - pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); - if (!pgfid) { - ret = -1; - goto out; - } +int +__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) +{ + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t 
val = 0; - uuid_copy (*pgfid, gfid); + ret = __inode_ctx_get (inode, this, &val); + (void) ret; - ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); - if (ret) - gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + metadatamap = (val & 0x000000000000ffff) >> 0; + datamap = (val & 0x00000000ffff0000) >> 16; + event = 0; -out: - if (ret && pgfid) - GF_FREE (pgfid); + val = ((uint64_t) metadatamap) | + (((uint64_t) datamap) << 16) | + (((uint64_t) event) << 32); - return ret; + return __inode_ctx_set (inode, this, &val); } -void -afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) -{ - if (!ctx) - return; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -} -afr_inode_ctx_t* -__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) { - int ret = 0; - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + int ret = -1; - priv = this->private; - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - if (ctx_addr != 0) { - ctx = (afr_inode_ctx_t*) (long) ctx_addr; - goto out; - } - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto fail; - ctx->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*ctx->fresh_children), - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto fail; - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - goto fail; - } + priv = this->private; -out: - return ctx; + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small (inode, this, data, + metadata, event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; -fail: - afr_inode_ctx_destroy (ctx); - return NULL; + return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get 
(inode_t *inode, xlator_t *this) + +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + int ret = -1; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - } - UNLOCK (&inode->lock); - return ctx; + priv = this->private; + + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small (inode, this, data, + metadata, event); + else + ret = -1; + + return ret; } -void -afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, - afr_inode_params_t *params) + +int +__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - GF_ASSERT (inode); - GF_ASSERT (params); + afr_private_t *priv = NULL; + int ret = -1; - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int i = 0; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + priv = this->private; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_GET_READ_CTX: - fresh_children = params->u.read_ctx.children; - read_child = (int32_t)(ctx->masks & - AFR_ICTX_READ_CHILD_MASK); - params->u.read_ctx.read_child = read_child; - if (!fresh_children) - goto unlock; - for (i = 0; i < priv->child_count; i++) - fresh_children[i] = ctx->fresh_children[i]; - break; - case AFR_INODE_GET_OPENDIR_DONE: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) - params->u.value = _gf_true; - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_reset_small (inode, this); + else + ret = -1; + + return ret; } -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - afr_inode_ctx_t *ctx = 
NULL; - gf_boolean_t spb = _gf_false; + int ret = -1; - ctx = afr_inode_ctx_get (inode, this); - if (!ctx) - goto out; - if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) - spb = _gf_true; -out: - return spb; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get (inode, this, data, + metadata, event_p); + } + UNLOCK(&inode->lock); + + return ret; } -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) + +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_params_t params = {0}; + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set (inode, this, data, metadata, + event); + } + UNLOCK(&inode->lock); - params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx_params (this, inode, &params); - return params.u.value; + return ret; } -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) + +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) { - afr_inode_params_t params = {0}; + int ret = -1; - params.op = AFR_INODE_GET_READ_CTX; - params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx_params (this, inode, &params); - return params.u.read_ctx.read_child; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_reset (inode, this); + } + UNLOCK(&inode->lock); + + return ret; } -void -afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); - mask = (AFR_ICTX_READ_CHILD_MASK & read_child); - ctx->masks = remaining_mask | mask; -} +int +afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type (type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count;
i++) { + ret = dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy (pending, pending_raw, sizeof(pending)); + + if (ntoh32 (pending[idx])) + accused[i] = 1; + } -void -afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, - int32_t *fresh_children, int32_t child_count) -{ - int i = 0; - - afr_inode_ctx_set_read_child (ctx, read_child); - for (i = 0; i < child_count; i++) { - if (fresh_children) - ctx->fresh_children[i] = fresh_children[i]; - else - ctx->fresh_children[i] = -1; - } + return 0; } -void -afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children, - int32_t child_count) + +int +afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) { - int i = 0; - int32_t read_child = -1; + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; - GF_ASSERT (stale_children); - for (i = 0; i < child_count; i++) { - if (stale_children[i] == -1) - break; - afr_children_rm_child (ctx->fresh_children, - stale_children[i], child_count); - } - read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK); - if (!afr_is_child_present (ctx->fresh_children, child_count, - read_child)) - afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]); -} + priv = this->private; -void -afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } - remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); - ctx->masks = remaining_mask | mask; + return 0; } -void -afr_inode_set_ctx_params (xlator_t *this, 
inode_t *inode, - afr_inode_params_t *params) -{ - GF_ASSERT (inode); - GF_ASSERT (params); - afr_inode_ctx_t *ctx = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - int32_t *stale_children = NULL; +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; - priv = this->private; - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - if (!ctx) - goto unlock; - switch (params->op) { - case AFR_INODE_SET_READ_CTX: - read_child = params->u.read_ctx.read_child; - fresh_children = params->u.read_ctx.children; - afr_inode_ctx_set_read_ctx (ctx, read_child, - fresh_children, - priv->child_count); - break; - case AFR_INODE_RM_STALE_CHILDREN: - stale_children = params->u.read_ctx.children; - afr_inode_ctx_rm_stale_children (ctx, - stale_children, - priv->child_count); - break; - case AFR_INODE_SET_OPENDIR_DONE: - afr_inode_ctx_set_opendir_done (ctx); - break; - default: - GF_ASSERT (0); - break; - } - } -unlock: - UNLOCK (&inode->lock); -} + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_accused = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb) -{ - afr_inode_ctx_t *ctx = NULL; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + 
data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } - ctx = afr_inode_ctx_get (inode, this); - if (mdata_spb != DONT_KNOW) - ctx->mdata_spb = mdata_spb; - if (data_spb != DONT_KNOW) - ctx->data_spb = data_spb; -} + if (replies[i].op_ret == -1) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } -void -afr_set_opendir_done (xlator_t *this, inode_t *inode) -{ - afr_inode_params_t params = {0}; + afr_accused_fill (this, replies[i].xdata, data_accused, + (inode->ia_type == IA_IFDIR) ? + AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); - params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx_params (this, inode, &params); -} + afr_accused_fill (this, replies[i].xdata, + metadata_accused, AFR_METADATA_TRANSACTION); -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children) -{ - afr_inode_params_t params = {0}; - afr_private_t *priv = NULL; + } - priv = this->private; - GF_ASSERT (read_child >= 0); - GF_ASSERT (fresh_children); - GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count, - read_child)); - - params.op = AFR_INODE_SET_READ_CTX; - params.u.read_ctx.read_child = read_child; - params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx_params (this, inode, &params); + if (inode->ia_type != IA_IFDIR) + afr_accuse_smallfiles (this, replies, data_accused); + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + + afr_inode_read_subvol_set (inode, this, data_readable, + metadata_readable, event_generation); + return ret; } -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, - int32_t *stale_children) -{ - afr_inode_params_t params = {0}; - GF_ASSERT (stale_children); - params.op = AFR_INODE_RM_STALE_CHILDREN; - params.u.read_ctx.children = stale_children; - afr_inode_set_ctx_params (this, inode, &params); +int
+afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +{ + if (heal) + STACK_DESTROY (heal->root); + return 0; } -gf_boolean_t -afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +int +afr_inode_refresh_err (call_frame_t *frame, xlator_t *this) { - gf_boolean_t source_xattrs = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; - GF_ASSERT (child < child_count); + local = frame->local; + priv = this->private; - if ((child >= 0) && (child < child_count) && - sources[child]) { - source_xattrs = _gf_true; - } - return source_xattrs; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } + + err = afr_final_errno (local, priv); +ret: + return -err; } -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child) + +int +afr_refresh_selfheal_wrap (void *opaque) { - gf_boolean_t success_child = _gf_false; - int i = 0; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + int err = 0; - GF_ASSERT (child < child_count); + local = frame->local; + this = frame->this; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (child == success_children[i]) { - success_child = _gf_true; - break; - } - } - return success_child; -} + afr_selfheal (frame->this, local->refreshinode->gfid); -gf_boolean_t -afr_is_read_child (int32_t *success_children, int32_t *sources, - int32_t child_count, int32_t child) -{ - gf_boolean_t success_child = _gf_false; - gf_boolean_t source = _gf_false; + afr_selfheal_unlocked_discover (frame, local->refreshinode, + local->refreshinode->gfid, + local->replies); - if (child < 0) { - return _gf_false; - } + afr_replies_interpret (frame, this, local->refreshinode); - GF_ASSERT (success_children); - GF_ASSERT (child_count > 0); + err = afr_inode_refresh_err (frame, this); - success_child = 
afr_is_child_present (success_children, child_count, - child); - if (!success_child) - goto out; - if (NULL == sources) { - source = _gf_true; - goto out; - } - source = afr_is_source_child (sources, child_count, child); -out: - return (success_child && source); + afr_replies_wipe (local, this->private); + + local->refreshfn (frame, this, err); + + return 0; } -int32_t -afr_hash_child (int32_t *success_children, int32_t child_count, - unsigned int hmode, uuid_t gfid) + +gf_boolean_t +afr_selfheal_enabled (xlator_t *this) { - uuid_t gfid_copy = {0,}; - pid_t pid; + afr_private_t *priv = NULL; + gf_boolean_t data = _gf_false; - if (!hmode) { - return -1; - } + priv = this->private; - if (gfid) { - uuid_copy(gfid_copy,gfid); - } - if (hmode > 1) { - /* - * Why getpid? Because it's one of the cheapest calls - * available - faster than gethostname etc. - and returns a - * constant-length value that's sure to be shorter than a UUID. - * It's still very unlikely to be the same across clients, so - * it still provides good mixing. We're not trying for - * perfection here. All we need is a low probability that - * multiple clients won't converge on the same subvolume. - */ - pid = getpid(); - memcpy (gfid_copy, &pid, sizeof(pid)); - } + gf_string2boolean (priv->data_self_heal, &data); - return SuperFastHash((char *)gfid_copy, - sizeof(gfid_copy)) % child_count; + return data || priv->metadata_self_heal || priv->entry_self_heal; } -/* If sources is NULL the xattrs are assumed to be of source for all - * success_children. 
- */ + + int -afr_select_read_child_from_policy (int32_t *success_children, - int32_t child_count, int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid) +afr_inode_refresh_done (call_frame_t *frame, xlator_t *this) { - int32_t read_child = -1; - int i = 0; + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; + int err = 0; - GF_ASSERT (success_children); + local = frame->local; - read_child = config_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + ret = afr_replies_interpret (frame, this, local->refreshinode); - read_child = prev_read_child; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; + err = afr_inode_refresh_err (frame, this); - read_child = afr_hash_child (success_children, child_count, - hmode, gfid); - if (afr_is_read_child (success_children, sources, child_count, - read_child)) { - goto out; - } + afr_replies_wipe (local, this->private); - for (i = 0; i < child_count; i++) { - read_child = success_children[i]; - if (read_child < 0) - break; - if (afr_is_read_child (success_children, sources, child_count, - read_child)) - goto out; - } - read_child = -1; + if (ret && afr_selfheal_enabled (this)) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto refresh_done; + } else { + refresh_done: + local->refreshfn (frame, this, err); + } -out: - return read_child; + return 0; } -/* This function should be used when all the success_children are sources - */ -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child, uuid_t gfid) + +int +afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct 
iatt *buf, dict_t *xdata, struct iatt *par) { - int read_child = -1; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int call_child = (long) cookie; + int call_count = 0; - priv = this->private; - read_child = afr_select_read_child_from_policy (fresh_children, - priv->child_count, - prev_read_child, - config_read_child, - NULL, - priv->hash_mode, gfid); - if (read_child >= 0) - afr_inode_set_read_ctx (this, inode, read_child, - fresh_children); -} + local = frame->local; -/* afr_next_call_child () - * This is a common function used by all the read-type fops - * This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child) + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + local->replies[call_child].postparent = *par; + local->replies[call_child].xdata = dict_ref (xdata); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_inode_refresh_done (frame, this); + + return 0; +} + + +int +afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, dict_t *xdata) { - int next_index = 0; - int32_t next_call_child = -1; + loc_t loc = {0, }; + afr_private_t *priv = NULL; - GF_ASSERT (last_index); + priv = this->private; - next_index = *last_index; -retry: - next_index++; - if ((next_index >= child_count) || - (fresh_children[next_index] == -1)) - goto out; - if ((fresh_children[next_index] == read_child) || - (!child_up[fresh_children[next_index]])) - goto retry; - *last_index = 
next_index; - next_call_child = fresh_children[next_index]; -out: - return next_call_child; + loc.inode = inode; + uuid_copy (loc.gfid, inode->gfid); + + STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; } - /* This function should not be called with the inode's read_children array. - * The fop's handler should make a copy of the inode's read_children, - * preferred read_child into the local vars, because while this function is - * in execution there is a chance for inode's read_ctx to change. - */ -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index) + +int +afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) { - int ret = 0; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t *xdata = NULL; - GF_ASSERT (child_up); - GF_ASSERT (call_child); - GF_ASSERT (last_index); - GF_ASSERT (fresh_children); + priv = this->private; + local = frame->local; - if (read_child < 0) { - ret = -EIO; - goto out; - } - priv = this->private; - *call_child = -1; - *last_index = -1; + afr_replies_wipe (local, priv); - if (child_up[read_child]) { - *call_child = read_child; - } else { - for (i = 0; i < priv->child_count; i++) { - if (fresh_children[i] == -1) - break; - if (child_up[fresh_children[i]]) { - *call_child = fresh_children[i]; - ret = 0; - break; - } - } + xdata = dict_new (); + if (!xdata) { + afr_inode_refresh_done (frame, this); + return 0; + } - if (*call_child == -1) { - ret = -ENOTCONN; - goto out; - } + if (afr_xattr_req_prepare (this, xdata) != 0) { + dict_unref (xdata); + afr_inode_refresh_done (frame, this); + return 0; + } + + local->call_count = AFR_COUNT (local->child_up, priv->child_count); + + call_count = local->call_count; + for (i = 0; i < 
priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + afr_inode_refresh_subvol (frame, this, i, local->refreshinode, + xdata); + + if (!--call_count) + break; + } + + dict_unref (xdata); + + return 0; +} + + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t refreshfn) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->refreshfn = refreshfn; - *last_index = i; + if (local->refreshinode) { + inode_unref (local->refreshinode); + local->refreshinode = NULL; + } + + local->refreshinode = inode_ref (inode); + + afr_inode_refresh_do (frame, this); + + return 0; +} + + +int +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64 (xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value for %s", + priv->pending_key[i]); + /* 3 = data+metadata+entry */ } -out: - gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " - "last_index: %d", ret, *call_child, *last_index); - return ret; + ret = dict_set_uint64 (xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty " + "query flag"); + } + + return ret; } -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count) +int +afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) { - unsigned int i = 0; + int ret = -ENOMEM; - if (!xattr) + local->xattr_req = dict_new (); + if (!local->xattr_req) goto out; - for (i = 0; i < child_count; i++) { - if (xattr[i]) { - dict_unref (xattr[i]); - xattr[i] = NULL; - } + if (xattr_req) + dict_copy (xattr_req, local->xattr_req); + + ret = afr_xattr_req_prepare (this, local->xattr_req); + if (ret < 0) { + gf_log (this->name, 
GF_LOG_WARNING, + "%s: Unable to prepare xattr_req", loc->path); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_ENTRYLK_COUNT); } + + ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s: Unable to set dict value for %s", + loc->path, GLUSTERFS_PARENT_ENTRYLK); + } + + ret = 0; out: - return; + return ret; } -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count) -{ - afr_reset_xattr (xattr, child_count); - GF_FREE (xattr); -} -void -afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +int +afr_hash_child (inode_t *inode, int32_t child_count, int hashmode) { - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; + uuid_t gfid_copy = {0,}; + pid_t pid; - sh = &local->self_heal; - priv = this->private; + if (!hashmode) { + return -1; + } + + if (inode) { + uuid_copy (gfid_copy, inode->gfid); + } + + if (hashmode > 1) { + /* + * Why getpid? Because it's one of the cheapest calls + * available - faster than gethostname etc. - and returns a + * constant-length value that's sure to be shorter than a UUID. + * It's still very unlikely to be the same across clients, so + * it still provides good mixing. We're not trying for + * perfection here. All we need is a low probability that + * multiple clients won't converge on the same subvolume. 
+ */ + pid = getpid(); + memcpy (gfid_copy, &pid, sizeof(pid)); + } + + return SuperFastHash((char *)gfid_copy, + sizeof(gfid_copy)) % child_count; +} - if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) - GF_FREE (sh->data_sh_info); - if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) - GF_FREE (sh->metadata_sh_info); +int +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable) +{ + afr_private_t *priv = NULL; + int read_subvol = -1; + int i = 0; - GF_FREE (sh->buf); + priv = this->private; - GF_FREE (sh->parentbufs); + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; - if (sh->inode) - inode_unref (sh->inode); + /* second preference - use hashed mode */ + read_subvol = afr_hash_child (inode, priv->child_count, + priv->hash_mode); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; - afr_xattr_array_destroy (sh->xattr, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } - GF_FREE (sh->child_errno); + /* no readable subvolumes, either split brain or all subvols down */ - afr_matrix_cleanup (sh->pending_matrix, priv->child_count); - afr_matrix_cleanup (sh->delta_matrix, priv->child_count); + return -1; +} - GF_FREE (sh->sources); - GF_FREE (sh->success); +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type) +{ + int ret = -1; - GF_FREE (sh->locked_nodes); + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get (inode, this, 0, readable, + event_p); + else + ret = afr_inode_read_subvol_get (inode, this, readable, 0, + event_p); + return ret; +} - if (sh->healing_fd) { - fd_unref (sh->healing_fd); - sh->healing_fd = NULL; - } - GF_FREE ((char *)sh->linkname); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, 
afr_transaction_type type) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; - GF_FREE (sh->success_children); + priv = this->private; - GF_FREE (sh->fresh_children); + readable = alloca0 (priv->child_count); + data_readable = alloca0 (priv->child_count); + metadata_readable = alloca0 (priv->child_count); + intersection = alloca0 (priv->child_count); - GF_FREE (sh->fresh_parent_dirs); + afr_inode_read_subvol_type_get (inode, this, readable, &event, type); - loc_wipe (&sh->parent_loc); - loc_wipe (&sh->lookup_loc); + afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, + &event); - GF_FREE (sh->checksum); + AFR_INTERSECT (intersection, data_readable, metadata_readable, + priv->child_count); - GF_FREE (sh->write_needed); - if (sh->healing_fd) - fd_unref (sh->healing_fd); + if (AFR_COUNT (intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy (inode, this, + intersection); + else + subvol = afr_read_subvol_select_by_policy (inode, this, + readable); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + return subvol; } @@ -838,8 +848,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) priv = this->private; afr_matrix_cleanup (local->pending, priv->child_count); - afr_matrix_cleanup (local->transaction.txn_changelog, - priv->child_count); GF_FREE (local->internal_lock.locked_nodes); @@ -860,7 +868,25 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); - GF_FREE (local->transaction.postop_piggybacked); +} + + +void +afr_replies_wipe (afr_local_t *local, afr_private_t *priv) +{ + int i; + + if (!local->replies) + return; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].xdata) { + 
dict_unref (local->replies[i].xdata); + local->replies[i].xdata = NULL; + } + } + + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); } @@ -872,7 +898,7 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (!local) return; - afr_local_sh_cleanup (local, this); + syncbarrier_destroy (&local->barrier); afr_local_transaction_cleanup (local, this); @@ -890,40 +916,26 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->dict) dict_unref (local->dict); + afr_replies_wipe (local, priv); GF_FREE(local->replies); GF_FREE (local->child_up); - GF_FREE (local->child_errno); - - GF_FREE (local->fresh_children); - - { /* lookup */ - if (local->cont.lookup.xattrs) { - afr_reset_xattr (local->cont.lookup.xattrs, - priv->child_count); - GF_FREE (local->cont.lookup.xattrs); - local->cont.lookup.xattrs = NULL; - } - - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - } + GF_FREE (local->read_attempted); - if (local->cont.lookup.inode) { - inode_unref (local->cont.lookup.inode); - } + GF_FREE (local->readable); - GF_FREE (local->cont.lookup.postparents); + if (local->inode) + inode_unref (local->inode); - GF_FREE (local->cont.lookup.bufs); + if (local->parent) + inode_unref (local->parent); - GF_FREE (local->cont.lookup.success_children); + if (local->parent2) + inode_unref (local->parent2); - GF_FREE (local->cont.lookup.sources); - afr_matrix_cleanup (local->cont.lookup.pending_matrix, - priv->child_count); - } + if (local->refreshinode) + inode_unref (local->refreshinode); { /* getxattr */ GF_FREE (local->cont.getxattr.name); @@ -980,1167 +992,260 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->cont.fxattrop.xattr) dict_unref (local->cont.fxattrop.xattr); } - { /* symlink */ - GF_FREE (local->cont.symlink.linkpath); - } - - { /* opendir */ - GF_FREE (local->cont.opendir.checksum); - } - - { /* readdirp */ - if (local->cont.readdir.dict) - dict_unref (local->cont.readdir.dict); - } - - if 
(local->xdata_req) - dict_unref (local->xdata_req); - - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); -} - - -int -afr_frame_return (call_frame_t *frame) -{ - afr_local_t *local = NULL; - int call_count = 0; - - local = frame->local; - - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); - - return call_count; -} - -int -afr_set_elem_count_get (unsigned char *elems, int child_count) -{ - int i = 0; - int ret = 0; - - for (i = 0; i < child_count; i++) - if (elems[i]) - ret++; - return ret; -} - -/** - * up_children_count - return the number of children that are up - */ - -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count) -{ - return afr_set_elem_count_get (child_up, child_count); -} - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count) -{ - return afr_set_elem_count_get (children, child_count); -} - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count) -{ - return afr_set_elem_count_get (pre_op, child_count); -} - -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this) -{ - uint64_t ctx = 0; - int32_t ret = 0; - - GF_ASSERT (loc); - GF_ASSERT (this); - GF_ASSERT (loc->inode); - - ret = inode_ctx_get (loc->inode, this, &ctx); - if (0 == ret) - return _gf_false; - return _gf_true; -} - -void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (loc); - GF_ASSERT (buf); - - uuid_copy (loc->gfid, buf->ia_gfid); - if (postparent) - uuid_copy (loc->pargfid, postparent->ia_gfid); -} - -/* - * Quota size xattrs are not maintained by afr. There is a - * possibility that they differ even when both the directory changelog xattrs - * suggest everything is fine. So if there is at least one 'source' check among - * the sources which has the maximum quota size. Otherwise check among all the - * available ones for maximum quota size. 
This way if there is a source and - * stale copies it always votes for the 'source'. - * */ - -static void -afr_handle_quota_size (afr_local_t *local, xlator_t *this, - dict_t *rsp_dict) -{ - int32_t *sources = NULL; - dict_t *xattr = NULL; - data_t *max_data = NULL; - int64_t max_quota_size = -1; - data_t *data = NULL; - int64_t *size = NULL; - int64_t quota_size = -1; - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - gf_boolean_t source_present = _gf_false; - - priv = this->private; - sources = local->cont.lookup.sources; - - if (rsp_dict == NULL) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " - "response dictionary", local->loc.path); - return; - } - - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - source_present = _gf_true; - break; - } - } - - for (i = 0; i < priv->child_count; i++) { - /* - * If there is at least one source lets check - * for maximum quota sizes among sources, otherwise take the - * maximum of the ones present to be on the safer side. 
- */ - if (source_present && !sources[i]) - continue; - - xattr = local->cont.lookup.xattrs[i]; - if (!xattr) - continue; - - data = dict_get (xattr, QUOTA_SIZE_KEY); - if (!data) - continue; - - size = (int64_t*)data->data; - quota_size = ntoh64(*size); - gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, - local->loc.path, i, quota_size); - if (quota_size > max_quota_size) { - if (max_data) - data_unref (max_data); - - max_quota_size = quota_size; - max_data = data_ref (data); - } - } - - if (max_data) { - ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "quota size", local->loc.path); - } - - data_unref (max_data); - } -} - -int -afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) -{ - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - dict_t **xattr = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - int ret = 0; - int i = 0; - - GF_ASSERT (local); - - buf = &local->cont.lookup.buf; - postparent = &local->cont.lookup.postparent; - xattr = &local->cont.lookup.xattr; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode, - local->fresh_children); - if (read_child < 0) { - ret = -1; - goto out; - } - success_children = local->cont.lookup.success_children; - sources = local->cont.lookup.sources; - memset (sources, 0, sizeof (*sources) * priv->child_count); - afr_children_intersection_get (local->fresh_children, success_children, - sources, priv->child_count); - if (!sources[read_child]) { - read_child = -1; - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - read_child = i; - break; - } - } - } - if (read_child < 0) { - ret = -1; - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", - read_child); - if (!*xattr) - *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); - - *buf = 
local->cont.lookup.bufs[read_child]; - *postparent = local->cont.lookup.postparents[read_child]; - - if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) - afr_handle_quota_size (local, this, *xattr); - - if (IA_INVAL == local->cont.lookup.inode->ia_type) { - /* fix for RT #602 */ - local->cont.lookup.inode->ia_type = buf->ia_type; - } -out: - return ret; -} - -static void -afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this, - int child_index, dict_t *xattr) -{ - uint32_t inodelk_count = 0; - uint32_t entrylk_count = 0; - int ret = -1; - uint32_t parent_entrylk = 0; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - GF_ASSERT (child_index >= 0); - - ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT, - &inodelk_count); - if (ret == 0) - local->inodelk_count += inodelk_count; - - ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT, - &entrylk_count); - if (ret == 0) - local->entrylk_count += entrylk_count; - ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK, - &parent_entrylk); - if (!ret) - local->cont.lookup.parent_entrylk += parent_entrylk; -} - -/* - * It's important to maintain a commutative property on do_*_self_heal and - * found*; once set, they must not be cleared by a subsequent iteration or - * call, so that they represent a logical OR of all iterations and calls - * regardless of child/key order. That allows the caller to call us multiple - * times without having to use a separate variable as a "reduce" accumulator. 
- */ -static void -afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, - dict_t *xattr) -{ - afr_private_t *priv = NULL; - int i = 0; - int ret = -1; - void *pending_raw = NULL; - int32_t *pending = NULL; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (xattr); - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - if (ret != 0) { - continue; - } - pending = pending_raw; - - if (pending[AFR_METADATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "metadata self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } - - if (pending[AFR_ENTRY_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "entry self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_entry_self_heal = _gf_true; - } - - if (pending[AFR_DATA_TRANSACTION]) { - gf_log(this->name, GF_LOG_DEBUG, - "data self-heal is pending for %s.", - local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } - } -} - -void -afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) -{ - int32_t *sources = NULL; - afr_private_t *priv = NULL; - int32_t subvol_status = 0; - int32_t *success_children = NULL; - dict_t **xattrs = NULL; - struct iatt *bufs = NULL; - int32_t **pending_matrix = NULL; - - priv = this->private; - - sources = GF_CALLOC (priv->child_count, sizeof (*sources), - gf_afr_mt_int32_t); - if (NULL == sources) - goto out; - success_children = local->cont.lookup.success_children; - xattrs = local->cont.lookup.xattrs; - bufs = local->cont.lookup.bufs; - pending_matrix = local->cont.lookup.pending_matrix; - afr_build_sources (this, xattrs, bufs, pending_matrix, - sources, success_children, AFR_METADATA_TRANSACTION, - &subvol_status, _gf_false); - if (subvol_status & SPLIT_BRAIN) - local->cont.lookup.possible_spb = _gf_true; -out: - GF_FREE (sources); -} - -static void 
-afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, - struct iatt *buf, struct iatt *lookup_buf) -{ - if (PERMISSION_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - gf_log (this->name, GF_LOG_DEBUG, - "permissions differ for %s ", local->loc.path); - local->self_heal.do_metadata_self_heal = _gf_true; - } - - if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { - /* mismatching permissions */ - local->self_heal.do_metadata_self_heal = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, - "ownership differs for %s ", local->loc.path); - } - - if (SIZE_DIFFERS (buf, lookup_buf) - && IA_ISREG (buf->ia_type)) { - gf_log (this->name, GF_LOG_DEBUG, - "size differs for %s ", local->loc.path); - local->self_heal.do_data_self_heal = _gf_true; - } - - if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { - /* mismatching gfid */ - gf_log (this->name, GF_LOG_DEBUG, - "%s: gfid different on subvolume", local->loc.path); - } -} - -static void -afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) -{ - gf_boolean_t split_brain = _gf_false; - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; - - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - split_brain = split_brain || local->cont.lookup.possible_spb; - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - sh->force_confirm_spb = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, - "split brain detected during lookup of %s.", - local->loc.path); - } -} - -static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) -{ - GF_ASSERT (local); - GF_ASSERT (this); - - if ((local->success_count > 0) && (local->enoent_count > 0)) { - local->self_heal.do_metadata_self_heal = _gf_true; - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_entry_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - local->self_heal.do_missing_entry_self_heal = _gf_true; - 
gf_log(this->name, GF_LOG_DEBUG, - "entries are missing in lookup of %s.", - local->loc.path); - } - - return; -} - -gf_boolean_t -afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) -{ - GF_ASSERT (sh); - GF_ASSERT (priv); - - if (sh->force_confirm_spb) - return _gf_true; - return (sh->do_gfid_self_heal - || sh->do_missing_entry_self_heal - || (afr_data_self_heal_enabled (priv->data_self_heal) && - sh->do_data_self_heal) - || (priv->metadata_self_heal && sh->do_metadata_self_heal) - || (priv->entry_self_heal && sh->do_entry_self_heal)); -} - -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type) -{ - afr_transaction_type type = AFR_METADATA_TRANSACTION; - - GF_ASSERT (ia_type != IA_INVAL); - - if (IA_ISDIR (ia_type)) { - type = AFR_ENTRY_TRANSACTION; - } else if (IA_ISREG (ia_type)) { - type = AFR_DATA_TRANSACTION; - } - return type; -} - -int -afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, - int32_t *read_child) -{ - ia_type_t ia_type = IA_INVAL; - int32_t source = -1; - int ret = -1; - dict_t **xattrs = NULL; - int32_t *success_children = NULL; - afr_transaction_type type = AFR_METADATA_TRANSACTION; - uuid_t *gfid = NULL; - - GF_ASSERT (local); - GF_ASSERT (this); - GF_ASSERT (local->success_count > 0); - - success_children = local->cont.lookup.success_children; - /*We can take the success_children[0] only because we already - *handle the conflicting children other wise, we could select the - *read_child based on wrong file type - */ - ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; - type = afr_transaction_type_get (ia_type); - xattrs = local->cont.lookup.xattrs; - gfid = &local->cont.lookup.buf.ia_gfid; - source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs, - type, *gfid); - if (source < 0) { - gf_log (this->name, GF_LOG_DEBUG, "failed to select source " - "for %s", local->loc.path); - goto out; - } - - gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s", - source, 
local->loc.path); - *read_child = source; - ret = 0; -out: - return ret; -} - -static inline gf_boolean_t -afr_is_transaction_running (afr_local_t *local) -{ - GF_ASSERT (local->fop == GF_FOP_LOOKUP); - return ((local->inodelk_count > 0) || (local->entrylk_count > 0)); -} - -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)) -{ - afr_local_t *local = NULL; - char sh_type_str[256] = {0,}; - char *bg = ""; - - GF_ASSERT (frame); - GF_ASSERT (this); - GF_ASSERT (inode); - GF_ASSERT (ia_type != IA_INVAL); - - local = frame->local; - local->self_heal.background = background; - local->self_heal.type = ia_type; - local->self_heal.unwind = unwind; - local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk; - - afr_self_heal_type_str_get (&local->self_heal, - sh_type_str, - sizeof (sh_type_str)); - - if (background) - bg = "background"; - gf_log (this->name, GF_LOG_DEBUG, - "%s %s self-heal triggered. 
path: %s, reason: %s", bg, - sh_type_str, local->loc.path, reason); - - afr_self_heal (frame, this, inode); -} - -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *success_children, - struct iatt *bufs, unsigned int child_count, - const char *path) -{ - unsigned int gfid_miss_count = 0; - int i = 0; - struct iatt *child1 = NULL; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if (uuid_is_null (child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null" - " on subvolume %d", path, success_children[i]); - gfid_miss_count++; - } - } - - return gfid_miss_count; -} - -static int -afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this) -{ - int32_t *success_children = NULL; - afr_private_t *priv = NULL; - struct iatt *bufs = NULL; - int miss_count = 0; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - - miss_count = afr_gfid_missing_count (this->name, success_children, - bufs, priv->child_count, - local->loc.path); - return miss_count; -} - -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name) -{ - gf_boolean_t conflicting = _gf_false; - int i = 0; - struct iatt *child1 = NULL; - struct iatt *child2 = NULL; - uuid_t *gfid = NULL; - - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - child1 = &bufs[success_children[i]]; - if ((!gfid) && (!uuid_is_null (child1->ia_gfid))) - gfid = &child1->ia_gfid; - - if (i == 0) - continue; - - child2 = &bufs[success_children[i-1]]; - if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " - "differs on subvolumes (%d, %d)", path, - success_children[i-1], success_children[i]); - conflicting = _gf_true; - goto out; - } - if (!gfid || uuid_is_null (child1->ia_gfid)) - continue; - 
if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" - " on subvolume %d", path, success_children[i]); - conflicting = _gf_true; - goto out; - } - } -out: - return conflicting; -} - -/* afr_update_gfid_from_iatts: This function should be called only if the - * iatts are not conflicting. - */ -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, unsigned int child_count) -{ - uuid_t *gfid = NULL; - int i = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) { - gfid = &bufs[child].ia_gfid; - } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) { - if (uuid_compare (*gfid, bufs[child].ia_gfid)) { - GF_ASSERT (0); - goto out; - } - } - } - if (gfid && (!uuid_is_null (*gfid))) - uuid_copy (uuid, *gfid); -out: - return; -} - -static gf_boolean_t -afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this) -{ - afr_private_t *priv = NULL; - gf_boolean_t conflict = _gf_false; - - priv = this->private; - conflict = afr_conflicting_iattrs (local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count, local->loc.path, - this->name); - return conflict; -} - -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal) -{ - return !strcmp (data_self_heal, "open"); -} - -gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal) -{ - gf_boolean_t enabled = _gf_false; - - if (gf_string2boolean (data_self_heal, &enabled) == -1) { - enabled = !strcmp (data_self_heal, "open"); - GF_ASSERT (enabled); - } - - return enabled; -} - -static void -afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) -{ - int i = 0; - struct iatt *bufs = NULL; - dict_t **xattr = NULL; - afr_private_t *priv = NULL; - int32_t child1 = -1; - int32_t child2 = -1; - afr_self_heal_t *sh = NULL; - - priv = this->private; - sh = 
&local->self_heal; - - afr_detect_self_heal_by_lookup_status (local, this); - - if (afr_lookup_gfid_missing_count (local, this)) - local->self_heal.do_gfid_self_heal = _gf_true; - - if (_gf_true == afr_lookup_conflicting_entries (local, this)) - local->self_heal.do_missing_entry_self_heal = _gf_true; - else - afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req, - local->cont.lookup.bufs, - local->cont.lookup.success_children, - priv->child_count); - - bufs = local->cont.lookup.bufs; - for (i = 1; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i-1]; - child2 = local->cont.lookup.success_children[i]; - afr_detect_self_heal_by_iatt (local, this, - &bufs[child1], &bufs[child2]); - } - - xattr = local->cont.lookup.xattrs; - for (i = 0; i < local->success_count; i++) { - child1 = local->cont.lookup.success_children[i]; - afr_lookup_set_self_heal_params_by_xattr (local, this, - xattr[child1]); - } - if (afr_open_only_data_self_heal (priv->data_self_heal)) - sh->do_data_self_heal = _gf_false; - if (sh->do_metadata_self_heal) - afr_lookup_check_set_metadata_split_brain (local, this); - afr_detect_self_heal_by_split_brain_status (local, this); -} - -int -afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed) -{ - afr_local_t *local = NULL; - int ret = -1; - dict_t *xattr = NULL; - - local = frame->local; - - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = afr_most_important_error(local->op_errno, - op_errno, _gf_true); - - goto out; - } else { - local->op_ret = 0; - } - - afr_lookup_done_success_action (frame, this, _gf_true); - xattr = local->cont.lookup.xattr; - if (xattr) { - ret = dict_set_int32 (xattr, "sh-failed", sh_failed); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "sh-failed to %d", local->loc.path, sh_failed); - - if (local->self_heal.actual_sh_started == _gf_true && - sh_failed == 0) { - ret = dict_set_int32 (xattr, 
"actual-sh-done", 1); - if (ret) - gf_log(this->name, GF_LOG_ERROR, "%s: Failed to" - " set actual-sh-done to %d", - local->loc.path, - local->self_heal.actual_sh_started); - } - } -out: - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->cont.lookup.inode, &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - - return 0; -} - -//TODO: At the moment only lookup needs this, so not doing any checks, in the -// future we will have to do fop specific operations -void -afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_local_t *sh_local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - struct iatt *lookup_bufs = NULL; - struct iatt *lookup_parentbufs = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - local = sh->orig_frame->local; - lookup_bufs = local->cont.lookup.bufs; - lookup_parentbufs = local->cont.lookup.postparents; - priv = this->private; - - memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf)); - memcpy (lookup_parentbufs, sh->parentbufs, - priv->child_count * sizeof (*sh->parentbufs)); - - afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count); - if (local->cont.lookup.xattr) { - dict_unref (local->cont.lookup.xattr); - local->cont.lookup.xattr = NULL; - } - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]); - } - - afr_reset_children (local->cont.lookup.success_children, - priv->child_count); - afr_children_copy (local->cont.lookup.success_children, - sh->fresh_children, priv->child_count); -} - -static void -afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, - gf_boolean_t *sh_launched) -{ - unsigned int up_count = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - char *reason = NULL; - - GF_ASSERT (sh_launched); - *sh_launched = _gf_false; - priv = this->private; - local 
= frame->local; - - up_count = afr_up_children_count (local->child_up, priv->child_count); - if (up_count == 1) { - gf_log (this->name, GF_LOG_DEBUG, - "Only 1 child up - do not attempt to detect self heal"); - goto out; - } - - afr_lookup_set_self_heal_params (local, this); - if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local) && - /*Forcefully call afr_launch_self_heal (which will go on to - fail) for SB files.This prevents stale data being served - due to race in afr_is_transaction_running() when - multiple clients access the same SB file*/ - !local->cont.lookup.possible_spb && - (!local->attempt_self_heal)) - goto out; - - reason = "lookup detected pending operations"; - afr_launch_self_heal (frame, this, local->cont.lookup.inode, - !local->foreground_self_heal, - local->cont.lookup.buf.ia_type, - reason, afr_post_gfid_sh_success, - afr_self_heal_lookup_unwind); - *sh_launched = _gf_true; - } -out: - return; -} - -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *fresh_children, unsigned int child_count) -{ - unsigned int i = 0; - unsigned int j = 0; - - GF_ASSERT (success_children); - GF_ASSERT (sources); - GF_ASSERT (fresh_children); - - afr_reset_children (fresh_children, child_count); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - if (afr_is_read_child (success_children, sources, child_count, - success_children[i])) { - fresh_children[j] = success_children[i]; - j++; - } - } -} - -static int -afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) -{ - afr_private_t *priv = NULL; - - GF_ASSERT (read_child >= 0); - - priv = this->private; - afr_get_fresh_children (local->cont.lookup.success_children, - local->cont.lookup.sources, - local->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, - local->fresh_children); - - return 0; -} - -int 
-afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, - gf_boolean_t fail_conflict) -{ - int32_t read_child = -1; - int32_t ret = -1; - afr_local_t *local = NULL; - gf_boolean_t fresh_lookup = _gf_false; - - local = frame->local; - fresh_lookup = local->cont.lookup.fresh_lookup; - - if (local->loc.parent == NULL) - fail_conflict = _gf_true; - - if (afr_lookup_conflicting_entries (local, this)) { - if (fail_conflict == _gf_false) - ret = 0; - goto out; - } + { /* symlink */ + GF_FREE (local->cont.symlink.linkpath); + } - ret = afr_lookup_select_read_child (local, this, &read_child); - if (!afr_is_transaction_running (local) || fresh_lookup) { - if (read_child < 0) - goto out; + { /* opendir */ + GF_FREE (local->cont.opendir.checksum); + } - ret = afr_lookup_set_read_ctx (local, this, read_child); - if (ret) - goto out; + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref (local->cont.readdir.dict); } - ret = afr_lookup_build_response_params (local, this); - if (ret) - goto out; - afr_update_loc_gfids (&local->loc, - &local->cont.lookup.buf, - &local->cont.lookup.postparent); + if (local->xdata_req) + dict_unref (local->xdata_req); - ret = 0; -out: - if (ret) { - local->op_ret = -1; - local->op_errno = EIO; - } - return ret; + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); } + int -afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this) +afr_frame_return (call_frame_t *frame) { - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; - int lsubvol = -1; - - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child == -1) - break; - if (uuid_is_null (bufs[child].ia_gfid)) - continue; - if (lsubvol < 0) { - lsubvol = child; - } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) { - lsubvol = child; - } else if 
((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) && - (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) { - lsubvol = child; - } - } - return lsubvol; -} + afr_local_t *local = NULL; + int call_count = 0; -void -afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this, - int subvol) -{ - afr_private_t *priv = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int i = 0; - int child = 0; + local = frame->local; - priv = this->private; - success_children = local->cont.lookup.success_children; - bufs = local->cont.lookup.bufs; - memcpy (local->fresh_children, success_children, - sizeof (*success_children) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - child = local->fresh_children[i]; - if (child == -1) - break; - if (child == subvol) - continue; - if (uuid_is_null (bufs[child].ia_gfid) && - (bufs[child].ia_type == bufs[subvol].ia_type)) - continue; - afr_children_rm_child (success_children, child, - priv->child_count); - local->success_count--; + LOCK (&frame->lock); + { + call_count = --local->call_count; } - afr_reset_children (local->fresh_children, priv->child_count); + UNLOCK (&frame->lock); + + return call_count; } -void -afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this) -{ - int lsubvol = 0; - if (!afr_lookup_conflicting_entries (local, this)) - goto out; +gf_boolean_t +afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this) +{ + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32 (local->replies[i].xdata, + GLUSTERFS_PARENT_ENTRYLK, + &tmp) == 0) + if (tmp) + return _gf_true; + } - lsubvol = afr_lookup_get_latest_subvol (local, this); - if (lsubvol < 0) - goto out; - afr_lookup_mark_other_entries_stale (local, this, lsubvol); -out: - return; + return _gf_false; } -gf_boolean_t -afr_is_entry_possibly_under_creation 
(afr_local_t *local, xlator_t *this) + +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (call_frame_t *frame, xlator_t *this) { - /* - * We need to perform this test in lookup done and treat on going - * create/DELETE as ENOENT. - * Reason: - Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' - - 1 Client A is in the middle of mkdir(/a). It has acquired lock. - It has performed mkdir(/a) on one subvol, and second one is still - in progress - 2 Client B performs a lookup, sees directory /a on one, - ENOENT on the other, succeeds lookup. - 3 Client B performs lookup on /a/b on both subvols, both return ENOENT - (one subvol because /a/b does not exist, another because /a - itself does not exist) - 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with - basename=b on one subvol, but fails on other subvol as /a is yet to - be created by Client A. - 5 Client A finishes mkdir of /a on other subvol - 6 Client C also attempts to create /a/b, lookup returns ENOENT on - both subvols. - 7 Client C tries to obtain entrylk on on inode=/a with basename=b, - obtains on one subvol (where B had failed), and waits for B to unlock - on other subvol. - 8 Client B finishes mkdir() on one subvol with GFID-1 and completes - transaction and unlocks - 9 Client C gets the lock on the second subvol, At this stage second - subvol already has /a/b created from Client B, but Client C does not - check that in the middle of mkdir transaction - 10 Client C attempts mkdir /a/b on both subvols. 
It succeeds on - ONLY ONE (where Client B could not get lock because of - missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. - This way we have /a/b in GFID mismatch. One subvol got GFID-1 because - Client B performed transaction on only one subvol (because entrylk() - could not be obtained on second subvol because of missing parent dir -- - caused by premature/speculative succeeding of lookup() on /a when locks - are detected). Other subvol gets GFID-2 from Client C because while - it was waiting for entrylk() on both subvols, Client B was in the - middle of creating mkdir() on only one subvol, and Client C does not - "expect" this when it is between lock() and pre-op()/op() phase of the - transaction. - */ - if (local->cont.lookup.parent_entrylk && local->enoent_count) - return _gf_true; + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + uint64_t size = 0; + uint64_t max_size = 0; + int readable_cnt = 0; - return _gf_false; + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT (readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size)) + continue; + if (size > max_size) + max_size = size; + } + + if (!max_size) + return; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size)) + continue; + } } static void afr_lookup_done (call_frame_t *frame, xlator_t *this) { - 
int unwind = 1; afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; - gf_boolean_t sh_launched = _gf_false; - gf_boolean_t fail_conflict = _gf_false; - int gfid_miss_count = 0; - int enotconn_count = 0; - int up_children_count = 0; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + unsigned char *readable = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = {0, }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t can_interpret = _gf_true; priv = this->private; local = frame->local; + replies = local->replies; + + locked_entry = afr_is_entry_possibly_under_txn (local, this); - if (afr_is_entry_possibly_under_creation (local, this)) { + readable = alloca0 (priv->child_count); + + afr_inode_read_subvol_get (local->loc.parent, this, readable, + NULL, &event); + + /* First, check if we have an ESTALE from somewhere, + If so, propagate that so that a revalidate can be + issued + */ + op_errno = afr_final_errno (frame->local, this->private); + local->op_errno = op_errno; + if (op_errno == ESTALE) { + local->op_errno = op_errno; local->op_ret = -1; - local->op_errno = ENOENT; - goto unwind; + goto unwind; } - if (local->op_ret < 0) - goto unwind; + read_subvol = -1; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (locked_entry && replies[i].op_ret == -1 && + replies[i].op_errno == ENOENT) { + /* Second, check entry is still + "underway" in creation */ + local->op_ret = -1; + local->op_errno = ENOENT; + read_subvol = i; + goto unwind; + } - if (local->cont.lookup.parent_entrylk && local->success_count > 1) - afr_succeed_lookup_on_latest_iatt (local, this); - - gfid_miss_count = afr_lookup_gfid_missing_count (local, this); - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - enotconn_count = priv->child_count - up_children_count; - if ((gfid_miss_count == local->success_count) && - (enotconn_count > 0)) { - local->op_ret = -1; - 
local->op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, " - "LOOKUP on a file without gfid is not allowed when " - "some of the children are down", local->loc.path); - goto unwind; - } + if (replies[i].op_ret == -1) + continue; - if ((gfid_miss_count == local->success_count) && - uuid_is_null (local->cont.lookup.gfid_req)) { - local->op_ret = -1; - local->op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present", - local->loc.path); - goto unwind; - } + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + uuid_copy (read_gfid, replies[i].poststat.ia_gfid); + local->op_ret = 0; + } + } - if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req)) - fail_conflict = _gf_true; - ret = afr_lookup_done_success_action (frame, this, fail_conflict); - if (ret) - goto unwind; - uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req); + if (read_subvol == -1) + goto unwind; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + if (priv->child_up[i]) + can_interpret = _gf_false; + continue; + } - afr_lookup_perform_self_heal (frame, this, &sh_launched); - if (sh_launched) { - unwind = 0; - goto unwind; - } + if (!uuid_compare (replies[i].poststat.ia_gfid, + read_gfid)) + continue; - unwind: - if (unwind) { - AFR_STACK_UNWIND (lookup, frame, local->op_ret, - local->op_errno, local->cont.lookup.inode, - &local->cont.lookup.buf, - local->cont.lookup.xattr, - &local->cont.lookup.postparent); - } + can_interpret = _gf_false; + + if (locked_entry) + continue; + + /* Now GFIDs mismatch. 
It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; + + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } + + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + if (can_interpret) { + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + if (afr_replies_interpret (frame, this, local->inode)) { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + afr_inode_read_subvol_reset (local->inode, this); + goto cant_interpret; + } else { + read_subvol = afr_data_subvol_get (local->inode, this, + 0, 0); + } + } else { + cant_interpret: + if (read_subvol == -1) + dict_del (replies[0].xdata, GF_CONTENT_KEY); + else + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + } + + afr_handle_quota_size (frame, this); + +unwind: + if (read_subvol == -1) + read_subvol = 0; + + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } /* @@ -2148,104 +1253,102 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) * others in that they must be given higher priority while * returning to the user. 
* - * The hierarchy is ESTALE > EIO > ENOENT > others + * The hierarchy is ESTALE > ENOENT > others */ -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio) + +int +afr_higher_errno (int32_t old_errno, int32_t new_errno) { + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; if (old_errno == ESTALE || new_errno == ESTALE) return ESTALE; - if (eio && (old_errno == EIO || new_errno == EIO)) - return EIO; if (old_errno == ENOENT || new_errno == ENOENT) return ENOENT; return new_errno; } -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count) -{ - int i = 0; - int32_t op_errno = 0; - int child = 0; - for (i = 0; i < child_count; i++) { - if (children) { - child = children[i]; - if (child == -1) - break; - } else { - child = i; - } - op_errno = afr_most_important_error(op_errno, - child_errno[child], - _gf_false); - } - return op_errno; +int +afr_final_errno (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno (op_errno, tmp_errno); + } + + return op_errno; } -static void -afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) +static int +get_pathinfo_host (char *pathinfo, char *hostname, size_t size) { - GF_ASSERT (local); - if (op_errno == ENOENT) - local->enoent_count++; + char *start = NULL; + char *end = NULL; + int ret = -1; + int i = 0; + + if (!pathinfo) + goto out; - local->op_errno = afr_most_important_error(local->op_errno, op_errno, - _gf_false); + start = strchr (pathinfo, ':'); + if (!start) + goto out; + end = strrchr (pathinfo, ':'); + if (start == end) + goto out; - if (local->op_errno == ESTALE) { - local->op_ret = -1; - } + memset (hostname, 0, size); + i = 0; + while (++start 
!= end) + hostname[i++] = *start; + ret = 0; +out: + return ret; } -static void -afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this, - inode_t *inode) +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) { - afr_private_t *priv = NULL; - GF_ASSERT (inode); + int ret = 0; + char pathinfohost[1024] = {0}; + char localhost[1024] = {0}; + xlator_t *this = THIS; - if (!__is_root_gfid (inode->gfid)) - goto out; - if (!afr_is_fresh_lookup (&local->loc, this)) + *local = _gf_false; + ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", + pathinfo); goto out; - priv = this->private; - if ((priv->first_lookup)) { - gf_log (this->name, GF_LOG_INFO, "added root inode"); - priv->root_inode = inode_ref (inode); - priv->first_lookup = 0; } -out: - return; -} -static void -afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr, - struct iatt *buf, struct iatt *postparent) -{ - GF_ASSERT (child_index >= 0); - local->cont.lookup.xattrs[child_index] = dict_ref (xattr); - local->cont.lookup.postparents[child_index] = *postparent; - local->cont.lookup.bufs[child_index] = *buf; -} + ret = gethostname (localhost, sizeof (localhost)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " + "reason: %s", strerror (errno)); + goto out; + } -static void -afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this, - inode_t *inode, struct iatt *buf) -{ - local->cont.lookup.inode = inode_ref (inode); - local->cont.lookup.buf = *buf; - afr_set_root_inode_on_first_lookup (local, this, inode); + if (!strcmp (localhost, pathinfohost)) + *local = _gf_true; +out: + return ret; } static int32_t -afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) +afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t 
op_errno, dict_t *dict, + dict_t *xdata) { int ret = 0; char *pathinfo = NULL; @@ -2257,6 +1360,9 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } + priv = this->private; + child_index = (int32_t)(long)cookie; + ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); if (ret != 0) { goto out; @@ -2267,7 +1373,6 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - priv = this->private; /* * Note that one local subvolume will override another here. The only * way to avoid that would be to retain extra information about whether @@ -2275,13 +1380,11 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * the slowest local subvolume is far preferable to a remote one. */ if (is_local) { - child_index = (int32_t)(long)cookie; gf_log (this->name, GF_LOG_INFO, "selecting local read_child %s", priv->children[child_index]->name); priv->read_child = child_index; } - out: STACK_DESTROY(frame->root); return 0; @@ -2300,234 +1403,357 @@ afr_attempt_local_discovery (xlator_t *this, int32_t child_index) } tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; - STACK_WIND_COOKIE (newframe, afr_discovery_cbk, + STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, (void *)(long)child_index, priv->children[child_index], priv->children[child_index]->fops->getxattr, &tmploc, GF_XATTR_PATHINFO_KEY, NULL); } -static void -afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) + +int +afr_lookup_selfheal_wrap (void *opaque) { - afr_private_t *priv = this->private; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; - if (local->success_count == 0) { - if (local->op_errno != ESTALE) { - local->op_ret = op_ret; - local->op_errno = 0; - } - afr_lookup_handle_first_success (local, this, inode, buf); - } - 
afr_lookup_update_lk_counts (local, this, - child_index, xattr); + local = frame->local; + this = frame->this; + + afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name); + + afr_replies_wipe (local, this->private); + + inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up); + if (inode) + inode_unref (inode); + afr_lookup_done (frame, this); + + return 0; +} + + +int +afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t need_heal = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; + + local = frame->local; + replies = local->replies; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first == -1) { + first = i; + continue; + } + + if (replies[i].op_ret != replies[first].op_ret) { + need_heal = _gf_true; + break; + } + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first].poststat.ia_gfid)) { + need_heal = _gf_true; + break; + } + } + + if (need_heal) { + heal = copy_frame (frame); + if (heal) + heal->root->pid = -1; + ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto lookup_done; + } else { + lookup_done: + afr_lookup_done (frame, this); + } + + return ret; +} + + +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) +{ + afr_local_t * local = NULL; + int call_count = -1; + int child_index = -1; + + child_index = (long) cookie; + + local = frame->local; - afr_lookup_cache_args (local, child_index, xattr, - buf, postparent); + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if 
(op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } - if (local->do_discovery && (priv->read_child == (-1))) { - afr_attempt_local_discovery(this,child_index); + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_lookup_entry_heal (frame, this); } - local->cont.lookup.success_children[local->success_count] = child_index; - local->success_count++; + return 0; +} + + + +static void +afr_discover_done (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == 0) + local->op_ret = 0; + } + + op_errno = afr_final_errno (frame->local, this->private); + + if (local->op_ret < 0) { + local->op_errno = op_errno; + local->op_ret = -1; + goto unwind; + } + + afr_replies_interpret (frame, this, local->inode); + + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); + if (read_subvol == -1) { + gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s", + local->loc.path); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || + local->replies[i].op_ret == -1) + continue; + read_subvol = i; + break; + } + } + +unwind: + if (read_subvol == -1) + read_subvol = 0; + + AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); } + int -afr_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + 
int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + dict_t *xdata, struct iatt *postparent) { afr_local_t * local = NULL; int call_count = -1; int child_index = -1; - child_index = (long) cookie; + child_index = (long) cookie; - LOCK (&frame->lock); - { - local = frame->local; + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } + + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery (this, child_index); + + call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_discover_done (frame, this); + } + + return 0; +} + + +int +afr_discover_do (call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err) { + local->op_errno = -err; + ret = -1; + goto out; + } + + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + ret = -1; + goto out; + } - if (op_ret == -1) { - afr_lookup_handle_error (local, op_ret, op_errno); - goto unlock; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_discover_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, local->xattr_req); + if (!--call_count) + break; } - afr_lookup_handle_success (local, this, child_index, op_ret, - op_errno, inode, buf, xattr, - postparent); - - } -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - 
afr_lookup_done (frame, this); } - return 0; + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } + int -afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) +afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - int ret = -ENOMEM; - struct iatt *iatts = NULL; - int32_t *success_children = NULL; - int32_t *sources = NULL; - int32_t **pending_matrix = NULL; - - GF_ASSERT (local); - local->cont.lookup.xattrs = GF_CALLOC (child_count, - sizeof (*local->cont.lookup.xattr), - gf_afr_mt_dict_t); - if (NULL == local->cont.lookup.xattrs) - goto out; - - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.postparents = iatts; - - iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt); - if (NULL == iatts) - goto out; - local->cont.lookup.bufs = iatts; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; - success_children = afr_children_create (child_count); - if (NULL == success_children) - goto out; - local->cont.lookup.success_children = success_children; + priv = this->private; - local->fresh_children = afr_children_create (child_count); - if (NULL == local->fresh_children) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t); - if (NULL == sources) + if (!local->call_count) { + op_errno = ENOTCONN; goto out; - local->cont.lookup.sources = sources; + } - pending_matrix = afr_matrix_create (child_count, child_count); - if (NULL == pending_matrix) - goto out; - local->cont.lookup.pending_matrix = pending_matrix; + if (__is_root_gfid (loc->inode->gfid)) { + if (!this->itable) + this->itable = loc->inode->table; + if (!priv->root_inode) + priv->root_inode = inode_ref (loc->inode); - ret = 0; -out: - return ret; -} + if (priv->choose_local && 
!priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } -int -afr_lookup (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xattr_req) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - void *gfid_req = NULL; - int ret = -1; - int i = 0; - int call_count = 0; - uint64_t ctx = 0; - int32_t op_errno = 0; - priv = this->private; - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); + local->op = GF_FOP_LOOKUP; - local->op_ret = -1; + loc_copy (&local->loc, loc); - frame->local = local; - local->fop = GF_FOP_LOOKUP; + local->inode = inode_ref (loc->inode); - loc_copy (&local->loc, loc); - ret = loc_path (&local->loc, NULL); - if (ret < 0) { - op_errno = EINVAL; - goto out; - } + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); - if (local->loc.path && - (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { - op_errno = EPERM; - ret = -1; - goto out; - } + if (uuid_is_null (loc->inode->gfid)) { + afr_discover_do (frame, this, 0); + return 0; + } - ret = inode_ctx_get (local->loc.inode, this, &ctx); - if (ret == 0) { - /* lookup is a revalidate */ + afr_read_subvol_get (loc->inode, this, NULL, &event, + AFR_DATA_TRANSACTION); - local->read_child_index = afr_inode_get_read_ctx (this, - local->loc.inode, - NULL); - } else { - LOCK (&priv->read_child_lock); - { - if (priv->hash_mode) { - local->read_child_index = -1; - } - else { - local->read_child_index = - (++priv->read_child_rr) % - (priv->child_count); - } - } - UNLOCK (&priv->read_child_lock); - local->cont.lookup.fresh_lookup = _gf_true; - } + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->inode, afr_discover_do); + else + afr_discover_do (frame, this, 0); - local->child_up = memdup (priv->child_up, - sizeof (*local->child_up) * 
priv->child_count); - if (NULL == local->child_up) { - op_errno = ENOMEM; - goto out; - } + return 0; +out: + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; +} - ret = afr_lookup_cont_init (local, priv->child_count); - if (ret < 0) { - op_errno = -ret; - goto out; - } - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); - call_count = local->call_count; - if (local->call_count == 0) { - ret = -1; - op_errno = ENOTCONN; - goto out; - } +int +afr_lookup_do (call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; - /* By default assume ENOTCONN. On success it will be set to 0. */ - local->op_errno = ENOTCONN; + local = frame->local; + priv = this->private; - ret = dict_get_int32 (xattr_req, "attempt-self-heal", - &local->attempt_self_heal); - dict_del (xattr_req, "attempt-self-heal"); + if (err < 0) { + local->op_errno = -err; + ret = -1; + goto out; + } - ret = dict_get_int32 (xattr_req, "foreground-self-heal", - &local->foreground_self_heal); - dict_del (xattr_req, "foreground-self-heal"); + call_count = local->call_count = AFR_COUNT (local->child_up, + priv->child_count); - ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, - &gfid_req); + ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, + &local->loc); if (ret) { local->op_errno = -ret; + ret = -1; goto out; } - afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req, - &local->loc); - local->fop = GF_FOP_LOOKUP; - if (priv->choose_local && !priv->did_discovery) { - if (gfid_req && __is_root_gfid(gfid_req)) { - local->do_discovery = _gf_true; - priv->did_discovery = _gf_true; - } - } + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_lookup_cbk, @@ -2539,12 +1765,98 @@ afr_lookup (call_frame_t *frame, xlator_t *this, break; } } + return 0; +out: + 
AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; +} - ret = 0; +/* + * afr_lookup() + * + * The goal here is to figure out what the element getting looked up is, + * i.e. what the GFID, inode type and a conservative estimate of the + * inode attributes are. + * + * As we look up, operations may be underway on the entry name and the + * inode. In lookup() we are primarily concerned only with the entry + * operations. If the entry is getting unlinked or renamed, we detect + * what operation is underway by querying for on-going transactions and + * pending self-healing on the entry through xdata. + * + * If the entry is a file/dir, it may need self-heal and/or be in a + * split-brain condition. Lookup is not the place to worry about these + * conditions. Outcast marking will naturally handle them in the read + * paths. + * + * Here is a brief goal of what we are trying to achieve: + * + * - LOOKUP on all subvolumes concurrently, querying on-going transaction + * and pending self-heal info from the servers. + * + * - If all servers reply the same inode type and GFID, the overall call + * MUST be a success. + * + * - If inode types or GFIDs mismatch, and there IS either an on-going + * transaction or pending self-heal, inspect what the nature of the + * transaction or pending heal is, and select the appropriate subvolume's + * reply as the winner. + * + * - If inode types or GFIDs mismatch, and there are no on-going transactions + * or pending self-heal on the entry name on any of the servers, fail the + * lookup with EIO. Something has gone wrong beyond reasonable action. 
+ */ + +int +afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; + + if (!loc->parent) { + afr_discover (frame, this, loc, xattr_req); + return 0; + } + + if (__is_root_gfid (loc->parent->gfid)) { + if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { + op_errno = EPERM; + goto out; + } + } + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } + + local->op = GF_FOP_LOOKUP; + + loc_copy (&local->loc, loc); + + local->inode = inode_ref (loc->inode); + + if (xattr_req) + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_ref (xattr_req); + + afr_read_subvol_get (loc->parent, this, NULL, &event, + AFR_DATA_TRANSACTION); + + if (event != local->event_generation) + afr_inode_refresh (frame, this, loc->parent, afr_lookup_do); + else + afr_lookup_do (frame, this, 0); + + return 0; out: - if (ret) - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, - NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); return 0; } @@ -2552,6 +1864,46 @@ out: /* {{{ open */ +afr_fd_ctx_t * +__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + ret = __fd_ctx_get (fd, this, &ctx); + + if (ret < 0) { + ret = __afr_fd_ctx_set (this, fd); + if (ret < 0) + goto out; + + ret = __fd_ctx_get (fd, this, &ctx); + if (ret < 0) + goto out; + } + + fd_ctx = (afr_fd_ctx_t *)(long) ctx; +out: + return fd_ctx; +} + + +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get (fd, this); + } + UNLOCK(&fd->lock); + + return fd_ctx; +} + + int __afr_fd_ctx_set (xlator_t *this, fd_t *fd) { @@ -2559,6 +1911,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) int ret = -1; uint64_t 
ctx = 0; afr_fd_ctx_t * fd_ctx = NULL; + int i = 0; VALIDATE_OR_GOTO (this->private, out); VALIDATE_OR_GOTO (fd, out); @@ -2577,21 +1930,15 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_done) { - ret = -ENOMEM; - goto out; - } - - fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->pre_op_piggyback) { - ret = -ENOMEM; - goto out; - } + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), + priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), priv->child_count, @@ -2601,6 +1948,13 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous (fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } + fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), priv->child_count, gf_afr_mt_char); @@ -2617,20 +1971,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - - fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->locked_on) { - ret = -ENOMEM; - goto out; - } - pthread_mutex_init (&fd_ctx->delay_lock, NULL); - INIT_LIST_HEAD (&fd_ctx->entries); - fd_ctx->call_child = -1; INIT_LIST_HEAD (&fd_ctx->eager_locked); @@ -2660,32 +2001,31 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd) /* {{{ flush */ int -afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, 
dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; + afr_local_t *local = NULL; + int call_count = -1; local = frame->local; LOCK (&frame->lock); { if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - } - - local->op_errno = op_errno; + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) - AFR_STACK_UNWIND(flush, frame, local->op_ret, - local->op_errno, NULL); + AFR_STACK_UNWIND (flush, frame, local->op_ret, + local->op_errno, local->xdata_rsp); return 0; } @@ -2708,7 +2048,7 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) (void *) (long) i, priv->children[i], priv->children[i]->fops->flush, - local->fd, NULL); + local->fd, xdata); if (!--call_count) break; @@ -2721,40 +2061,30 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; call_stub_t *stub = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ret = afr_local_init(local, priv, &op_errno); - if (ret < 0) + if (!local->call_count) { + op_errno = ENOTCONN; goto out; + } local->fd = fd_ref(fd); + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); - if (!stub) { - ret = -1; - op_errno = ENOMEM; + if (!stub) goto out; - } afr_delayed_changelog_wake_resume (this, fd, stub); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); - 
+ AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); return 0; } @@ -2767,6 +2097,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; + int i = 0; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2775,13 +2106,11 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - GF_FREE (fd_ctx->pre_op_done); + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) + GF_FREE (fd_ctx->pre_op_done[i]); GF_FREE (fd_ctx->opened_on); - GF_FREE (fd_ctx->locked_on); - - GF_FREE (fd_ctx->pre_op_piggyback); GF_FREE (fd_ctx->lock_piggyback); GF_FREE (fd_ctx->lock_acquired); @@ -2799,24 +2128,8 @@ out: int afr_release (xlator_t *this, fd_t *fd) { - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - afr_cleanup_fd_ctx (this, fd); - list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds, - list) { - - if (locked_fd->fd == fd) { - list_del_init (&locked_fd->list); - GF_FREE (locked_fd); - } - - } - return 0; } @@ -2841,36 +2154,38 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_local_t *local = NULL; int call_count = -1; int child_index = (long) cookie; - int read_child = 0; + int read_subvol = 0; call_stub_t *stub = NULL; local = frame->local; - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + read_subvol = afr_data_subvol_get (local->inode, this, 0, 0); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (op_ret == 0) { - local->op_ret = 0; + if (local->op_ret == -1) { + local->op_ret = 0; - if (local->success_count == 0) { local->cont.inode_wfop.prebuf = *prebuf; local->cont.inode_wfop.postbuf = *postbuf; + + if (xdata) + local->xdata_rsp = dict_ref (xdata); } - if (child_index == read_child) { + if (child_index == read_subvol) { local->cont.inode_wfop.prebuf = *prebuf; local->cont.inode_wfop.postbuf = *postbuf; + 
if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } } - - local->success_count++; - } - - local->op_errno = op_errno; + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2890,7 +2205,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret, local->op_errno, &local->cont.inode_wfop.prebuf, &local->cont.inode_wfop.postbuf, - xdata); + local->xdata_rsp); if (!stub) { AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); return 0; @@ -2910,37 +2225,35 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int32_t op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - local->fd = fd_ref (fd); + local->fd = fd_ref (fd); if (afr_fd_has_witnessed_unstable_write (this, fd)) { /* don't care. 
we only wanted to CLEAR the bit */ } + local->inode = inode_ref (fd->inode); + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_fsync_cbk, @@ -2953,10 +2266,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } @@ -2964,10 +2277,9 @@ out: /* {{{ fsync */ -int32_t -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +int +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; int call_count = -1; @@ -2976,10 +2288,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, LOCK (&frame->lock); { - if (op_ret == 0) + if (op_ret == 0) { local->op_ret = 0; - - local->op_errno = op_errno; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + } else { + local->op_errno = op_errno; + } } UNLOCK (&frame->lock); @@ -2987,37 +2302,33 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno, xdata); + local->op_errno, local->xdata_rsp); return 0; } -int32_t -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t datasync, dict_t *xdata) +int +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; + int32_t op_errno = ENOMEM; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = 
this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3030,10 +2341,10 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + return 0; } @@ -3056,6 +2367,10 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (op_ret == 0) { if (!local->cont.xattrop.xattr) local->cont.xattrop.xattr = dict_ref (xattr); + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); + local->op_ret = 0; } @@ -3067,7 +2382,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno, - local->cont.xattrop.xattr, xdata); + local->cont.xattrop.xattr, local->xdata_rsp); return 0; } @@ -3079,25 +2394,21 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3110,10 +2421,10 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, } } - ret = 0; + return 0; out: - if (ret < 0) 
- AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3138,6 +2449,8 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (!local->cont.fxattrop.xattr) local->cont.fxattrop.xattr = dict_ref (xattr); + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); local->op_ret = 0; } @@ -3149,7 +2462,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie, if (call_count == 0) AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno, - local->cont.fxattrop.xattr, xdata); + local->cont.fxattrop.xattr, local->xdata_rsp); return 0; } @@ -3161,25 +2474,21 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3192,10 +2501,10 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3203,8 +2512,8 @@ out: int32_t -afr_inodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3238,25 +2547,21 @@ afr_inodelk (call_frame_t *frame, 
xlator_t *this, { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOMEM; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3270,18 +2575,17 @@ afr_inodelk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL); + return 0; } int32_t -afr_finodelk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xdata) +afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3309,31 +2613,26 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie, int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) +afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; 
+ local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3347,10 +2646,10 @@ afr_finodelk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL); + return 0; } @@ -3383,33 +2682,28 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } -int32_t -afr_entrylk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, - const char *basename, entrylk_cmd cmd, entrylk_type type, - dict_t *xdata) +int +afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; int32_t op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3423,18 +2717,18 @@ afr_entrylk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_fentrylk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t 
*this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; @@ -3461,33 +2755,28 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie, } -int32_t -afr_fentrylk (call_frame_t *frame, xlator_t *this, - const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +int +afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - int ret = -1; int i = 0; int32_t call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { @@ -3501,82 +2790,85 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL); + return 0; } -int32_t -afr_statfs_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct statvfs *statvfs, dict_t *xdata) + +int +afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) { afr_local_t *local = NULL; int call_count = 0; + struct statvfs *buf = NULL; LOCK (&frame->lock); { local = frame->local; - if (op_ret == 0) { - local->op_ret = op_ret; - - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) - 
local->cont.statfs.buf = *statvfs; - } else { - local->cont.statfs.buf = *statvfs; - local->cont.statfs.buf_set = 1; - } - } - - if (op_ret == -1) + if (op_ret != 0) { local->op_errno = op_errno; + goto unlock; + } + local->op_ret = op_ret; + + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = dict_ref (xdata); + } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref (xdata); + } } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf, xdata); + &local->cont.statfs.buf, local->xdata_rsp); return 0; } -int32_t -afr_statfs (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata) +int +afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - afr_private_t * priv = NULL; - int child_count = 0; afr_local_t * local = NULL; + afr_private_t *priv = NULL; int i = 0; - int ret = -1; int call_count = 0; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + int32_t op_errno = ENOMEM; - priv = this->private; - child_count = priv->child_count; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + priv = this->private; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND (frame, afr_statfs_cbk, priv->children[i], @@ -3587,10 +2879,10 @@ afr_statfs (call_frame_t *frame, xlator_t *this, } } - ret = 0; 
+ return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -3699,21 +2991,6 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN, &local->cont.lk.ret_flock, NULL); } else { - /* locking has succeeded on all nodes that are up */ - - /* temporarily - ret = afr_mark_locked_nodes (this, local->fd, - local->cont.lk.locked_nodes); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked nodes info in fdctx"); - - ret = afr_save_locked_fd (this, local->fd); - if (ret) - gf_log (this->name, GF_LOG_DEBUG, - "Could not save locked fd"); - - */ AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, &local->cont.lk.ret_flock, NULL); } @@ -3729,20 +3006,12 @@ afr_lk (call_frame_t *frame, xlator_t *this, afr_private_t *priv = NULL; afr_local_t *local = NULL; int i = 0; - int32_t op_errno = 0; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; priv = this->private; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) goto out; local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, @@ -3764,28 +3033,16 @@ afr_lk (call_frame_t *frame, xlator_t *this, priv->children[i]->fops->lk, fd, cmd, flock, xdata); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + return 0; } int afr_forget (xlator_t *this, inode_t *inode) { - uint64_t ctx_addr = 0; - afr_inode_ctx_t *ctx = NULL; - - inode_ctx_get (inode, this, &ctx_addr); - - if (!ctx_addr) - goto out; - - ctx = (afr_inode_ctx_t *)(long)ctx_addr; - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); -out: 
return 0; } @@ -3805,7 +3062,6 @@ afr_priv_dump (xlator_t *this) snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); gf_proc_dump_add_section(key_prefix); gf_proc_dump_write("child_count", "%u", priv->child_count); - gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr); for (i = 0; i < priv->child_count; i++) { sprintf (key, "child_up[%d]", i); gf_proc_dump_write(key, "%d", priv->child_up[i]); @@ -3862,7 +3118,7 @@ afr_notify (xlator_t *this, int32_t event, int idx = -1; int ret = -1; int call_psh = 0; - int up_child = AFR_ALL_CHILDREN; + int up_child = -1; dict_t *input = NULL; dict_t *output = NULL; @@ -3914,6 +3170,7 @@ afr_notify (xlator_t *this, int32_t event, */ if (priv->child_up[idx] != 1) { priv->up_count++; + priv->event_generation++; } priv->child_up[idx] = 1; @@ -3953,6 +3210,7 @@ afr_notify (xlator_t *this, int32_t event, */ if (priv->child_up[idx] == 1) { priv->down_count++; + priv->event_generation++; } priv->child_up[idx] = 0; @@ -4019,8 +3277,7 @@ afr_notify (xlator_t *this, int32_t event, LOCK (&priv->lock); { - up_children = afr_up_children_count (priv->child_up, - priv->child_count); + up_children = AFR_COUNT (priv->child_up, priv->child_count); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; @@ -4040,39 +3297,23 @@ afr_notify (xlator_t *this, int32_t event, ret = 0; if (propagate) ret = default_notify (this, event, data); - if (call_psh && priv->shd.iamshd) - afr_proactive_self_heal ((void*) (long) up_child); + if (call_psh && priv->shd.iamshd) { + afr_selfheal_childup (this, up_child); + } out: return ret; } -int -afr_first_up_child (unsigned char *child_up, size_t child_count) -{ - int ret = -1; - int i = 0; - - GF_ASSERT (child_up); - - for (i = 0; i < child_count; i++) { - if (child_up[i]) { - ret = i; - break; - } - } - - return ret; -} int afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) { - int ret = -1; - 
local->op_ret = -1; local->op_errno = EUCLEAN; + syncbarrier_init (&local->barrier); + local->child_up = GF_CALLOC (priv->child_count, sizeof (*local->child_up), gf_afr_mt_char); @@ -4084,38 +3325,42 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) memcpy (local->child_up, priv->child_up, sizeof (*local->child_up) * priv->child_count); - local->call_count = afr_up_children_count (local->child_up, - priv->child_count); + local->call_count = AFR_COUNT (local->child_up, priv->child_count); if (local->call_count == 0) { gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); if (op_errno) *op_errno = ENOTCONN; goto out; } + local->event_generation = priv->event_generation; - local->child_errno = GF_CALLOC (priv->child_count, - sizeof (*local->child_errno), - gf_afr_mt_int32_t); - if (!local->child_errno) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, - sizeof (int), - gf_afr_mt_int32_t); - if (!local->transaction.postop_piggybacked) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->readable = GF_CALLOC (priv->child_count, sizeof (char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->append_write = _gf_false; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - ret = 0; + return 0; out: - return ret; + return -1; } int @@ -4218,13 +3463,11 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) } ret = -ENOMEM; - child_up_count = afr_up_children_count (local->child_up, - priv->child_count); + child_up_count = AFR_COUNT (local->child_up, priv->child_count); if 
(priv->optimistic_change_log && child_up_count == priv->child_count) local->optimistic_change_log = 1; - local->first_up_child = afr_first_up_child (local->child_up, - priv->child_count); + local->pre_op_compat = priv->pre_op_compat; local->transaction.eager_lock = GF_CALLOC (sizeof (*local->transaction.eager_lock), @@ -4234,26 +3477,29 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->transaction.eager_lock) goto out; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, gf_afr_mt_char); if (!local->transaction.pre_op) goto out; + local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.fop_subvols) + goto out; + + local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols), + priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) + goto out; + local->pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); if (!local->pending) goto out; - local->transaction.txn_changelog = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!local->transaction.txn_changelog) - goto out; - INIT_LIST_HEAD (&local->transaction.eager_locked); ret = 0; @@ -4261,86 +3507,6 @@ out: return ret; } -void -afr_reset_children (int32_t *fresh_children, int32_t child_count) -{ - unsigned int i = 0; - for (i = 0; i < child_count; i++) - fresh_children[i] = -1; -} - -int32_t* -afr_children_create (int32_t child_count) -{ - int32_t *children = NULL; - int i = 0; - - GF_ASSERT (child_count > 0); - - children = GF_CALLOC (child_count, sizeof (*children), - gf_afr_mt_int32_t); - if (NULL == children) - goto out; - for (i = 0; i < child_count; i++) - children[i] = -1; -out: - return children; -} - -void -afr_children_add_child (int32_t *children, 
int32_t child, - int32_t child_count) -{ - gf_boolean_t child_found = _gf_false; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - child_found = _gf_true; - break; - } - } - - if (!child_found) { - GF_ASSERT (i < child_count); - children[i] = child; - } -} - -void -afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count) -{ - int i = 0; - - GF_ASSERT ((child >= 0) && (child < child_count)); - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - if (children[i] == child) { - if (i != (child_count - 1)) - memmove (children + i, children + i + 1, - sizeof (*children)*(child_count - i - 1)); - children[child_count - 1] = -1; - break; - } - } -} - -int -afr_get_children_count (int32_t *children, unsigned int child_count) -{ - int count = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (children[i] == -1) - break; - count++; - } - return count; -} void afr_set_low_priority (call_frame_t *frame) @@ -4348,38 +3514,6 @@ afr_set_low_priority (call_frame_t *frame) frame->root->pid = LOW_PRIO_PROC_PID; } -int -afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child, - int flags) -{ - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - GF_ASSERT (fd && fd->inode); - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set fd ctx for fd=%p", fd); - goto out; - } - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - fd_ctx->opened_on[child] = AFR_FD_OPENED; - if (!IA_ISDIR (fd->inode->ia_type)) { - fd_ctx->flags = flags; - } - ret = 0; -out: - return ret; -} gf_boolean_t afr_have_quorum (char *logname, afr_private_t *priv) @@ -4426,33 +3560,6 @@ afr_priv_destroy (afr_private_t *priv) if (!priv) goto out; inode_unref (priv->root_inode); - GF_FREE 
(priv->shd.pos); - GF_FREE (priv->shd.pending); - GF_FREE (priv->shd.inprogress); -// for (i = 0; i < priv->child_count; i++) -// if (priv->shd.timer && priv->shd.timer[i]) -// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]); - GF_FREE (priv->shd.timer); - - if (priv->shd.healed) - eh_destroy (priv->shd.healed); - - if (priv->shd.heal_failed) - eh_destroy (priv->shd.heal_failed); - - if (priv->shd.split_brain) - eh_destroy (priv->shd.split_brain); - - for (i = 0; i < priv->child_count; i++) - { - if (priv->shd.statistics[i]) - eh_destroy (priv->shd.statistics[i]); - } - - GF_FREE (priv->shd.statistics); - - GF_FREE (priv->shd.crawl_events); - GF_FREE (priv->last_event); if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) @@ -4462,8 +3569,7 @@ afr_priv_destroy (afr_private_t *priv) GF_FREE (priv->children); GF_FREE (priv->child_up); LOCK_DESTROY (&priv->lock); - LOCK_DESTROY (&priv->read_child_lock); - pthread_mutex_destroy (&priv->mutex); + GF_FREE (priv); out: return; @@ -4480,124 +3586,21 @@ xlator_subvolume_count (xlator_t *this) return i; } -inline gf_boolean_t -afr_is_errno_set (int *child_errno, int child) -{ - return child_errno[child]; -} - -inline gf_boolean_t -afr_is_errno_unset (int *child_errno, int child) -{ - return !afr_is_errno_set (child_errno, child); -} - -void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count) -{ - int midx = 0; - int idx = 0; - int i = 0; - - midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - if (IA_ISDIR (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - else if (IA_ISREG (buf->ia_type)) - idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - else - idx = -1; - for (i = 0; i < child_count; i++) { - if (is_pending (ctx, i)) { - pending[i][midx] = hton32 (1); - if (idx == -1) - continue; - pending[i][idx] = hton32 (1); - } - } -} - -gf_boolean_t 
-afr_is_fd_fixable (fd_t *fd) -{ - if (!fd || !fd->inode) - return _gf_false; - else if (fd_is_anonymous (fd)) - return _gf_false; - else if (uuid_is_null (fd->inode->gfid)) - return _gf_false; - - return _gf_true; -} void afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; - inode_t *inode = NULL; - afr_inode_ctx_t *ctx = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; - if (local->fd) - inode = local->fd->inode; - else - inode = local->loc.inode; - - if (!inode) - return; - - LOCK (&inode->lock); - { - ctx = __afr_inode_ctx_get (inode, this); - ctx->open_fd_count = local->open_fd_count; - } - UNLOCK (&inode->lock); -} - -int -afr_initialise_statistics (xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int child_count = 0; - eh_t *stats_per_brick = NULL; - shd_crawl_event_t ***shd_crawl_events = NULL; - priv = this->private; - - priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, - gf_common_mt_eh_t); - if (!priv->shd.statistics) { - ret = -1; - goto out; - } - child_count = priv->child_count; - for (i=0; i < child_count ; i++) { - stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, - _gf_false, - _destroy_crawl_event_data); - if (!stats_per_brick) { - ret = -1; - goto out; - } - priv->shd.statistics[i] = stats_per_brick; - - } - - shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); - *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), - priv->child_count, - gf_afr_mt_shd_crawl_event_t); + if (!local->fd) + return; - if (!priv->shd.crawl_events) { - ret = -1; - goto out; - } - ret = 0; -out: - return ret; + fd_ctx = afr_fd_ctx_get (local->fd, this); + if (!fd_ctx) + return; + fd_ctx->open_fd_count = local->open_fd_count; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 689dd84e6..fa1da3958 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ 
-37,177 +37,7 @@ #include "checksum.h" #include "afr.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -int -afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno, int32_t sh_failed) -{ - afr_local_t *local = NULL; - - local = frame->local; - - afr_set_opendir_done (this, local->fd->inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - - return 0; -} - - -gf_boolean_t -__checksums_differ (uint32_t *checksum, int child_count, - unsigned char *child_up) -{ - int ret = _gf_false; - int i = 0; - uint32_t cksum = 0; - gf_boolean_t activate_check = _gf_false; - - for (i = 0; i < child_count; i++) { - if (!child_up[i]) - continue; - if (_gf_false == activate_check) { - cksum = checksum[i]; - activate_check = _gf_true; - continue; - } - - if (cksum != checksum[i]) { - ret = _gf_true; - break; - } - - cksum = checksum[i]; - } - - return ret; -} - - -int32_t -afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; - char *reason = NULL; - int child_index = 0; - uint32_t entry_cksum = 0; - int call_count = 0; - off_t last_offset = 0; - inode_t *inode = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - inode = local->fd->inode; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to do opendir on %s", - local->loc.path, priv->children[child_index]->name); - local->op_ret = -1; - local->op_ret = op_errno; - goto out; - } - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "%s: no entries found in %s", - local->loc.path, priv->children[child_index]->name); - goto out; - } - - list_for_each_entry_safe (entry, tmp, &entries->list, 
list) { - entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name, - strlen (entry->d_name)); - local->cont.opendir.checksum[child_index] ^= entry_cksum; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - } - - /* read more entries */ - - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readdir, - local->fd, 131072, last_offset, NULL); - - return 0; - -out: - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (__checksums_differ (local->cont.opendir.checksum, - priv->child_count, - local->child_up)) { - - sh->do_entry_self_heal = _gf_true; - sh->forced_merge = _gf_true; - - reason = "checksums of directory differ"; - afr_launch_self_heal (frame, this, inode, _gf_false, - inode->ia_type, reason, NULL, - afr_examine_dir_sh_unwind); - } else { - afr_set_opendir_done (this, inode); - - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - } - } - - return 0; -} - - -int -afr_examine_dir (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - priv = this->private; - - local->cont.opendir.checksum = GF_CALLOC (priv->child_count, - sizeof (*local->cont.opendir.checksum), - gf_afr_mt_int32_t); - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->readdir, - local->fd, 131072, 0, NULL); - - if (!--call_count) - break; - } - } - - return 0; -} +#include "afr-transaction.h" int32_t @@ -215,112 +45,66 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, 
fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; afr_local_t *local = NULL; - int32_t up_children_count = 0; - int ret = -1; int call_count = -1; int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; local = frame->local; + fd_ctx = local->fd_ctx; child_index = (long) cookie; - up_children_count = afr_up_children_count (local->child_up, - priv->child_count); - LOCK (&frame->lock); { - if (op_ret >= 0) { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { local->op_ret = op_ret; - ret = afr_child_fd_ctx_set (this, fd, child_index, 0); - if (ret) { - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); } - - local->op_errno = op_errno; } -unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); - if (call_count == 0) { - if (local->op_ret != 0) - goto out; - - if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1 && priv->entry_self_heal) { - - /* - * This is the first opendir on this inode. We need - * to check if the directory's entries are the same - * on all subvolumes. This is needed in addition - * to regular entry self-heal because the readdir - * call is sent only to the first subvolume, and - * thus files that exist only there will never be healed - * otherwise (assuming changelog shows no anomalies). 
- */ - - gf_log (this->name, GF_LOG_TRACE, - "reading contents of directory %s looking for mismatch", - local->loc.path); - - afr_examine_dir (frame, this); - - } else { - /* do the unwind */ - goto out; - } - } - - return 0; - -out: - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - + if (call_count == 0) + AFR_STACK_UNWIND (opendir, frame, local->op_ret, + local->op_errno, local->fd, NULL); return 0; } -int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd) +int +afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) { afr_private_t * priv = NULL; afr_local_t * local = NULL; - int child_count = 0; int i = 0; - int ret = -1; int call_count = -1; - int32_t op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; priv = this->private; - child_count = priv->child_count; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + goto out; loc_copy (&local->loc, loc); local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; call_count = local->call_count; - for (i = 0; i < child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_opendir_cbk, (void*) (long) i, @@ -333,182 +117,280 @@ afr_opendir (call_frame_t *frame, xlator_t *this, } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); - + AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); return 0; } -/** - * Common algorithm for directory read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable 
to: readdir - */ +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) -struct entry_name { - char *name; - struct list_head list; -}; +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) -static void -afr_forget_entries (fd_t *fd) +static uint64_t +afr_bits_for (uint64_t num) { - struct entry_name *entry = NULL; - struct entry_name *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; - } + uint64_t bits = 0, ctrl = 1; - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + while (ctrl < num) { + ctrl *= 2; + bits ++; + } - list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) { - GF_FREE (entry->name); - list_del (&entry->list); - GF_FREE (entry); - } + return bits; } -static void -afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) +int +afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p) { - gf_dirent_t * entry = NULL; - gf_dirent_t * tmp = NULL; + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if (__is_root_gfid (fd->inode->gfid) && - !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - list_del_init (&entry->list); - GF_FREE (entry); - } + conf = this->private; + if (!conf) + goto out; + + max = conf->child_count; + cnt = subvol; + + if (max == 1) { + y = x; + goto out; + } + + max_bits = afr_bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = 
MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); } + +out: + if (y_p) + *y_p = y; + + return 0; } -int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) + +int +afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p, + uint64_t *x_p) { - afr_local_t *local = NULL; + afr_private_t *conf = NULL; + int cnt = 0; + int max = 0; + uint64_t x = 0; + int subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; + + if (!this->private) + return -1; + + conf = this->private; + max = conf->child_count; + + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = afr_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } - if (op_ret == -1) - goto out; +out: + subvol = cnt; - local = frame->local; - afr_readdir_filter_trash_dir (entries, local->fd); + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; -out: - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); return 0; } -int32_t -afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, - dict_t *xdata) +static void +afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, + gf_dirent_t *entries, fd_t *fd) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int gen = 0; - if (op_ret == -1) - goto out; + priv = THIS->private; - local = frame->local; - afr_readdir_filter_trash_dir (entries, local->fd); + data_readable = alloca0 (priv->child_count); + 
metadata_readable = alloca0 (priv->child_count); -out: - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); - return 0; + list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + continue; + } + + list_del_init (&entry->list); + afr_itransform (THIS, subvol, entry->d_off, &entry->d_off); + list_add_tail (&entry->list, &entries->list); + + if (entry->inode) { + gen = 0; + afr_inode_read_subvol_get (entry->inode, THIS, + data_readable, + metadata_readable, &gen); + + if (gen != priv->event_generation || + !data_readable[subvol] || + !metadata_readable[subvol]) { + + inode_unref (entry->inode); + entry->inode = NULL; + } + } + } } + int32_t -afr_do_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict) +afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_local_t *local = NULL; + gf_dirent_t entries; - priv = this->private; - children = priv->children; + INIT_LIST_HEAD (&entries.list); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_read_txn_continue (frame, 
this, (long) cookie); + return 0; + } - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readdir.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + if (op_ret >= 0) + afr_readdir_transform_entries (subvol_entries, (long) cookie, + &entries, local->fd); - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - op_errno = EBADF; - goto out; - } + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata); - if ((offset == 0) || (fd_ctx->call_child == -1)) { - fd_ctx->call_child = call_child; - } else if ((priv->readdir_failover == _gf_false) && - (call_child != fd_ctx->call_child)) { - op_errno = EBADF; - goto out; - } + return 0; +} + + +int +afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - local->cont.readdir.dict = (dict)? 
dict_ref (dict) : NULL; + if (subvol == -1) { + AFR_STACK_UNWIND (readdir, frame, local->op_ret, + local->op_errno, 0, 0); + return 0; + } - if (whichop == GF_FOP_READDIR) + if (local->op == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdir, fd, - size, offset, dict); + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); else - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readdirp, fd, - size, offset, dict); + STACK_WIND_COOKIE (frame, afr_readdir_cbk, + (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset, + local->xdata_req); + return 0; +} + + +int +afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->op = whichop; + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict)? 
dict_ref (dict) : NULL; + + if (offset == 0) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn (frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_deitransform (this, offset, &subvol, + (uint64_t *)&local->cont.readdir.offset); + afr_readdir_wind (frame, this, subvol); + } return 0; out: @@ -521,7 +403,8 @@ int32_t afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + return 0; } @@ -531,6 +414,7 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t offset, dict_t *dict) { afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); + return 0; } @@ -538,7 +422,6 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, int32_t afr_releasedir (xlator_t *this, fd_t *fd) { - afr_forget_entries (fd); afr_cleanup_fd_ctx (this, fd); return 0; diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 1943b719b..465dde54f 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -34,10 +34,14 @@ #include "common-utils.h" #include "compat-errno.h" #include "compat.h" +#include "byte-order.h" #include "afr.h" #include "afr-transaction.h" +void +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this); + int afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) { @@ -56,79 +60,214 @@ afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) *op_errno = ENOMEM; goto out; } - parent->path = gf_strdup( dirname (child_path) ); - if (!parent->path) { + + parent->path = gf_strdup (dirname (child_path)); + if 
(!parent->path) { if (op_errno) *op_errno = ENOMEM; goto out; } - parent->inode = inode_ref (child->parent); - uuid_copy (parent->gfid, child->pargfid); + + parent->inode = inode_ref (child->parent); + uuid_copy (parent->gfid, child->pargfid); ret = 0; out: - GF_FREE(child_path); + GF_FREE (child_path); return ret; } -void -__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, struct iatt *prenewparent, - struct iatt *postnewparent) + +static void +__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int inode_read_subvol = -1; + int parent_read_subvol = -1; + int parent2_read_subvol = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + if (local->inode) { + afr_replies_interpret (frame, this, local->inode); + inode_read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + if (local->parent) + parent_read_subvol = afr_data_subvol_get (local->parent, this, + NULL, NULL); + if (local->parent2) + parent2_read_subvol = afr_data_subvol_get (local->parent2, this, + NULL, NULL); + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) + afr_inode_read_subvol_reset (local->inode, + this); + if (local->parent) + afr_inode_read_subvol_reset (local->parent, + this); + if (local->parent2) + afr_inode_read_subvol_reset (local->parent2, + this); + continue; + } + + if (local->op_ret == -1) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.dir_fop.buf = + local->replies[i].poststat; + local->cont.dir_fop.preparent = + local->replies[i].preparent; + 
local->cont.dir_fop.postparent = + local->replies[i].postparent; + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + if (local->replies[i].xdata) + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + continue; + } + + if (i == inode_read_subvol) { + local->cont.dir_fop.buf = + local->replies[i].poststat; + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } + + if (i == parent_read_subvol) { + local->cont.dir_fop.preparent = + local->replies[i].preparent; + local->cont.dir_fop.postparent = + local->replies[i].postparent; + } + + if (i == parent2_read_subvol) { + local->cont.dir_fop.prenewparent = + local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = + local->replies[i].postparent2; + } + } +} + + +static void +__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *poststat, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (poststat) + local->replies[child_index].poststat = *poststat; + if (preparent) + local->replies[child_index].preparent = *preparent; + if (postparent) + local->replies[child_index].postparent = *postparent; + if (preparent2) + local->replies[child_index].preparent2 = *preparent2; + if (postparent2) + local->replies[child_index].postparent2 = *postparent2; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + if (op_errno != ENOTEMPTY) + 
afr_transaction_fop_failed (frame, this, child_index); + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + + return; +} + + +static int +__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long) cookie; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_dir_write_fill (frame, this, child_index, op_ret, + op_errno, buf, preparent, postparent, + preparent2, postparent2, xdata); + } + UNLOCK (&frame->lock); + call_count = afr_frame_return (frame); + + if (call_count == 0) { + __afr_dir_write_finalize (frame, this); + + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret > -1) { - local->op_ret = op_ret; - - if ((local->success_count == 0) || - (child_index == local->read_child_index)) { - local->cont.dir_fop.preparent = *preparent; - local->cont.dir_fop.postparent = *postparent; - if (buf) - local->cont.dir_fop.buf = *buf; - if (prenewparent) - local->cont.dir_fop.prenewparent = *prenewparent; - if (postnewparent) - local->cont.dir_fop.postnewparent = *postnewparent; - } - - local->cont.dir_fop.inode = inode; - - local->fresh_children[local->success_count] = child_index; - local->success_count++; - local->child_errno[child_index] = 0; - } else { - local->child_errno[child_index] = op_errno; + afr_mark_entry_pending_changelog (frame, this); + + local->transaction.resume (frame, this); } - local->op_errno = op_errno; + return 0; } + int afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, + xlator_t *this, int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - 
int call_count = 0; + int call_count = 0; call_count = afr_frame_return (frame); - if (call_count == 0) { + + if (call_count == 0) AFR_STACK_DESTROY (frame); - } + return 0; } + void afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) { @@ -136,125 +275,109 @@ afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_local_t *new_local = NULL; afr_private_t *priv = NULL; - dict_t **xattr = NULL; + dict_t *xattr = NULL; int32_t **changelog = NULL; int i = 0; - GF_UNUSED int op_errno = 0; + int idx = 0; + int op_errno = ENOMEM; + unsigned char *pending = NULL; + int call_count = 0; local = frame->local; priv = this->private; new_frame = copy_frame (frame); - if (!new_frame) { + if (!new_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out); - new_local = new_frame->local; + new_local = AFR_FRAME_INIT (new_frame, op_errno); + if (!new_local) + goto out; + changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); if (!changelog) goto out; - xattr = GF_CALLOC (priv->child_count, sizeof (*xattr), - gf_afr_mt_dict_t); - if (!xattr) - goto out; - for (i = 0; i < priv->child_count; i++) { - if (local->child_errno[i]) - continue; - xattr[i] = dict_new (); - if (!xattr[i]) - goto out; - } + xattr = dict_new (); + if (!xattr) + goto out; + + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - afr_prepare_new_entry_pending_matrix (changelog, - afr_is_errno_set, - local->child_errno, - &local->cont.dir_fop.buf, - priv->child_count); + pending = alloca0 (priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + !local->transaction.failed_subvols[i]) { + call_count ++; + continue; + } + + changelog[i][idx] = hton32(1); + pending[i] = 1; + } new_local->pending = changelog; uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); - new_local->loc.inode = inode_ref (local->cont.dir_fop.inode); - new_local->call_count = 
local->success_count; + new_local->loc.inode = inode_ref (local->inode); + + + afr_set_pending_dict (priv, xattr, changelog); + + new_local->call_count = call_count; for (i = 0; i < priv->child_count; i++) { - if (local->child_errno[i]) + if (pending[i]) continue; - afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST); STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->xattrop, &new_local->loc, GF_XATTROP_ADD_ARRAY, - xattr[i], NULL); + xattr, NULL); + if (!--call_count) + break; } + new_frame = NULL; out: if (new_frame) AFR_STACK_DESTROY (new_frame); - afr_xattr_array_destroy (xattr, priv->child_count); + if (xattr) + dict_unref (xattr); return; } -gf_boolean_t -afr_is_new_entry_changelog_needed (glusterfs_fop_t fop) -{ - glusterfs_fop_t fops[] = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL}; - int i = 0; - - for (i = 0; fops[i] != GF_FOP_NULL; i++) { - if (fop == fops[i]) - return _gf_true; - } - return _gf_false; -} void -afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) +afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_count = 0; + int failed_count = 0; local = frame->local; priv = this->private; if (local->op_ret < 0) - goto out; + return; - if (local->success_count == priv->child_count) - goto out; + if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD) + return; - if (!afr_is_new_entry_changelog_needed (local->op)) - goto out; + pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); + failed_count = AFR_COUNT (local->transaction.failed_subvols, + priv->child_count); + + if (pre_op_count == priv->child_count && !failed_count) + return; afr_mark_new_entry_changelog (frame, this); -out: return; } -void -afr_dir_fop_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t 
*local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (local->cont.dir_fop.inode == NULL) - goto done; - afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode, - local->fresh_children, - local->read_child_index, - priv->read_child, - local->cont.dir_fop.buf.ia_gfid); -done: - local->transaction.unwind (frame, this); - afr_dir_fop_mark_entry_pending_changelog (frame, this); - local->transaction.resume (frame, this); -} /* {{{ create */ @@ -266,26 +389,16 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (create, main_frame, - local->op_ret, local->op_errno, - local->cont.create.fd, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - local->xdata_rsp); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + + AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, local->inode, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -297,175 +410,79 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t *local = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret > -1) { - ret = afr_fd_ctx_set (this, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not set ctx on fd=%p", fd); - - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - ret = 
fd_ctx_get (fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "could not get fd ctx for fd=%p", fd); - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - fd_ctx->flags = local->cont.create.flags; - - if (local->success_count == 0) { - if (xdata) - local->xdata_rsp = dict_ref(xdata); - } - } - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - -unlock: - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_create_wind (call_frame_t *frame, xlator_t *this) +afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->create, - &local->loc, - local->cont.create.flags, - local->cont.create.mode, - local->umask, - local->cont.create.fd, - local->xdata_req); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_create_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol, + 
priv->children[subvol], + priv->children[subvol]->fops->create, + &local->loc, local->cont.create.flags, + local->cont.create.mode, local->umask, + local->cont.create.fd, local->xdata_req); return 0; } int -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *params) +afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(create,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->fd_ctx = afr_fd_ctx_get (fd, this); + if (!local->fd_ctx) + goto out; + + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->op = GF_FOP_CREATE; local->cont.create.flags = flags; local->cont.create.mode = mode; local->cont.create.fd = fd_ref (fd); local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - local->transaction.fop = afr_create_wind; - local->transaction.done = afr_create_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + 
local->transaction.wind = afr_create_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_create_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -492,15 +509,13 @@ afr_create (call_frame_t *frame, xlator_t *this, goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (create, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -516,25 +531,14 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (mknod, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -545,131 +549,72 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return 
(frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_mknod_wind (call_frame_t *frame, xlator_t *this) +int +afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev, - local->umask, - local->xdata_req); - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_mknod_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev, local->umask, + local->xdata_req); return 0; } - int afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t dev, mode_t umask, dict_t *params) + dev_t dev, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; 
QUORUM_CHECK(mknod,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->op = GF_FOP_MKNOD; local->cont.mknod.mode = mode; local->cont.mknod.dev = dev; local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); - local->transaction.fop = afr_mknod_wind; - local->transaction.done = afr_mknod_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_mknod_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_mknod_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -692,19 +637,17 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (mknod, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -721,25 +664,14 @@ afr_mkdir_unwind 
(call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (mkdir, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -750,130 +682,71 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if 
(local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->mkdir, - &local->loc, local->cont.mkdir.mode, - local->umask, - local->xdata_req); - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->mkdir, &local->loc, + local->cont.mkdir.mode, local->umask, + local->xdata_req); return 0; } int -afr_mkdir_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *params) +afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(mkdir,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.mkdir.mode = mode; local->umask = umask; - if (params) - 
local->xdata_req = dict_ref (params); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_MKDIR; - local->transaction.fop = afr_mkdir_wind; - local->transaction.done = afr_mkdir_done; + local->transaction.wind = afr_mkdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_mkdir_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -896,20 +769,17 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -926,25 +796,14 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (link, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -955,127 +814,70 @@ 
afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_link_wind (call_frame_t *frame, xlator_t *this) +afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->link, - &local->loc, - &local->newloc, local->xdata_req); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->link, + &local->loc, &local->newloc, local->xdata_req); return 0; } int -afr_link_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t 
*newloc, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(link,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); + + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (newloc->parent); + if (xdata) - local->xdata_req = dict_ref (xdata); + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - LOCK (&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + if (!local->xdata_req) + goto out; local->op = GF_FOP_LINK; - local->transaction.fop = afr_link_wind; - local->transaction.done = afr_link_done; + + local->transaction.wind = afr_link_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_link_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, @@ -1098,18 +900,17 @@ afr_link (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND 
(link, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); return 0; } @@ -1126,25 +927,14 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (symlink, main_frame, - local->op_ret, local->op_errno, - local->cont.dir_fop.inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1155,132 +945,71 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - int call_count = -1; - int child_index = -1; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, inode, buf, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } int -afr_symlink_wind (call_frame_t *frame, xlator_t *this) +afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - 
call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->symlink, - local->cont.symlink.linkpath, - &local->loc, - local->umask, - local->xdata_req); - - if (!--call_count) - break; - - } - } - - return 0; -} - - -int -afr_symlink_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->symlink, + local->cont.symlink.linkpath, &local->loc, + local->umask, local->xdata_req); return 0; } int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) +afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(symlink,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); - - LOCK 
(&priv->read_child_lock); - { - local->read_child_index = (++priv->read_child_rr) - % (priv->child_count); - } - UNLOCK (&priv->read_child_lock); + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); local->cont.symlink.linkpath = gf_strdup (linkpath); local->umask = umask; - if (params) - local->xdata_req = dict_ref (params); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_SYMLINK; - local->transaction.fop = afr_symlink_wind; - local->transaction.done = afr_symlink_done; + local->transaction.wind = afr_symlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_symlink_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1303,19 +1032,17 @@ afr_symlink (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (symlink, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -1331,26 +1058,16 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rename, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - 
&local->cont.dir_fop.prenewparent, - &local->cont.dir_fop.postnewparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, local->xdata_rsp); return 0; } @@ -1362,131 +1079,72 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; - - local = frame->local; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) - afr_transaction_fop_failed (frame, this, child_index); - local->op_errno = op_errno; - local->child_errno[child_index] = op_errno; - - if (op_ret > -1) - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, buf, - preoldparent, postoldparent, - prenewparent, postnewparent); - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); } -int32_t -afr_rename_wind (call_frame_t *frame, xlator_t *this) +int +afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if 
(local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rename, - &local->loc, - &local->newloc, NULL); - if (!--call_count) - break; - } - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rename, + &local->loc, &local->newloc, local->xdata_req); return 0; } int -afr_rename_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata) +afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; + int op_errno = ENOMEM; int nlockee = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(rename,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) op_errno = ENOMEM; - goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); - local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); + local->inode = inode_ref (oldloc->inode); + local->parent = inode_ref (oldloc->parent); + local->parent2 = inode_ref (newloc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + 
local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_RENAME; - local->transaction.fop = afr_rename_wind; - local->transaction.done = afr_rename_done; + local->transaction.wind = afr_rename_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_rename_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, @@ -1536,20 +1194,17 @@ afr_rename (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - - AFR_STACK_UNWIND (rename, frame, -1, op_errno, - NULL, NULL, NULL, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL, NULL); return 0; } @@ -1565,23 +1220,13 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (unlink, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1591,123 +1236,69 @@ afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; 
- int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - - LOCK (&frame->lock); - { - if (child_index == local->read_child_index) { - local->read_child_returned = _gf_true; - } - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, NULL, - preparent, postparent, NULL, NULL); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } -int32_t -afr_unlink_wind (call_frame_t *frame, xlator_t *this) +int +afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->unlink, - &local->loc, local->xflag, - local->xdata_req); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int32_t -afr_unlink_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->unlink, + &local->loc, local->xflag, local->xdata_req); return 0; } -int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata) +int +afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { 
afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(unlink,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; loc_copy (&local->loc, loc); local->xflag = xflag; + + local->inode = inode_ref (loc->inode); + local->parent = inode_ref (loc->parent); + if (xdata) - local->xdata_req = dict_ref (xdata); + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_UNLINK; - local->transaction.fop = afr_unlink_wind; - local->transaction.done = afr_unlink_done; + local->transaction.wind = afr_unlink_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_unlink_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1730,19 +1321,16 @@ afr_unlink (call_frame_t *frame, xlator_t *this, int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (unlink, frame, -1, op_errno, - NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); 
return 0; } @@ -1760,23 +1348,13 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (rmdir, main_frame, - local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); return 0; } @@ -1786,130 +1364,71 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - afr_local_t * local = NULL; - int call_count = -1; - int child_index = (long) cookie; - int read_child = 0; - - local = frame->local; - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) - afr_transaction_fop_failed (frame, this, child_index); - local->op_errno = op_errno; - local->child_errno[child_index] = op_errno; - if (op_ret > -1) - __dir_entry_fop_common_cbk (frame, child_index, this, - op_ret, op_errno, NULL, NULL, - preparent, postparent, NULL, - NULL); - - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_dir_fop_done (frame, this); - - return 0; + return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int 
i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rmdir, - &local->loc, local->cont.rmdir.flags, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_rmdir_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->rmdir, + &local->loc, local->cont.rmdir.flags, local->xdata_req); return 0; } int -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, dict_t *xdata) +afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; + int op_errno = ENOMEM; int nlockee = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(rmdir,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - local->cont.rmdir.flags = flags; loc_copy (&local->loc, loc); + local->inode = inode_ref 
(loc->inode); + local->parent = inode_ref (loc->parent); + + local->cont.rmdir.flags = flags; + + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; local->op = GF_FOP_RMDIR; - local->transaction.fop = afr_rmdir_wind; - local->transaction.done = afr_rmdir_done; + local->transaction.wind = afr_rmdir_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_rmdir_unwind; ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, @@ -1944,18 +1463,16 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 0cfebcb9d..01e078c13 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -35,241 +35,153 @@ #include "compat-errno.h" #include "compat.h" -/** - * Common algorithm for inode read calls: - * - * - Try the fop on the first child that is up - * - if we have failed due to ENOTCONN: - * try the next child - * - * Applicable to: access, stat, fstat, readlink, getxattr - */ +#include "afr-transaction.h" + /* {{{ access */ -int32_t -afr_access_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) +int +afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - afr_private_t * 
priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.access.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - unwind = 0; - - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->access, - &local->loc, local->cont.access.mask, - NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); - } + AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); return 0; } -int32_t -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, - dict_t *xdata) +int +afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - 
op_errno = ENOMEM; - goto out; - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (access, frame, local->op_ret, + local->op_errno, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->access, + &local->loc, local->cont.access.mask, + local->xdata_req); + return 0; +} +int +afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, + int mask, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.access.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - loc_copy (&local->loc, loc); - local->cont.access.mask = mask; + local->op = GF_FOP_ACCESS; + loc_copy (&local->loc, loc); + local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_access_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->access, - loc, mask, xdata); + afr_read_txn (frame, this, loc->inode, afr_access_wind, + AFR_METADATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + return 0; } - /* }}} */ /* {{{ stat */ -int32_t +int afr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv 
= this->private; - children = priv->children; - - read_child = (long) cookie; + afr_local_t *local = NULL; local = frame->local; - if (op_ret == -1) { - last_index = &local->cont.stat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_stat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->stat, - &local->loc, NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); - } + AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); return 0; } -int32_t -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +int +afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->stat, + &local->loc, local->xdata_req); + return 0; +} - ret = afr_local_init (local, priv, 
&op_errno); - if (ret < 0) - goto out; +int +afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + int op_errno = 0; - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.stat.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&local->loc, loc); + local->op = GF_FOP_STAT; + loc_copy (&local->loc, loc); + if (xdata) + local->xdata_req = dict_ref (xdata); - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->stat, - loc, xdata); + afr_read_txn (frame, this, loc->inode, afr_stat_wind, + AFR_DATA_TRANSACTION); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -279,52 +191,49 @@ out: /* {{{ fstat */ -int32_t +int afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.fstat.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - 
local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - unwind = 0; + AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fstat, - local->fd, NULL); - } + return 0; +} -out: - if (unwind) { - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); - } - return 0; +int +afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fstat, + local->fd, local->xdata_req); + return 0; } @@ -332,68 +241,26 @@ int32_t afr_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int call_child = 0; - int32_t op_errno = 0; - int32_t read_child = 0; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (fd, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - VALIDATE_OR_GOTO (fd->inode, out); - - AFR_SBRAIN_CHECK_FD (fd, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } + afr_local_t *local = NULL; + int op_errno = 0; - read_child = afr_inode_get_read_ctx (this, 
fd->inode, - local->fresh_children); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_FSTAT; + local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_ref (xdata); + afr_fix_open (fd, this); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.fstat.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); + afr_read_txn (frame, this, fd->inode, afr_fstat_wind, + AFR_DATA_TRANSACTION); - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fstat, - fd, xdata); - - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); return 0; } @@ -402,117 +269,77 @@ out: /* {{{ readlink */ -int32_t +int afr_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, const char *buf, struct iatt *sbuf, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; + afr_local_t *local = NULL; - priv = this->private; - children = priv->children; + local = frame->local; - local = frame->local; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - read_child = (long) cookie; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - if (op_ret == -1) { - last_index = &local->cont.readlink.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) 
(long) read_child, - children[next_call_child], - children[next_call_child]->fops->readlink, - &local->loc, - local->cont.readlink.size, NULL); - } - -out: - if (unwind) { - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf, - xdata); - } + AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, + buf, sbuf, xdata); + return 0; +} - return 0; +int +afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (readlink, frame, local->op_ret, + local->op_errno, 0, 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readlink, + &local->loc, local->cont.readlink.size, + local->xdata_req); + return 0; } -int32_t +int afr_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; - afr_local_t *local = NULL; + afr_local_t * local = NULL; int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - children = priv->children; - - AFR_SBRAIN_CHECK_LOC (loc, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readlink.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } + 
local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_READLINK; loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + if (xdata) + local->xdata_req = dict_ref (xdata); - local->cont.readlink.size = size; - - STACK_WIND_COOKIE (frame, afr_readlink_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readlink, - loc, size, xdata); + afr_read_txn (frame, this, loc->inode, afr_readlink_wind, + AFR_DATA_TRANSACTION); - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL); return 0; +out: + AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); + + return 0; } @@ -550,7 +377,7 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value, void -__filter_xattrs (dict_t *dict) +afr_filter_xattrs (dict_t *dict) { struct list_head keys = {0,}; struct _xattr_key *key = NULL; @@ -571,59 +398,56 @@ __filter_xattrs (dict_t *dict) } - -int32_t +int afr_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; + afr_local_t *local = NULL; local = frame->local; - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - 
children[next_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, - NULL); - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + if (dict) + afr_filter_xattrs (dict); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); - } + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); return 0; } + +int +afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->getxattr, + &local->loc, local->cont.getxattr.name, + local->xdata_req); + return 0; +} + + int32_t afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, dict_t *dict, dict_t *xdata) @@ -659,7 +483,7 @@ afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, { callcnt = --local->call_count; if (op_ret == -1) - local->child_errno[cky] = op_errno; + local->replies[cky].op_errno = op_errno; if (!local->dict) local->dict = dict_new (); @@ -710,12 +534,10 @@ unlock: unwind: // Updating child_errno with more recent 'events' - local->child_errno[cky] = op_errno; - op_errno = afr_resultant_errno_get (NULL, local->child_errno, - priv->child_count); + op_errno = afr_final_errno (local, priv); + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, xdata); - if (xattr) dict_unref (xattr); } @@ -749,7 +571,7 @@ afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, { callcnt = --local->call_count; if (op_ret == -1) - local->child_errno[cky] = op_errno; + local->replies[cky].op_errno = op_errno; if (!local->dict) local->dict = dict_new (); @@ -800,9 +622,8 @@ unlock: unwind: // 
Updating child_errno with more recent 'events' - local->child_errno[cky] = op_errno; - op_errno = afr_resultant_errno_get (NULL, local->child_errno, - priv->child_count); + op_errno = afr_final_errno (local, priv); + AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); if (xattr) @@ -1411,7 +1232,7 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, } if (!strcmp (name, GF_XATTR_PATHINFO_KEY) || - !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { + !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { if (is_fgetxattr) { *cbk = afr_fgetxattr_pathinfo_cbk; } else { @@ -1442,18 +1263,16 @@ out: } static void -afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, - const char *name, loc_t *loc, - fop_getxattr_cbk_t cbk) +afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame, + const char *name, loc_t *loc, + fop_getxattr_cbk_t cbk) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - xlator_t **children = NULL; int i = 0; int call_count = 0; priv = this->private; - children = priv->children; local = frame->local; //local->call_count set in afr_local_init @@ -1465,8 +1284,8 @@ afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame, for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, cbk, - (void *) (long) i, children[i], - children[i]->fops->getxattr, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->getxattr, loc, name, NULL); if (!--call_count) break; @@ -1481,41 +1300,41 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, { afr_private_t *priv = NULL; xlator_t **children = NULL; - int call_child = 0; afr_local_t *local = NULL; xlator_list_t *trav = NULL; xlator_t **sub_volumes = NULL; int i = 0; int32_t op_errno = 0; - int32_t read_child = -1; int ret = -1; fop_getxattr_cbk_t cbk = NULL; int afr_xtime_gauge[MCNT_MAX] = {0,}; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + + local = 
AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); children = priv->children; - AFR_SBRAIN_CHECK_LOC (loc, out); + loc_copy (&local->loc, loc); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local->op = GF_FOP_GETXATTR; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + if (xdata) + local->xdata_req = dict_ref (xdata); - loc_copy (&local->loc, loc); if (!name) goto no_name; local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + if (!strncmp (name, AFR_XATTR_PREFIX, strlen (AFR_XATTR_PREFIX))) { gf_log (this->name, GF_LOG_INFO, @@ -1559,8 +1378,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, * collect information from all childs */ if (afr_is_special_xattr (name, &cbk, 0)) { - afr_getxattr_frm_all_children (this, frame, name, - loc, cbk); + afr_getxattr_all_subvols (this, frame, name, loc, cbk); return 0; } @@ -1615,28 +1433,9 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, } no_name: - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - read_child = afr_inode_get_read_ctx (this, loc->inode, - local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->getxattr, - loc, name, xdata); + afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind, + AFR_METADATA_TRANSACTION); ret = 0; out: @@ -1653,76 +1452,60 @@ afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = 
NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - priv = this->private; - children = priv->children; - - local = frame->local; + afr_local_t *local = NULL; - read_child = (long) cookie; + local = frame->local; - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - unwind = 0; - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->fgetxattr, - local->fd, - local->cont.getxattr.name, - NULL); - } + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); + if (dict) + afr_filter_xattrs (dict); - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, - xdata); - } + AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + return 0; } -int32_t -afr_fgetxattr_unwind (call_frame_t *frame, - int op_ret, int op_errno, dict_t *dict, dict_t *xdata) - +int +afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, + local->op_errno, NULL, NULL); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + local->xdata_req); + return 0; } + static
void -afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, - const char *name, fd_t *fd, - fop_fgetxattr_cbk_t cbk) +afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame, + fop_fgetxattr_cbk_t cbk) { afr_private_t *priv = NULL; afr_local_t *local = NULL; - xlator_t **children = NULL; int i = 0; int call_count = 0; priv = this->private; - children = priv->children; local = frame->local; //local->call_count set in afr_local_init @@ -1735,9 +1518,10 @@ afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, if (local->child_up[i]) { STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - children[i], - children[i]->fops->fgetxattr, - fd, name, NULL); + priv->children[i], + priv->children[i]->fops->fgetxattr, + local->fd, local->cont.getxattr.name, + NULL); if (!--call_count) break; } @@ -1746,42 +1530,30 @@ afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame, return; } -int32_t + +int afr_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata) { - afr_private_t *priv = NULL; - xlator_t **children = NULL; - int call_child = 0; afr_local_t *local = NULL; - int32_t op_ret = -1; int32_t op_errno = 0; - int32_t read_child = -1; fop_fgetxattr_cbk_t cbk = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; - - AFR_SBRAIN_CHECK_FD (fd, out); - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - frame->local = local; - - op_ret = afr_local_init (local, priv, &op_errno); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + local->op = GF_FOP_FGETXATTR; local->fd = fd_ref (fd); - if (name) + if (name) { local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } + } + if (xdata) + local->xdata_req = dict_ref 
(xdata); /* pathinfo gets handled only in getxattr(), but we need to handle * lockinfo. @@ -1789,42 +1561,19 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this, * collect information from all children. */ if (afr_is_special_xattr (name, &cbk, 1)) { - afr_fgetxattr_frm_all_children (this, frame, name, - fd, cbk); + afr_fgetxattr_all_subvols (this, frame, cbk); return 0; } + afr_fix_open (fd, this); - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, - local->fresh_children); - op_ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (op_ret < 0) { - op_errno = -op_ret; - op_ret = -1; - goto out; - } - - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->fgetxattr, - fd, name, xdata); + afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind, + AFR_METADATA_TRANSACTION); - op_ret = 0; + return 0; out: - if (op_ret == -1) { - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, - NULL); - } + AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); + return 0; } @@ -1833,144 +1582,84 @@ out: /* {{{ readv */ -/** - * read algorithm: - * - * if the user has specified a read subvolume, use it - * otherwise - - * use the inode number to hash it to one of the subvolumes, and - * read from there (to balance read load) - * - * if any of the above read's fail, try the children in sequence - * beginning at the beginning - */ - -int32_t +int afr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iovec *vector, int32_t count, struct iatt *buf, struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int unwind = 1; - int32_t 
*last_index = NULL; - int32_t next_call_child = -1; - int32_t *fresh_children = NULL; - int32_t read_child = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); - - children = priv->children; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - read_child = (long) cookie; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - if (op_ret == -1) { - last_index = &local->cont.readv.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; + afr_read_txn_continue (frame, this, (long) cookie); + return 0; + } - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset, - local->cont.readv.flags, - NULL); - } + AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, + vector, count, buf, iobref, xdata); + return 0; +} -out: - if (unwind) { - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref, xdata); - } - return 0; +int +afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno, + 0, 0, 0, 0, 0); + return 0; + } + + STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset, local->cont.readv.flags, + local->xdata_req); + return 0; } -int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t 
offset, uint32_t flags, dict_t *xdata) +int +afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; afr_local_t * local = NULL; - xlator_t ** children = NULL; - int call_child = 0; int32_t op_errno = 0; - int32_t read_child = -1; - int ret = -1; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - children = priv->children; - AFR_SBRAIN_CHECK_FD (fd, out); + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) { - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); - ret = afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.readv.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - local->fd = fd_ref (fd); - - local->cont.readv.size = size; - local->cont.readv.offset = offset; - local->cont.readv.flags = flags; + local->op = GF_FOP_READ; + local->fd = fd_ref (fd); + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + if (xdata) + local->xdata_req = dict_ref (xdata); - afr_open_fd_fix (fd, this); + afr_fix_open (fd, this); - STACK_WIND_COOKIE (frame, afr_readv_cbk, - (void *) (long) call_child, - children[call_child], - children[call_child]->fops->readv, - fd, size, offset, flags, xdata); + afr_read_txn (frame, this, fd->inode, afr_readv_wind, + AFR_DATA_TRANSACTION); - ret = 0; -out: - if (ret < 0) { - AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, - NULL, NULL); - } return 0; +out: + 
AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index d62847def..3dacfc8dd 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -37,46 +37,128 @@ #include "afr.h" #include "afr-transaction.h" -#include "afr-self-heal-common.h" +//#include "afr-self-heal-common.h" -void -__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, - xlator_t *this, int32_t *op_ret, int32_t *op_errno, - struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) + +static void +__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = 0; + int i = 0; + + local = frame->local; + priv = this->private; + + if (local->inode) { + if (local->transaction.type == AFR_METADATA_TRANSACTION) + read_subvol = afr_metadata_subvol_get (local->inode, this, + NULL, NULL); + else + read_subvol = afr_data_subvol_get (local->inode, this, + NULL, NULL); + } + + local->op_ret = -1; + local->op_errno = afr_final_errno (local, priv); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + afr_inode_read_subvol_reset (local->inode, this); + continue; + } + + /* Order of checks in the compound conditional + below is important. 
+ + - Highest precedence: largest op_ret + - Next precedence: if all op_rets are equal, read subvol + - Least precedence: any succeeded subvol + */ + if ((local->op_ret < local->replies[i].op_ret) || + ((local->op_ret == local->replies[i].op_ret) && + (i == read_subvol))) { + + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.inode_wfop.prebuf = + local->replies[i].prestat; + local->cont.inode_wfop.postbuf = + local->replies[i].poststat; + + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref (local->xdata_rsp); + local->xdata_rsp = + dict_ref (local->replies[i].xdata); + } + } + } +} + + +static void +__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; local = frame->local; - if (afr_fop_failed (*op_ret, *op_errno)) { - local->child_errno[child_index] = *op_errno; - - switch (local->op) { - case GF_FOP_TRUNCATE: - case GF_FOP_FTRUNCATE: - if (*op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, - child_index); - break; - default: - afr_transaction_fop_failed (frame, this, child_index); - break; - } - local->op_errno = *op_errno; - goto out; + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + if (op_ret >= 0) { + if (prebuf) + local->replies[child_index].prestat = *prebuf; + if (postbuf) + local->replies[child_index].poststat = *postbuf; + if (xdata) + local->replies[child_index].xdata = dict_ref (xdata); + } else { + afr_transaction_fop_failed (frame, this, child_index); + } + + return; +} + + +static int +__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index
= (long) cookie; + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); - if ((local->success_count == 0) || (read_child == child_index)) { - local->op_ret = *op_ret; - if (prebuf) - local->cont.inode_wfop.prebuf = *prebuf; - if (postbuf) - local->cont.inode_wfop.postbuf = *postbuf; + if (call_count == 0) { + __afr_inode_write_finalize (frame, this); + + if (afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); } - local->success_count++; -out: - return; + return 0; } /* {{{ writev */ @@ -94,6 +176,8 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) dst_local->op_errno = src_local->op_errno; dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; + if (src_local->xdata_rsp) + dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp); } void @@ -106,26 +190,9 @@ afr_writev_unwind (call_frame_t *frame, xlator_t *this) local->op_ret, local->op_errno, &local->cont.inode_wfop.prebuf, &local->cont.inode_wfop.postbuf, - NULL); + local->xdata_rsp); } -call_frame_t* -afr_transaction_detach_fop_frame (call_frame_t *frame) -{ - afr_local_t * local = NULL; - call_frame_t *fop_frame = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - fop_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - return fop_frame; -} int afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) @@ -173,82 +240,60 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t *priv = NULL; call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int 
call_count = -1; - int read_child = 0; - int ret = 0; + int ret = 0; uint32_t open_fd_count = 0; uint32_t write_is_append = 0; local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - - - /* stage the best case return value for unwind */ - if ((local->success_count == 0) || (op_ret > local->op_ret)) { - local->op_ret = op_ret; - local->op_errno = op_errno; - } - - if (op_ret != -1) { - if (xdata) { - ret = dict_get_uint32 (xdata, - GLUSTERFS_OPEN_FD_COUNT, - &open_fd_count); - if ((ret == 0) && - (open_fd_count > local->open_fd_count)) { - local->open_fd_count = open_fd_count; - local->update_open_fd_count = _gf_true; - } - - write_is_append = 0; - ret = dict_get_uint32 (xdata, - GLUSTERFS_WRITE_IS_APPEND, - &write_is_append); - if (ret || !write_is_append) - local->append_write = _gf_false; - } - + __afr_inode_write_fill (frame, this, child_index, op_ret, + op_errno, prebuf, postbuf, xdata); + if (op_ret == -1 || !xdata) + goto unlock; + + write_is_append = 0; + ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + + ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if (ret == -1) + goto unlock; + if ((open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; } } +unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - - if (local->update_open_fd_count) - afr_handle_open_fd_count (frame, this); - - if (!local->stable_write && 
!local->append_write) + if (!local->stable_write && !local->append_write) /* An appended write removes the necessity to fsync() the file. This is because self-heal has the logic to check for larger file when the xattrs are not reliably pointing at a stale file. */ - afr_fd_report_unstable_write (this, local->fd); + afr_fd_report_unstable_write (this, local->fd); + + __afr_inode_write_finalize (frame, this); afr_writev_handle_short_writes (frame, this); - if (afr_any_fops_failed (local, priv)) { + + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!afr_txn_nothing_failed (frame, this)) { //Don't unwind until post-op is complete local->transaction.resume (frame, this); } else { @@ -272,91 +317,23 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } + int -afr_writev_wind (call_frame_t *frame, xlator_t *this) +afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; - dict_t *xdata = NULL; - GF_UNUSED int ret = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), - gf_afr_mt_reply_t); - if (!local->replies) { - local->op_ret = -1; - local->op_errno = ENOMEM; - local->transaction.unwind(frame, this); - local->transaction.resume(frame, this); - return 0; - } - - xdata = dict_new (); - if (xdata) { - ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, - sizeof (uint32_t)); - ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, - 0); - /* Set append_write to be true speculatively. If on any - server it turns not be true, we unset it in the - callback. 
- */ - local->append_write = _gf_true; - } - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - local->fd, - local->cont.writev.vector, - local->cont.writev.count, - local->cont.writev.offset, - local->cont.writev.flags, - local->cont.writev.iobref, - xdata); - - if (!--call_count) - break; - } - } - - if (xdata) - dict_unref (xdata); - - return 0; -} - - -int -afr_writev_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - iobref_unref (local->cont.writev.iobref); - local->cont.writev.iobref = NULL; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->writev, + local->fd, local->cont.writev.vector, + local->cont.writev.count, local->cont.writev.offset, + local->cont.writev.flags, local->cont.writev.iobref, + local->xdata_req); return 0; } @@ -366,29 +343,29 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) { call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; - int op_ret = -1; - int op_errno = 0; - - local = frame->local; + int ret = -1; + int op_errno = ENOMEM; transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } + local = frame->local; transaction_frame->local = local; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + frame->local = NULL; - local->op = GF_FOP_WRITE; + if (!AFR_FRAME_INIT (frame, op_errno)) + goto out; - local->success_count = 0; + local->op = GF_FOP_WRITE; - local->transaction.fop = afr_writev_wind; - local->transaction.done = afr_writev_done; + local->transaction.wind = afr_writev_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; 
local->transaction.unwind = afr_transaction_writev_unwind; local->transaction.main_frame = frame; + if (local->fd->flags & O_APPEND) { /* * Backend vfs ignores the 'offset' for append mode fd so @@ -405,179 +382,86 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->cont.writev.count); } - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; goto out; } - op_ret = 0; + return 0; out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } -static void -afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) -{ - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - char *reason = NULL; - int32_t op_errno = 0; - int ret = 0; - - if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " - "fd: %p, inode: %p", fd, - fd ? 
fd->inode : NULL); - goto out; - } - - frame = create_frame (this, this->ctx->pool); - if (!frame) - goto out; - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, this->private, &op_errno); - if (ret < 0) - goto out; - - local->loc.inode = inode_ref (fd->inode); - ret = loc_path (&local->loc, NULL); - if (ret < 0) - goto out; - - sh = &local->self_heal; - sh->do_metadata_self_heal = _gf_true; - if (fd->inode->ia_type == IA_IFREG) - sh->do_data_self_heal = _gf_true; - else if (fd->inode->ia_type == IA_IFDIR) - sh->do_entry_self_heal = _gf_true; - - reason = "subvolume came online"; - afr_launch_self_heal (frame, this, fd->inode, _gf_true, - fd->inode->ia_type, reason, NULL, NULL); - return; -out: - AFR_STACK_DESTROY (frame); -} - -void -afr_open_fd_fix (fd_t *fd, xlator_t *this) -{ - int ret = 0; - int i = 0; - afr_fd_ctx_t *fd_ctx = NULL; - gf_boolean_t need_self_heal = _gf_false; - int *need_open = NULL; - size_t need_open_count = 0; - afr_private_t *priv = NULL; - - priv = this->private; - - if (!afr_is_fd_fixable (fd)) - goto out; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; - - LOCK (&fd->lock); - { - if (fd_ctx->up_count < priv->up_count) { - need_self_heal = _gf_true; - fd_ctx->up_count = priv->up_count; - fd_ctx->down_count = priv->down_count; - } - - need_open = alloca (priv->child_count * sizeof (*need_open)); - for (i = 0; i < priv->child_count; i++) { - need_open[i] = 0; - if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) - continue; - - if (!priv->child_up[i]) - continue; - - fd_ctx->opened_on[i] = AFR_FD_OPENING; - - need_open[i] = 1; - need_open_count++; - } - } - UNLOCK (&fd->lock); - if (ret) - goto out; - - if (need_self_heal) - afr_trigger_open_fd_self_heal (fd, this); - - if (!need_open_count) - goto out; - - afr_fix_open (this, fd, need_open_count, need_open); -out: - return; -} int afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t 
count, off_t offset, uint32_t flags, struct iobref *iobref, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int op_errno = ENOMEM; priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(writev,out); - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - local->cont.writev.vector = iov_dup (vector, count); + local->cont.writev.vector = iov_dup (vector, count); + if (!local->cont.writev.vector) + goto out; local->cont.writev.count = count; local->cont.writev.offset = offset; local->cont.writev.flags = flags; local->cont.writev.iobref = iobref_ref (iobref); - local->fd = fd_ref (fd); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { + op_errno = ENOMEM; + goto out; + } + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; + goto out; + } + + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. 
+ */ + local->append_write = _gf_true; /* detect here, but set it in writev_wind_cbk *after* the unstable write is performed */ local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); - afr_open_fd_fix (fd, this); + afr_fix_open (fd, this); afr_do_writev (frame, this); - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -595,22 +479,13 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -620,96 +495,32 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - - local = frame->local; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (op_ret != -1) { - if (prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + 
afr_local_t *local = NULL; - if (call_count == 0) { - if (local->stable_write && afr_txn_nothing_failed (frame, this)) - local->transaction.unwind (frame, this); + local = frame->local; - local->transaction.resume (frame, this); - } + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -int32_t -afr_truncate_wind (call_frame_t *frame, xlator_t *this) +int +afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - local->stable_write = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->truncate, - &local->loc, - local->cont.truncate.offset, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_truncate_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->truncate, + &local->loc, local->cont.truncate.offset, + local->xdata_req); return 0; } @@ -721,56 +532,60 @@ afr_truncate (call_frame_t *frame, xlator_t *this, afr_private_t * priv = NULL; afr_local_t * local = NULL; call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - 
VALIDATE_OR_GOTO (this->private, out); + int ret = -1; + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(truncate,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.truncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_truncate_wind; - local->transaction.done = afr_truncate_done; + local->transaction.wind = afr_truncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_truncate_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_TRUNCATE; local->transaction.main_frame = frame; local->transaction.start = offset; local->transaction.len = 0; + /* Set it true speculatively, will get reset in afr_truncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -788,21 +603,13 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = 
local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -812,122 +619,75 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - int child_index = (long) cookie; - int call_count = -1; - int read_child = 0; + afr_local_t *local = NULL; - local = frame->local; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (op_ret != -1) { - if (prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + local = frame->local; - if (call_count == 0) { - if (local->stable_write && afr_txn_nothing_failed (frame, this)) - local->transaction.unwind (frame, this); - - local->transaction.resume (frame, this); - } + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int 
i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - local->stable_write = _gf_true; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - local->fd, - local->cont.ftruncate.offset, - NULL); - - if (!--call_count) - break; - } - } + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset, + local->xdata_req); return 0; } int -afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); + priv = this->private; - return 0; -} + QUORUM_CHECK(ftruncate,out); + transaction_frame = copy_frame (frame); + if (!frame) + goto out; -int -afr_do_ftruncate (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; - local = frame->local; + local->cont.ftruncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (!local->xdata_req) + goto out; - 
transaction_frame->local = local; - frame->local = NULL; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); local->op = GF_FOP_FTRUNCATE; - local->transaction.fop = afr_ftruncate_wind; - local->transaction.done = afr_ftruncate_done; + local->transaction.wind = afr_ftruncate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_ftruncate_unwind; local->transaction.main_frame = frame; @@ -935,69 +695,21 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.ftruncate.offset; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - - -int -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_fix_open (fd, this); - priv = this->private; + /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(ftruncate,out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->cont.ftruncate.offset = offset; - - local->fd = fd_ref (fd); - afr_open_fd_fix 
(fd, this); - - afr_do_ftruncate (frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - } + AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1009,173 +721,92 @@ out: int afr_setattr_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + local->xdata_rsp); return 0; } int afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, + int op_ret, int op_errno, struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, preop, postop, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - 
} - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); } -int32_t -afr_setattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, - &local->cont.setattr.in_buf, - local->cont.setattr.valid, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setattr, + &local->loc, &local->cont.setattr.in_buf, + local->cont.setattr.valid, local->xdata_req); return 0; } int -afr_setattr_done (call_frame_t *frame, xlator_t *this) +afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - - -int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + call_frame_t 
*transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; QUORUM_CHECK(setattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.setattr.in_buf = *buf; local->cont.setattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_setattr_wind; - local->transaction.done = afr_setattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_setattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_setattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_SETATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1183,18 +814,16 @@ afr_setattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1208,22 +837,13 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { 
- if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; + AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } @@ -1233,149 +853,72 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *preop, struct iatt *postop, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int read_child = 0; - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, preop, postop, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + preop, postop, xdata); } -int32_t -afr_fsetattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; 
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetattr, - local->fd, - &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetattr, + local->fd, &local->cont.fsetattr.in_buf, + local->cont.fsetattr.valid, local->xdata_req); return 0; } -int -afr_fsetattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - int afr_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(fsetattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + 
if (!local) + goto out; local->cont.fsetattr.in_buf = *buf; local->cont.fsetattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; - local->transaction.fop = afr_fsetattr_wind; - local->transaction.done = afr_fsetattr_done; + local->transaction.wind = afr_fsetattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fsetattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - afr_open_fd_fix (fd, this); + local->op = GF_FOP_FSETATTR; + + afr_fix_open (fd, this); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1383,18 +926,16 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); return 0; } @@ -1410,19 +951,12 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1431,95 +965,32 @@ int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->setxattr, + &local->loc, local->cont.setxattr.dict, + local->cont.setxattr.flags, local->xdata_req); + return 0; +} - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = frame->local; - - local->transaction.unwind 
(frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} int -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1527,59 +998,60 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, int ret = -1; int op_errno = EINVAL; - VALIDATE_OR_GOTO (this, out); - GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, op_errno, out); GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - priv = this->private; QUORUM_CHECK(setxattr,out); + transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; local->cont.setxattr.dict = dict_ref (dict); local->cont.setxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_setxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_setxattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; + local->op = GF_FOP_SETXATTR; + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + 
goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); return 0; } @@ -1595,19 +1067,12 @@ afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fsetxattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1616,93 +1081,29 @@ int afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } int -afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i 
= 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsetxattr, - local->fd, - local->cont.fsetxattr.dict, - local->cont.fsetxattr.flags, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetxattr, + local->fd, local->cont.fsetxattr.dict, + local->cont.fsetxattr.flags, local->xdata_req); return 0; } -int -afr_fsetxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - int afr_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) @@ -1711,11 +1112,7 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = EINVAL; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + int op_errno = ENOMEM; GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, op_errno, out); @@ -1725,36 +1122,36 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(fsetxattr,out); - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - - 
transaction_frame->local = local; - local->op_ret = -1; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.fsetxattr.dict = dict_ref (dict); local->cont.fsetxattr.flags = flags; - local->transaction.fop = afr_fsetxattr_wind; - local->transaction.done = afr_fsetxattr_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fsetxattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fsetxattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FSETXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1762,18 +1159,16 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); return 0; } @@ -1791,19 +1186,12 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (removexattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1812,88 +1200,25 
@@ int afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } -int32_t -afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, - local->cont.removexattr.name, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_removexattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - AFR_STACK_DESTROY (frame); + local = frame->local; + priv = this->private; + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol, + 
priv->children[subvol], + priv->children[subvol]->fops->removexattr, + &local->loc, local->cont.removexattr.name, + local->xdata_req); return 0; } @@ -1906,9 +1231,7 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, afr_local_t *local = NULL; call_frame_t *transaction_frame = NULL; int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (this, out); + int op_errno = ENOMEM; GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", name, op_errno, out); @@ -1916,34 +1239,37 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", name, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); - priv = this->private; QUORUM_CHECK(removexattr,out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; + if (!transaction_frame) goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out); - local = transaction_frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; local->cont.removexattr.name = gf_strdup (name); - local->transaction.fop = afr_removexattr_wind; - local->transaction.done = afr_removexattr_done; + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_removexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_removexattr_unwind; loc_copy (&local->loc, loc); + local->inode = inode_ref (loc->inode); + + local->op = GF_FOP_REMOVEXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1951,18 +1277,16 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); if (ret < 0) { - 
op_errno = -ret; - goto out; + op_errno = -ret; + goto out; } - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); return 0; } @@ -1975,19 +1299,12 @@ afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fremovexattr, main_frame, - local->op_ret, local->op_errno, - NULL); - } + AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); return 0; } @@ -1996,105 +1313,38 @@ int afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - __inode_write_fop_cbk (frame, child_index, -1, this, - &op_ret, &op_errno, NULL, NULL, - xdata); - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + NULL, NULL, xdata); } -int32_t -afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this) +int +afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) { 
afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fremovexattr, - local->fd, - local->cont.removexattr.name, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -int -afr_fremovexattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - + STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fremovexattr, + local->fd, local->cont.removexattr.name, + local->xdata_req); return 0; } int -afr_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) +afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; int ret = -1; - int op_ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (this, out); + int op_errno = ENOMEM; GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", name, op_errno, out); @@ -2102,64 +1352,59 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", name, op_errno, out); - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - if (afr_is_split_brain (this, fd->inode)) { - op_errno 
= EIO; - goto out; - } + priv = this->private; QUORUM_CHECK(fremovexattr, out); transaction_frame = copy_frame (frame); - if (!transaction_frame) { + if (!transaction_frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - op_errno = -ret; + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; local->cont.removexattr.name = gf_strdup (name); + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - local->transaction.fop = afr_fremovexattr_wind; - local->transaction.done = afr_fremovexattr_done; + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_fremovexattr_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fremovexattr_unwind; local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); + + local->op = GF_FOP_FREMOVEXATTR; local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - op_ret = 0; + return 0; out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); return 0; } -static int + +int afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; @@ -2167,147 +1412,88 @@ afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); 
- { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int + +int afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); +} - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); +int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); + local = frame->local; + priv = this->private; - if (need_unwind) - local->transaction.unwind (frame, this); + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->fallocate, + local->fd, local->cont.fallocate.mode, + local->cont.fallocate.offset, + 
local->cont.fallocate.len, local->xdata_req); + return 0; +} - call_count = afr_frame_return (frame); - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} - -static int -afr_fallocate_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; - - local = frame->local; - priv = this->private; - - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fallocate, - local->fd, - local->cont.fallocate.mode, - local->cont.fallocate.offset, - local->cont.fallocate.len, - NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - -static int -afr_fallocate_done (call_frame_t *frame, xlator_t *this) +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; + call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + QUORUM_CHECK(fallocate,out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (transaction_frame, op_errno); + if (!local) + goto out; -static int -afr_do_fallocate (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; - local = 
frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_FALLOCATE; - local->transaction.fop = afr_fallocate_wind; - local->transaction.done = afr_fallocate_done; + local->transaction.wind = afr_fallocate_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_fallocate_unwind; local->transaction.main_frame = frame; @@ -2315,80 +1501,29 @@ afr_do_fallocate (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.fallocate.offset; local->transaction.len = 0; - /* fallocate can modify the file size */ - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, - NULL, NULL); - } + afr_fix_open (fd, this); - return 0; -} - -int -afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(fallocate,out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = 
frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->cont.fallocate.mode = mode; - local->cont.fallocate.offset = offset; - local->cont.fallocate.len = len; - - local->fd = fd_ref (fd); - afr_open_fd_fix (fd, this); - - afr_do_fallocate (frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); return 0; } + /* }}} */ /* {{{ discard */ -static int +int afr_discard_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; @@ -2396,146 +1531,86 @@ afr_discard_unwind (call_frame_t *frame, xlator_t *this) local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (discard, main_frame, local->op_ret, - local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - NULL); - } + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int + +int afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - - LOCK (&frame->lock); - { - if 
(child_index == read_child) { - local->read_child_returned = _gf_true; - } - - __inode_write_fop_cbk (frame, child_index, read_child, this, - &op_ret, &op_errno, prebuf, postbuf, - xdata); - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -static int -afr_discard_wind (call_frame_t *frame, xlator_t *this) + +int +afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->discard, - local->fd, - local->cont.discard.offset, - local->cont.discard.len, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->discard, + local->fd, local->cont.discard.offset, + local->cont.discard.len, local->xdata_req); return 0; } -static int -afr_discard_done (call_frame_t *frame, xlator_t *this) + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int 
op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + QUORUM_CHECK(discard, out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; -static int -afr_do_discard (call_frame_t *frame, xlator_t *this) -{ - call_frame_t * transaction_frame = NULL; - afr_local_t * local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.discard.offset = offset; + local->cont.discard.len = len; - local = frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local = local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_DISCARD; - local->transaction.fop = afr_discard_wind; - local->transaction.done = afr_discard_done; + local->transaction.wind = afr_discard_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_discard_unwind; local->transaction.main_frame = frame; @@ -2543,316 +1618,134 @@ afr_do_discard (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.discard.offset; local->transaction.len = 0; - op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} + afr_fix_open (fd, this); -int -afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) -{ - afr_private_t * priv = NULL; - 
afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; } - QUORUM_CHECK(discard, out); - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - - local->cont.discard.offset = offset; - local->cont.discard.len = len; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - - afr_do_discard(frame, this); - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); - AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); return 0; } /* {{{ zerofill */ -static int +int afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; local = frame->local; - LOCK (&frame->lock); - { - if (local->transaction.main_frame) { - main_frame = local->transaction.main_frame; - } - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); + main_frame = afr_transaction_detach_fop_frame (frame); + if (!main_frame) + return 0; - if (main_frame) { - AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, - local->op_errno, - &local->cont.zerofill.prebuf, - &local->cont.zerofill.postbuf, - NULL); - } + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); return 0; } -static int 
-afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - int need_unwind = 0; - int read_child = 0; - - local = frame->local; - priv = this->private; - - read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); - LOCK (&frame->lock); - { - if (child_index == read_child) { - local->read_child_returned = _gf_true; - } - - if (afr_fop_failed (op_ret, op_errno)) { - afr_transaction_fop_failed (frame, this, child_index); - } - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.zerofill.prebuf = *prebuf; - local->cont.zerofill.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.zerofill.prebuf = *prebuf; - local->cont.zerofill.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } - } - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) { - local->transaction.unwind (frame, this); - } - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; +int +afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, + prebuf, postbuf, xdata); } -static int -afr_zerofill_wind (call_frame_t *frame, xlator_t *this) + +int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; priv = this->private; - 
call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->zerofill, - local->fd, - local->cont.zerofill.offset, - local->cont.zerofill.len, - NULL); - - if (!--call_count) - break; - } - } - + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol, + priv->children[subvol], + priv->children[subvol]->fops->zerofill, + local->fd, local->cont.zerofill.offset, + local->cont.zerofill.len, local->xdata_req); return 0; } -static int -afr_zerofill_done (call_frame_t *frame, xlator_t *this) +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { + afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; + priv = this->private; - local->transaction.unwind (frame, this); + QUORUM_CHECK(discard, out); - AFR_STACK_DESTROY (frame); + transaction_frame = copy_frame (frame); + if (!transaction_frame) + goto out; - return 0; -} + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; -static int -afr_do_zerofill(call_frame_t *frame, xlator_t *this) -{ - call_frame_t *transaction_frame = NULL; - afr_local_t *local = NULL; - int op_ret = -1; - int op_errno = 0; + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; - local = frame->local; + local->fd = fd_ref (fd); + local->inode = inode_ref (fd->inode); - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - goto out; - } + if (xdata) + local->xdata_req = dict_copy_with_ref (xdata, NULL); + else + local->xdata_req = dict_new (); - transaction_frame->local 
= local; - frame->local = NULL; + if (!local->xdata_req) + goto out; local->op = GF_FOP_ZEROFILL; - local->transaction.fop = afr_zerofill_wind; - local->transaction.done = afr_zerofill_done; + local->transaction.wind = afr_zerofill_wind; + local->transaction.fop = __afr_txn_write_fop; + local->transaction.done = __afr_txn_write_done; local->transaction.unwind = afr_zerofill_unwind; local->transaction.main_frame = frame; - local->transaction.start = local->cont.zerofill.offset; - local->transaction.len = 0; - - op_ret = afr_transaction (transaction_frame, this, - AFR_DATA_TRANSACTION); - if (op_ret < 0) { - op_errno = -op_ret; - goto out; - } - - op_ret = 0; -out: - if (op_ret < 0) { - if (transaction_frame) { - AFR_STACK_DESTROY (transaction_frame); - } - AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, - NULL, NULL); - } - - return 0; -} - -int -afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - off_t len, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - priv = this->private; - - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } - QUORUM_CHECK(zerofill, out); + local->transaction.start = local->cont.discard.offset; + local->transaction.len = len; - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + afr_fix_open (fd, this); - ret = afr_local_init (local, priv, &op_errno); + ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); if (ret < 0) { - goto out; + op_errno = -ret; + goto out; } - local->cont.zerofill.offset = offset; - local->cont.zerofill.len = len; - - local->fd = fd_ref (fd); - - afr_open_fd_fix (fd, this); - afr_do_zerofill(frame, this); - - ret = 0; + return 0; out: - if (ret < 0) { - if (transaction_frame) { - AFR_STACK_DESTROY 
(transaction_frame); - } - AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, - NULL, NULL); - } + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 060d78f35..a2a758f35 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -580,22 +580,6 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) return 0; } -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) -{ - int ret = 0; - - ret = uuid_compare (l1->inode->gfid, l2->inode->gfid); - - if (ret == 0) - ret = strcmp (b1, b2); - - if (ret <= 0) - return l1; - else - return l2; -} - int afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) { @@ -1213,8 +1197,7 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - up_count = afr_up_children_count (local->child_up, - priv->child_count); + up_count = AFR_COUNT (local->child_up, priv->child_count); int_lock->lk_call_count = int_lock->lk_expected_count = (int_lock->lockee_count * up_count); @@ -1647,496 +1630,6 @@ afr_unlock (call_frame_t *frame, xlator_t *this) return 0; } -int -afr_mark_locked_nodes (xlator_t *this, fd_t *fd, - unsigned char *locked_nodes) -{ - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - priv = this->private; - - ret = afr_fd_ctx_set (this, fd); - if (ret) - goto out; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "failed to get the fd ctx"); - goto out; - } - fdctx = (afr_fd_ctx_t *) (long) tmp; - - GF_ASSERT (fdctx->locked_on); - - memcpy (fdctx->locked_on, locked_nodes, - priv->child_count); - -out: - return ret; -} - -static int -__is_fd_saved (xlator_t *this, fd_t *fd) -{ - afr_locked_fd_t 
*locked_fd = NULL; - afr_private_t *priv = NULL; - int found = 0; - - priv = this->private; - - list_for_each_entry (locked_fd, &priv->saved_fds, list) { - if (locked_fd->fd == fd) { - found = 1; - break; - } - } - - return found; -} - -static int -__afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - afr_locked_fd_t *locked_fd = NULL; - int ret = 0; - - priv = this->private; - - locked_fd = GF_CALLOC (1, sizeof (*locked_fd), - gf_afr_mt_locked_fd); - if (!locked_fd) { - ret = -1; - goto out; - } - - locked_fd->fd = fd; - INIT_LIST_HEAD (&locked_fd->list); - - list_add_tail (&locked_fd->list, &priv->saved_fds); - -out: - return ret; -} - -int -afr_save_locked_fd (xlator_t *this, fd_t *fd) -{ - afr_private_t *priv = NULL; - int ret = 0; - - priv = this->private; - - pthread_mutex_lock (&priv->mutex); - { - if (__is_fd_saved (this, fd)) { - gf_log (this->name, GF_LOG_DEBUG, - "fd=%p already saved", fd); - goto unlock; - } - - ret = __afr_save_locked_fd (this, fd); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "fd=%p could not be saved", fd); - goto unlock; - } - } -unlock: - pthread_mutex_unlock (&priv->mutex); - - return ret; -} - -static int -afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - - local = frame->local; - - locked_fd = local->locked_fd; - - STACK_DESTROY (frame->root); - afr_local_cleanup (local, this); - - afr_save_locked_fd (this, locked_fd->fd); - - return 0; - -} - -static int -afr_get_source_lock_recovery (xlator_t *this, fd_t *fd) -{ - afr_fd_ctx_t *fdctx = NULL; - afr_private_t *priv = NULL; - uint64_t tmp = 0; - int i = 0; - int source_child = -1; - int ret = 0; - - priv = this->private; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - for (i = 0; i < priv->child_count; i++) { - if (fdctx->locked_on[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "Found lock recovery 
source=%d", i); - source_child = i; - break; - } - } - -out: - return source_child; - -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata); -int32_t -afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - local = frame->local; - priv = this->private; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "lock recovery failed"); - goto cleanup; - } - - source_child = local->source_child; - - memcpy (&flock, lock, sizeof (*lock)); - - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock, NULL); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -int -afr_recover_lock (call_frame_t *frame, xlator_t *this, - struct gf_flock *flock) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t lock_recovery_child = 0; - - priv = this->private; - local = frame->local; - - lock_recovery_child = local->lock_recovery_child; - - frame->root->lk_owner = flock->l_owner; - - STACK_WIND_COOKIE (frame, afr_recover_lock_cbk, - (void *) (long) lock_recovery_child, - priv->children[lock_recovery_child], - priv->children[lock_recovery_child]->fops->lk, - local->fd, F_SETLK, flock, NULL); - - return 0; -} - -static int -is_afr_lock_eol (struct gf_flock *lock) -{ - int ret = 0; - - if ((lock->l_type == GF_LK_EOL)) - ret = 1; - - return ret; -} - -int32_t -afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) -{ - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Failed to get locks on fd"); 
- goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Got a lock on fd"); - - if (is_afr_lock_eol (lock)) { - gf_log (this->name, GF_LOG_INFO, - "Reached EOL on locks on fd"); - goto cleanup; - } - - afr_recover_lock (frame, this, lock); - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - - return 0; -} - -static int -afr_lock_recovery (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - int ret = 0; - int32_t source_child = 0; - struct gf_flock flock = {0,}; - - priv = this->private; - local = frame->local; - - fd = local->fd; - - source_child = afr_get_source_lock_recovery (this, fd); - if (source_child < 0) { - gf_log (this->name, GF_LOG_ERROR, - "Could not recover locks due to lock " - "split brain"); - ret = -1; - goto out; - } - - local->source_child = source_child; - - /* the flock can be zero filled as we're querying incrementally - the locks held on the fd. - */ - STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk, - (void *) (long) source_child, - priv->children[source_child], - priv->children[source_child]->fops->lk, - local->fd, F_GETLK_FD, &flock, NULL); - -out: - return ret; -} - - -static int -afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, this, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - fdctx->opened_on[child_index] = AFR_FD_OPENED; - -out: - return ret; -} - -int32_t -afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - dict_t *xdata) -{ - int32_t child_index = (long )cookie; - int ret = 0; - - if (op_ret) { - gf_log (this->name, GF_LOG_INFO, - "Reopen during lock-recovery failed"); - goto cleanup; - } - - gf_log (this->name, GF_LOG_DEBUG, - "Open succeeded => proceed to recover locks"); - - ret = afr_lock_recovery (frame, this); - if (ret) 
{ - gf_log (this->name, GF_LOG_INFO, - "Lock recovery failed"); - goto cleanup; - } - - ret = afr_mark_fd_opened (this, fd, child_index); - if (ret) { - gf_log (this->name, GF_LOG_INFO, - "Marking fd open failed"); - goto cleanup; - } - - return 0; - -cleanup: - afr_lock_recovery_cleanup (frame, this); - return 0; -} - -static int -afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - uint64_t tmp = 0; - afr_fd_ctx_t *fdctx = NULL; - loc_t loc = {0,}; - int32_t child_index = 0; - int ret = 0; - - priv = this->private; - local = frame->local; - - GF_ASSERT (local && local->fd); - - ret = fd_ctx_get (local->fd, this, &tmp); - if (ret) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get the context of fd", - uuid_utoa (local->fd->inode->gfid)); - fdctx = (afr_fd_ctx_t *) (long) tmp; - /* TODO: instead we should return from the function */ - GF_ASSERT (fdctx); - - child_index = local->lock_recovery_child; - - inode_path (local->fd->inode, NULL, (char **)&loc.path); - loc.name = strrchr (loc.path, '/'); - loc.inode = inode_ref (local->fd->inode); - loc.parent = inode_parent (local->fd->inode, 0, NULL); - - - STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk, - (void *)(long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->open, - &loc, fdctx->flags, local->fd, NULL); - - return 0; -} - -static int -is_fd_opened (fd_t *fd, int32_t child_index) -{ - afr_fd_ctx_t *fdctx = NULL; - uint64_t tmp = 0; - int ret = 0; - - ret = fd_ctx_get (fd, THIS, &tmp); - if (ret) - goto out; - - fdctx = (afr_fd_ctx_t *) (long) tmp; - - if (fdctx->opened_on[child_index] == AFR_FD_OPENED) - ret = 1; - -out: - return ret; -} - -int -afr_attempt_lock_recovery (xlator_t *this, int32_t child_index) -{ - call_frame_t *frame = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_locked_fd_t *locked_fd = NULL; - afr_locked_fd_t *tmp = NULL; - int ret = -1; - struct 
list_head locks_list = {0,}; - int32_t op_errno = 0; - - - priv = this->private; - - if (list_empty (&priv->saved_fds)) - goto out; - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - ret = -1; - goto out; - } - - frame->local = local; - - INIT_LIST_HEAD (&locks_list); - - pthread_mutex_lock (&priv->mutex); - { - list_splice_init (&priv->saved_fds, &locks_list); - } - pthread_mutex_unlock (&priv->mutex); - - list_for_each_entry_safe (locked_fd, tmp, - &locks_list, list) { - - list_del_init (&locked_fd->list); - - local->fd = fd_ref (locked_fd->fd); - local->lock_recovery_child = child_index; - local->locked_fd = locked_fd; - - if (!is_fd_opened (locked_fd->fd, child_index)) { - gf_log (this->name, GF_LOG_DEBUG, - "attempting open before lock " - "recovery"); - afr_lock_recovery_preopen (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "attempting lock recovery " - "without a preopen"); - afr_lock_recovery (frame, this); - } - } - -out: - if ((ret < 0) && frame) - AFR_STACK_DESTROY (frame); - return ret; -} - int afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 73594f265..05df90cc0 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,10 +41,8 @@ enum gf_afr_mem_types_ { gf_afr_mt_shd_event_t, gf_afr_mt_time_t, gf_afr_mt_pos_data_t, - gf_afr_mt_reply_t, - gf_afr_mt_stats_t, - gf_afr_mt_shd_crawl_event_t, - gf_afr_mt_uint64_t, + gf_afr_mt_reply_t, + gf_afr_mt_subvol_healer_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 643a5d692..f86aa7fd8 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ 
b/xlators/cluster/afr/src/afr-open.c @@ -43,85 +43,29 @@ #include "afr-dir-read.h" #include "afr-dir-write.h" #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -int -afr_stale_child_up (afr_local_t *local, xlator_t *this) -{ - int i = 0; - afr_private_t *priv = NULL; - int up = -1; - - priv = this->private; - - if (!local->fresh_children) - local->fresh_children = afr_children_create (priv->child_count); - if (!local->fresh_children) - goto out; - - afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children); - if (priv->child_count == afr_get_children_count (local->fresh_children, - priv->child_count)) - goto out; - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - if (afr_is_child_present (local->fresh_children, - priv->child_count, i)) - continue; - up = i; - break; - } -out: - return up; -} - -void -afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - inode_t *inode = NULL; - int st_child = -1; - char reason[64] = {0}; - - local = frame->local; - sh = &local->self_heal; - inode = local->fd->inode; - - if (!IA_ISREG (inode->ia_type)) - goto out; - - st_child = afr_stale_child_up (local, this); - if (st_child < 0) - goto out; - - sh->do_data_self_heal = _gf_true; - sh->do_metadata_self_heal = _gf_true; - sh->do_gfid_self_heal = _gf_true; - sh->do_missing_entry_self_heal = _gf_true; - - snprintf (reason, sizeof (reason), "stale subvolume %d detected", - st_child); - afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, - reason, NULL, NULL); -out: - return; + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; } + int afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t 
op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = frame->local; - afr_private_t *priv = NULL; - priv = this->private; - if (afr_open_only_data_self_heal (priv->data_self_heal)) - afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, local->fd, xdata); return 0; @@ -134,49 +78,38 @@ afr_open_cbk (call_frame_t *frame, void *cookie, fd_t *fd, dict_t *xdata) { afr_local_t * local = NULL; - int ret = 0; int call_count = -1; int child_index = (long) cookie; - afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; local = frame->local; + fd_ctx = local->fd_ctx; LOCK (&frame->lock); { if (op_ret == -1) { local->op_errno = op_errno; - } - - if (op_ret >= 0) { + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { local->op_ret = op_ret; - local->success_count++; - - ret = afr_child_fd_ctx_set (this, fd, child_index, - local->cont.open.flags); - if (ret) { - local->op_ret = -1; - local->op_errno = -ret; - goto unlock; - } + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref (xdata); } } -unlock: UNLOCK (&frame->lock); call_count = afr_frame_return (frame); if (call_count == 0) { - if ((local->cont.open.flags & O_TRUNC) - && (local->op_ret >= 0)) { + if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) { STACK_WIND (frame, afr_open_ftruncate_cbk, this, this->fops->ftruncate, fd, 0, NULL); } else { - if (afr_open_only_data_self_heal (priv->data_self_heal)) - afr_perform_data_self_heal (frame, this); AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->fd, xdata); + local->op_errno, local->fd, + local->xdata_rsp); } } @@ -190,16 +123,11 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, afr_private_t * priv = NULL; afr_local_t * local = NULL; int i = 0; - int ret = -1; int32_t call_count = 0; int32_t op_errno = 0; - int32_t wind_flags = flags 
& (~O_TRUNC); - //We can't let truncation to happen outside transaction. + afr_fd_ctx_t *fd_ctx = NULL; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (loc, out); + //We can't let truncation to happen outside transaction. priv = this->private; @@ -207,44 +135,38 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, QUORUM_CHECK(open,out); } - if (afr_is_split_brain (this, loc->inode)) { - /* self-heal failed */ - gf_log (this->name, GF_LOG_WARNING, - "failed to open as split brain seen, returning EIO"); - op_errno = EIO; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = ENOMEM; + goto out; + } - call_count = local->call_count; - loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); + local->fd_ctx = fd_ctx; + fd_ctx->flags = flags; - local->cont.open.flags = flags; + call_count = local->call_count; - local->fd = fd_ref (fd); + local->cont.open.flags = flags; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->open, - loc, wind_flags, fd, xdata); - + loc, (flags & ~O_TRUNC), fd, xdata); if (!--call_count) break; } } - ret = 0; + return 0; out: - if (ret < 0) - AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata); + AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL); return 0; } @@ -273,12 +195,7 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index]->name); } - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context, %p", local->fd); - goto out; - } + fd_ctx = 
local->fd_ctx; LOCK (&local->fd->lock); { @@ -289,7 +206,7 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } } UNLOCK (&local->fd->lock); -out: + call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_DESTROY (frame); @@ -297,8 +214,42 @@ out: return 0; } + +static int +afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open) +{ + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int count = 0; + + priv = this->private; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return 0; + + LOCK (&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && + priv->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + need_open[i] = 1; + count++; + } else { + need_open[i] = 0; + } + } + } + UNLOCK (&fd->lock); + + return count; +} + + void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) +afr_fix_open (fd_t *fd, xlator_t *this) { afr_private_t *priv = NULL; int i = 0; @@ -307,29 +258,31 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) int ret = -1; int32_t op_errno = 0; afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *need_open = NULL; + int call_count = 0; priv = this->private; - if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) + if (!afr_is_fd_fixable (fd)) goto out; fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - ret = -1; + if (!fd_ctx) goto out; - } + + need_open = alloca0 (priv->child_count); + + call_count = afr_fd_ctx_need_open (fd, this, need_open); + if (!call_count) + goto out; frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; + if (!frame) goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->loc.inode = inode_ref (fd->inode); ret 
= loc_path (&local->loc, NULL); @@ -337,10 +290,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) goto out; local->fd = fd_ref (fd); - local->call_count = need_open_count; + local->fd_ctx = fd_ctx; + + local->call_count = call_count; - gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", - need_open_count); + gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + call_count); for (i = 0; i < priv->child_count; i++) { if (!need_open[i]) @@ -371,12 +326,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) local->fd, NULL); } + if (!--call_count) + break; } - op_errno = 0; - ret = 0; + + return; out: - if (op_errno) - ret = -1; //For handling ALLOC_OR_GOTO - if (ret && frame) + if (frame) AFR_STACK_DESTROY (frame); } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c new file mode 100644 index 000000000..186f68c33 --- /dev/null +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -0,0 +1,239 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#include "afr.h" +#include "afr-transaction.h" + +int +afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int subvol = -1; + + local = frame->local; + priv = this->private; + + + for (i = 0; i < priv->child_count; i++) { + if (!local->readable[i]) { + /* don't even bother trying here. + just mark as attempted and move on. 
*/ + local->read_attempted[i] = 1; + continue; + } + + if (!local->read_attempted[i]) { + subvol = i; + break; + } + } + + /* If no more subvols were available for reading, we leave + @subvol as -1, which is an indication we have run out of + readable subvols. */ + if (subvol != -1) + local->read_attempted[subvol] = 1; + local->readfn (frame, this, subvol); + + return 0; +} + + +int +afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) +{ + afr_local_t *local = NULL; + int read_subvol = 0; + int event_generation = 0; + inode_t *inode = NULL; + int ret = -1; + + local = frame->local; + inode = local->inode; + + if (err) { + local->op_errno = -err; + local->op_ret = -1; + read_subvol = -1; + goto readfn; + } + + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, + local->transaction.type); + + if (ret == -1 || !event_generation) { + /* Even after refresh, we don't have a good + read subvolume. Time to bail */ + local->op_ret = -1; + local->op_errno = EIO; + read_subvol = -1; + goto readfn; + } + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol == -1) { + local->op_ret = -1; + local->op_errno = EIO; + goto readfn; + } + + if (local->read_attempted[read_subvol]) { + afr_read_txn_next_subvol (frame, this); + return 0; + } + + local->read_attempted[read_subvol] = 1; +readfn: + local->readfn (frame, this, read_subvol); + + return 0; +} + + +int +afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (!local->refreshed) { + local->refreshed = _gf_true; + afr_inode_refresh (frame, this, local->inode, + afr_read_txn_refresh_done); + } else { + afr_read_txn_next_subvol (frame, this); + } + + return 0; +} + + +/* afr_read_txn_wipe: + + clean internal variables in @local in order to make + it possible to call afr_read_txn() multiple times from + the same frame +*/ + +void 
+afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + local->readfn = NULL; + + if (local->inode) + inode_unref (local->inode); + + for (i = 0; i < priv->child_count; i++) { + local->read_attempted[i] = 0; + local->readable[i] = 0; + } +} + + +/* + afr_read_txn: + + This is the read transaction function. The way it works: + + - Determine read-subvolume from inode ctx. + + - If read-subvolume's generation was stale, refresh ctx once by + calling afr_inode_refresh() + + Else make an attempt to read on read-subvolume. + + - If attempted read on read-subvolume fails, refresh ctx once + by calling afr_inode_refresh() + + - After ctx refresh, query read-subvolume freshly and attempt + read once. + + - If read fails, try every other readable[] subvolume before + finally giving up. readable[] elements are set by afr_inode_refresh() + based on dirty and pending flags. + + - If file is in split brain in the backend, generation will be + kept 0 by afr_inode_refresh() and readable[] will be set 0 for + all elements. Therefore reads always fail. 
+*/ + +int +afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int read_subvol = -1; + int event_generation = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + + afr_read_txn_wipe (frame, this); + + local->readfn = readfn; + local->inode = inode_ref (inode); + + local->transaction.type = type; + ret = afr_inode_read_subvol_type_get (inode, this, local->readable, + &event_generation, type); + if (ret == -1) + /* very first transaction on this inode */ + goto refresh; + + if (local->event_generation != event_generation) + /* servers have disconnected / reconnected, and possibly + rebooted, very likely changing the state of freshness + of copies */ + goto refresh; + + read_subvol = afr_read_subvol_select_by_policy (inode, this, + local->readable); + + if (read_subvol < 0 || read_subvol > priv->child_count) { + gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d " + "found with event generation %d", read_subvol, + event_generation); + goto refresh; + } + + if (!local->child_up[read_subvol]) { + /* should never happen, just in case */ + gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the " + "read subvolume in this generation, but is not up", + read_subvol); + goto refresh; + } + + local->read_attempted[read_subvol] = 1; + + local->readfn (frame, this, read_subvol); + + return 0; + +refresh: + afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done); + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c deleted file mode 100644 index 83846f152..000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ /dev/null @@ -1,837 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. - This file is part of GlusterFS. 
- - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - - -#include -#include "glusterfs.h" -#include "afr.h" -#include "xlator.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -/* - This file contains the various self-heal algorithms -*/ - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, - gf_boolean_t is_first_call, call_frame_t *old_loop_frame); -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, - int32_t op_ret, int32_t op_errno); -static int -sh_destroy_frame (call_frame_t *frame, xlator_t *this) -{ - if (!frame) - goto out; - - AFR_STACK_DESTROY (frame); -out: - return 0; -} - -static void -sh_private_cleanup (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh_priv = sh->private; - GF_FREE (sh_priv); -} - -static int -sh_number_of_writes_needed (unsigned char *write_needed, int child_count) -{ - int writes = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (write_needed[i]) - writes++; - } - - return writes; -} - - -static int -sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, - call_frame_t *last_loop_frame) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - int32_t total_blocks = 0; - int32_t diff_blocks = 0; - - 
local = sh_frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - if (sh_priv) { - total_blocks = sh_priv->total_blocks; - diff_blocks = sh_priv->diff_blocks; - } - - sh_private_cleanup (sh_frame, this); - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - GF_ASSERT (!last_loop_frame); - //loop_finish should have happened and the old_loop should be NULL - gf_log (this->name, GF_LOG_DEBUG, - "self-heal aborting on %s", - local->loc.path); - - local->self_heal.algo_abort_cbk (sh_frame, this); - } else { - GF_ASSERT (last_loop_frame); - if (diff_blocks == total_blocks) { - gf_log (this->name, GF_LOG_DEBUG, "full self-heal " - "completed on %s",local->loc.path); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "diff self-heal on %s: completed. " - "(%d blocks of %d were different (%.2f%%))", - local->loc.path, diff_blocks, total_blocks, - ((diff_blocks * 1.0)/total_blocks) * 100); - } - - sh->old_loop_frame = last_loop_frame; - local->self_heal.algo_completion_cbk (sh_frame, this); - } - - return 0; -} - -int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) -{ - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - if (!loop_frame) - goto out; - - loop_local = loop_frame->local; - if (loop_local) { - loop_sh = &loop_local->self_heal; - } - - if (loop_sh && loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, this->name, - sh_destroy_frame); - } else { - sh_destroy_frame (loop_frame, this); - } -out: - return 0; -} - -static int -sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this) -{ - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_loop_finish (loop_sh->old_loop_frame, this); - loop_sh->old_loop_frame = NULL; - - gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64 - " %"PRIu64, loop_sh->offset, loop_sh->block_size); - loop_sh->data_lock_held = _gf_true; - loop_sh->sh_data_algo_start 
(loop_frame, this); - return 0; -} - -static int -sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this) -{ - call_frame_t *sh_frame = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - sh_frame = loop_sh->sh_frame; - - gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64 - " %"PRIu64, loop_sh->offset, loop_sh->block_size); - sh_loop_finish (loop_sh->old_loop_frame, this); - loop_sh->old_loop_frame = NULL; - sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN); - return 0; -} - -static int -sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, - call_frame_t *old_loop_frame, call_frame_t **loop_frame) -{ - call_frame_t *new_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *new_loop_local = NULL; - afr_self_heal_t *new_loop_sh = NULL; - afr_private_t *priv = NULL; - - GF_ASSERT (sh_frame); - GF_ASSERT (loop_frame); - - *loop_frame = NULL; - local = sh_frame->local; - sh = &local->self_heal; - priv = this->private; - - new_loop_frame = copy_frame (sh_frame); - if (!new_loop_frame) - goto out; - //We want the frame to have same lk_owner as sh_frame - //so that locks translator allows conflicting locks - new_loop_local = afr_self_heal_local_init (local, this); - if (!new_loop_local) - goto out; - new_loop_frame->local = new_loop_local; - - new_loop_sh = &new_loop_local->self_heal; - new_loop_sh->sources = memdup (sh->sources, - priv->child_count * sizeof (*sh->sources)); - if (!new_loop_sh->sources) - goto out; - new_loop_sh->write_needed = GF_CALLOC (priv->child_count, - sizeof (*new_loop_sh->write_needed), - gf_afr_mt_char); - if (!new_loop_sh->write_needed) - goto out; - new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH, - gf_afr_mt_uint8_t); - if (!new_loop_sh->checksum) - goto out; - new_loop_sh->inode = inode_ref (sh->inode); - new_loop_sh->sh_data_algo_start = 
sh->sh_data_algo_start; - new_loop_sh->source = sh->source; - new_loop_sh->active_sinks = sh->active_sinks; - new_loop_sh->healing_fd = fd_ref (sh->healing_fd); - new_loop_sh->file_has_holes = sh->file_has_holes; - new_loop_sh->old_loop_frame = old_loop_frame; - new_loop_sh->sh_frame = sh_frame; - *loop_frame = new_loop_frame; - return 0; -out: - sh_destroy_frame (new_loop_frame, this); - return -ENOMEM; -} - -static int -sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, - call_frame_t *old_loop_frame) -{ - call_frame_t *new_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *new_loop_local = NULL; - afr_self_heal_t *new_loop_sh = NULL; - int ret = 0; - - GF_ASSERT (sh_frame); - - local = sh_frame->local; - sh = &local->self_heal; - - ret = sh_loop_frame_create (sh_frame, this, old_loop_frame, - &new_loop_frame); - if (ret) - goto out; - new_loop_local = new_loop_frame->local; - new_loop_sh = &new_loop_local->self_heal; - new_loop_sh->offset = offset; - new_loop_sh->block_size = sh->block_size; - afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); - return 0; -out: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - if (old_loop_frame) - sh_loop_finish (old_loop_frame, this); - sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); - return 0; -} - -static int -sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, - gf_boolean_t is_first_call, call_frame_t *old_loop_frame) -{ - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_sh_algo_private_t *sh_priv = NULL; - gf_boolean_t is_driver_done = _gf_false; - blksize_t block_size = 0; - int loop = 0; - off_t offset = 0; - afr_private_t *priv = NULL; - - priv = this->private; - local = sh_frame->local; - sh = &local->self_heal; - sh_priv = sh->private; - - LOCK (&sh_priv->lock); - { - if (!is_first_call) - sh_priv->loops_running--; - offset = 
sh_priv->offset; - block_size = sh->block_size; - while ((!sh->eof_reached) && - (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && - (sh_priv->loops_running < priv->data_self_heal_window_size) - && (sh_priv->offset < sh->file_size)) { - - loop++; - sh_priv->offset += block_size; - sh_priv->loops_running++; - - if (!is_first_call) - break; - } - if (0 == sh_priv->loops_running) { - is_driver_done = _gf_true; - } - } - UNLOCK (&sh_priv->lock); - - if (0 == loop) { - //loop finish does unlock, but the erasing of the pending - //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && - !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - goto driver_done; - if (old_loop_frame) { - sh_loop_finish (old_loop_frame, this); - old_loop_frame = NULL; - } - } - - //If we have more loops to form we should finish previous loop after - //the next loop lock - while (loop--) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - // op failed in other loop, stop spawning more loops - if (old_loop_frame) { - sh_loop_finish (old_loop_frame, this); - old_loop_frame = NULL; - } - sh_loop_driver (sh_frame, this, _gf_false, NULL); - } else { - gf_log (this->name, GF_LOG_TRACE, "spawning a loop " - "for offset %"PRId64, offset); - - sh_loop_start (sh_frame, this, offset, old_loop_frame); - old_loop_frame = NULL; - offset += block_size; - } - } - -driver_done: - if (is_driver_done) { - sh_loop_driver_done (sh_frame, this, old_loop_frame); - } - return 0; -} - -static int -sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - if (loop_frame) { - loop_local = loop_frame->local; - if (loop_local) - loop_sh = &loop_local->self_heal; - if (loop_sh) - gf_log (this->name, GF_LOG_TRACE, "loop for offset " - 
"%"PRId64" returned", loop_sh->offset); - } - - if (op_ret == -1) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - if (loop_frame) { - sh_loop_finish (loop_frame, this); - loop_frame = NULL; - } - } - - sh_loop_driver (sh_frame, this, _gf_false, loop_frame); - - return 0; -} - -static int -sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t * sh_local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - child_index = (long) cookie; - - gf_log (this->name, GF_LOG_TRACE, - "wrote %d bytes of data from %s to child %d, offset %"PRId64"", - op_ret, sh_local->loc.path, child_index, loop_sh->offset); - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "write to %s failed on subvolume %s (%s)", - sh_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (loop_sh, op_errno); - } else if (op_ret < loop_local->cont.writev.vector->iov_len) { - gf_log (this->name, GF_LOG_ERROR, - "incomplete write to %s on subvolume %s " - "(expected %lu, returned %d)", sh_local->loc.path, - priv->children[child_index]->name, - loop_local->cont.writev.vector->iov_len, op_ret); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - - call_count = afr_frame_return (loop_frame); - - if (call_count == 0) { - iobref_unref(loop_local->cont.writev.iobref); - - sh_loop_return (sh_frame, this, loop_frame, - loop_sh->op_ret, loop_sh->op_errno); - } - - return 0; -} 
- -static void -sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame, - afr_private_t *priv) -{ - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int i = 0; - - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - if (!strcmp (sh->algo->name, "diff")) - return; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - /* full self-heal guarantees there exists atleast 1 file with size 0 - * That means for other files we can preserve holes that come after - * its size before 'trim' - */ - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->write_needed[i] && - ((loop_sh->offset + 1) > sh->buf[i].ia_size)) - loop_sh->write_needed[i] = 0; - } -} - -static int -sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref, dict_t *xdata) -{ - afr_private_t * priv = NULL; - afr_local_t * loop_local = NULL; - afr_self_heal_t * loop_sh = NULL; - call_frame_t *sh_frame = NULL; - int i = 0; - int call_count = 0; - afr_local_t * sh_local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "read %d bytes of data from %s, offset %"PRId64"", - op_ret, loop_local->loc.path, loop_sh->offset); - - if (op_ret <= 0) { - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - gf_log (this->name, GF_LOG_ERROR, "read failed on %d " - "for %s reason :%s", sh->source, - sh_local->loc.path, strerror (errno)); - } else { - sh->eof_reached = _gf_true; - gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s", - sh_local->loc.path); - } - sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno); - 
goto out; - } - - if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0) - sh_prune_writes_needed (sh_frame, loop_frame, priv); - - call_count = sh_number_of_writes_needed (loop_sh->write_needed, - priv->child_count); - if (call_count == 0) { - sh_loop_return (sh_frame, this, loop_frame, 0, 0); - goto out; - } - - loop_local->call_count = call_count; - - /* - * We only really need the request size at the moment, but the buffer - * is required if we want to issue a retry in the event of a short write. - * Therefore, we duplicate the vector and ref the iobref here... - */ - loop_local->cont.writev.vector = iov_dup(vector, count); - loop_local->cont.writev.iobref = iobref_ref(iobref); - - for (i = 0; i < priv->child_count; i++) { - if (!loop_sh->write_needed[i]) - continue; - STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->writev, - loop_sh->healing_fd, vector, count, - loop_sh->offset, 0, iobref, NULL); - - if (!--call_count) - break; - } - -out: - return 0; -} - - -static int -sh_loop_read (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk, - (void *) (long) loop_sh->source, - priv->children[loop_sh->source], - priv->children[loop_sh->source]->fops->readv, - loop_sh->healing_fd, loop_sh->block_size, - loop_sh->offset, 0, NULL); - - return 0; -} - - -static int -sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - uint32_t weak_checksum, uint8_t *strong_checksum, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - afr_self_heal_t *sh = NULL; - 
afr_sh_algo_private_t *sh_priv = NULL; - int child_index = 0; - int call_count = 0; - int i = 0; - int write_needed = 0; - - priv = this->private; - - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - sh_frame = loop_sh->sh_frame; - sh_local = sh_frame->local; - sh = &sh_local->self_heal; - - sh_priv = sh->private; - - child_index = (long) cookie; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "checksum on %s failed on subvolume %s (%s)", - sh_local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, - strong_checksum, MD5_DIGEST_LENGTH); - } - - call_count = afr_frame_return (loop_frame); - - if (call_count == 0) { - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] || !sh_local->child_up[i]) - continue; - - if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH), - loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH), - MD5_DIGEST_LENGTH)) { - /* - Checksums differ, so this block - must be written to this sink - */ - - gf_log (this->name, GF_LOG_DEBUG, - "checksum on subvolume %s at offset %" - PRId64" differs from that on source", - priv->children[i]->name, loop_sh->offset); - - write_needed = loop_sh->write_needed[i] = 1; - } - } - - LOCK (&sh_priv->lock); - { - sh_priv->total_blocks++; - if (write_needed) - sh_priv->diff_blocks++; - } - UNLOCK (&sh_priv->lock); - - if (write_needed && - !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - sh_loop_read (loop_frame, this); - } else { - sh_loop_return (sh_frame, this, loop_frame, - op_ret, op_errno); - } - } - - return 0; -} - -static int -sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int call_count = 0; - int i = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = 
&loop_local->self_heal; - - call_count = loop_sh->active_sinks + 1; /* sinks and source */ - - loop_local->call_count = call_count; - - STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, - (void *) (long) loop_sh->source, - priv->children[loop_sh->source], - priv->children[loop_sh->source]->fops->rchecksum, - loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size, NULL); - - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->sources[i] || !loop_local->child_up[i]) - continue; - - STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->rchecksum, - loop_sh->healing_fd, - loop_sh->offset, loop_sh->block_size, NULL); - - if (!--call_count) - break; - } - - return 0; -} - -static int -sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *loop_local = NULL; - afr_self_heal_t *loop_sh = NULL; - int i = 0; - - priv = this->private; - loop_local = loop_frame->local; - loop_sh = &loop_local->self_heal; - - for (i = 0; i < priv->child_count; i++) { - if (loop_sh->sources[i] || !loop_local->child_up[i]) - continue; - loop_sh->write_needed[i] = 1; - } - sh_loop_read (loop_frame, this); - return 0; -} - -afr_sh_algo_private_t* -afr_sh_priv_init () -{ - afr_sh_algo_private_t *sh_priv = NULL; - - sh_priv = GF_CALLOC (1, sizeof (*sh_priv), - gf_afr_mt_afr_private_t); - if (!sh_priv) - goto out; - - LOCK_INIT (&sh_priv->lock); -out: - return sh_priv; -} - -int -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, - unsigned int child_count) -{ - afr_local_t *dst_local = NULL; - afr_self_heal_t *dst_sh = NULL; - afr_local_t *src_local = NULL; - afr_self_heal_t *src_sh = NULL; - int ret = -1; - - dst_local = dst->local; - dst_sh = &dst_local->self_heal; - src_local = src->local; - src_sh = &src_local->self_heal; - GF_ASSERT (src_sh->data_lock_held); - GF_ASSERT (!dst_sh->data_lock_held); - ret = afr_lk_transfer_datalock 
(dst, src, dom, child_count); - if (ret) - return ret; - src_sh->data_lock_held = _gf_false; - dst_sh->data_lock_held = _gf_true; - return 0; -} - -int -afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, - afr_sh_algo_fn sh_data_algo_start) -{ - call_frame_t *first_loop_frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = 0; - afr_private_t *priv = NULL; - - local = sh_frame->local; - sh = &local->self_heal; - priv = this->private; - - sh->sh_data_algo_start = sh_data_algo_start; - local->call_count = 0; - ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); - if (ret) - goto out; - ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, - priv->child_count); - if (ret) - goto out; - sh->private = afr_sh_priv_init (); - if (!sh->private) { - ret = -1; - goto out; - } - sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame); - ret = 0; -out: - if (ret) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - sh_loop_driver_done (sh_frame, this, NULL); - } - return 0; -} - -int -afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this) -{ - afr_sh_start_loops (sh_frame, this, sh_diff_checksum); - return 0; -} - -int -afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this) -{ - afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks); - return 0; -} - -struct afr_sh_algorithm afr_self_heal_algorithms[] = { - {.name = "full", .fn = afr_sh_algo_full}, - {.name = "diff", .fn = afr_sh_algo_diff}, - {0, 0}, -}; diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h deleted file mode 100644 index 6b20789b1..000000000 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. - This file is part of GlusterFS. 
- - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef __AFR_SELF_HEAL_ALGORITHM_H__ -#define __AFR_SELF_HEAL_ALGORITHM_H__ - -typedef int (*afr_sh_algo_fn) (call_frame_t *frame, - xlator_t *this); - -struct afr_sh_algorithm { - const char *name; - afr_sh_algo_fn fn; -}; - -extern struct afr_sh_algorithm afr_self_heal_algorithms[3]; -typedef struct { - gf_lock_t lock; - unsigned int loops_running; - off_t offset; - - int32_t total_blocks; - int32_t diff_blocks; -} afr_sh_algo_private_t; - -#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index ef92b4205..4dac83113 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. + Copyright (c) 2013 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,2805 +8,1002 @@ cases as published by the Free Software Foundation. 
*/ -#include "glusterfs.h" -#include "xlator.h" -#include "byte-order.h" + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif #include "afr.h" -#include "afr-transaction.h" -#include "afr-self-heal-common.h" #include "afr-self-heal.h" -#include "pump.h" - -#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ - do { \ - if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ - off += snprintf (msg + off, sizeof (msg) - off, \ - " "sh_str" self heal %s,", \ - get_sh_completion_status (status));\ - print_log = 1; \ - } \ - } while (0) - -#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ - do { \ - if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ - AFR_SELF_HEAL_FAILED == status) { \ - off += snprintf (msg + off, sizeof (msg) - off, \ - " "sh_str" self heal %s,", \ - get_sh_completion_status (status));\ - print_log = 1; \ - } \ - } while (0) +#include "byte-order.h" -void -afr_sh_reset (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - memset (sh->child_errno, 0, - sizeof (*sh->child_errno) * priv->child_count); - memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); - memset (sh->parentbufs, 0, - sizeof (*sh->parentbufs) * priv->child_count); - memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); - memset (sh->locked_nodes, 0, - sizeof (*sh->locked_nodes) * priv->child_count); - sh->active_sinks = 0; - - afr_reset_xattr (sh->xattr, priv->child_count); -} + afr_local_t *local = NULL; -//Intersection[child]=1 if child is part of intersection -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, - int *intersection, unsigned int child_count) -{ - int i = 0; - - memset (intersection, 0, sizeof (*intersection) * 
child_count); - for (i = 0; i < child_count; i++) { - intersection[i] = afr_is_child_present (set1, child_count, i) - && afr_is_child_present (set2, child_count, - i); - } + local = frame->local; + + syncbarrier_wake (&local->barrier); + + return 0; } -/** - * select_source - select a source and return it - */ int -afr_sh_select_source (int sources[], int child_count) +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr) { - int i = 0; - for (i = 0; i < child_count; i++) - if (sources[i]) - return i; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = {0, }; - return -1; -} + priv = this->private; + local = frame->local; -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int active_sinks = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (sh->sources[i] == 0 && local->child_up[i] == 1) { - active_sinks++; - sh->success[i] = 1; - } else if (sh->sources[i] == 1 && local->child_up[i] == 1) { - sh->success[i] = 1; - } - } - sh->active_sinks = active_sinks; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_source_count (int sources[], int child_count) -{ - int i = 0; - int nsource = 0; + STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); - for (i = 0; i < child_count; i++) - if (sources[i]) - nsource++; - return nsource; -} + syncbarrier_wait (&local->barrier, 1); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) -{ - sh->op_ret = -1; - sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, - _gf_false); + return 0; } -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) -{ - afr_private_t * priv = this->private; 
- char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - - /* 10 digits per entry + 1 space + '[' and ']' */ - buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char); - - for (i = 0; i < priv->child_count; i++) { - ptr = buf; - ptr += sprintf (ptr, "[ "); - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); - } - - GF_FREE (buf); -} -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +dict_t * +afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type, + int *output_dirty, int **output_matrix, int subvol) { - afr_private_t * priv = this->private; - char *buf = NULL; - char *ptr = NULL; - int i = 0; - int j = 0; - int child_count = priv->child_count; - char *matrix_begin = "[ [ "; - char *matrix_end = "] ]"; - char *seperator = "] [ "; - int pending_entry_strlen = 12; //Including space after entry - int matrix_begin_strlen = 0; - int matrix_end_strlen = 0; - int seperator_strlen = 0; - int string_length = 0; - char *msg = "- Pending matrix: "; - - /* - * for a list of lists of [ [ a b ] [ c d ] ] - * */ - - matrix_begin_strlen = strlen (matrix_begin); - matrix_end_strlen = strlen (matrix_end); - seperator_strlen = strlen (seperator); - string_length = matrix_begin_strlen + matrix_end_strlen - + (child_count -1) * seperator_strlen - + (child_count * child_count * pending_entry_strlen); - - buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); - if (!buf) - goto out; - - ptr = buf; - ptr += sprintf (ptr, "%s", msg); - ptr += sprintf (ptr, "%s", matrix_begin); - for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); - } - if (i < priv->child_count -1) - ptr += sprintf (ptr, "%s", seperator); - } - - ptr += sprintf (ptr, "%s", matrix_end); + dict_t *xattr = NULL; + afr_private_t *priv 
= NULL; + int j = 0; + int idx = 0; + int ret = 0; + int *raw = 0; -out: - return buf; -} + priv = this->private; + idx = afr_index_for_transaction_type (type); -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc) -{ - char *buf = NULL; - char *free_ptr = NULL; + xattr = dict_new (); + if (!xattr) + return NULL; - buf = afr_get_pending_matrix_str (pending_matrix, this); - if (buf) - free_ptr = buf; - else - buf = ""; + if (output_dirty[subvol]) { + /* clear dirty */ + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + raw[idx] = hton32 (output_dirty[subvol]); + ret = dict_set_bin (xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } - gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" - " (possible split-brain). Please delete the file from all but " - "the preferred subvolume.%s", loc, buf); - GF_FREE (free_ptr); - return; -} + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + if (!output_matrix[subvol][j]) + continue; + raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, + gf_afr_mt_int32_t); + if (!raw) + goto err; -void -afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) -{ - int i = 0; - int j = 0; + raw[idx] = hton32 (output_matrix[subvol][j]); - GF_ASSERT (pending_matrix); + ret = dict_set_bin (xattr, priv->pending_key[j], + raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) + goto err; + } - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - pending_matrix[i][j] = 0; - } - } + return xattr; +err: + if (xattr) + dict_unref (xattr); + return NULL; } -void -afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, - unsigned char *ignorant_subvols, - size_t child_count) -{ - int i = 0; - int j = 0; - - GF_ASSERT (pending_matrix); - GF_ASSERT (ignorant_subvols); - - for (i = 0; i < child_count; i++) { - if (ignorant_subvols[i]) { - for (j = 0; j 
< child_count; j++) { - if (!ignorant_subvols[j]) - pending_matrix[j][i] += 1; - } - } - } -} int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - unsigned char *ignorant_subvols, - dict_t *xattr[], afr_transaction_type type, - size_t child_count) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = -1; - int i = 0; - int j = 0; - int k = 0; - - afr_init_pending_matrix (pending_matrix, child_count); - - for (i = 0; i < child_count; i++) { - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], pending_key[j], - &pending_raw); - - if (ret != 0) { - /* - * There is no xattr present. This means this - * subvolume should be considered an 'ignorant' - * subvolume. - */ - - if (ignorant_subvols) - ignorant_subvols[i] = 1; - continue; - } - - memcpy (pending, pending_raw, sizeof(pending)); - k = afr_index_for_transaction_type (type); - - pending_matrix[i][j] = ntoh32 (pending[k]); - } - } - - return ret; -} +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + + priv = this->private; + + pending = alloca0 (priv->child_count); + + input_dirty = alloca0 (priv->child_count * sizeof (int)); + input_matrix = ALLOC_MATRIX (priv->child_count, int); + output_dirty = alloca0 (priv->child_count * sizeof (int)); + output_matrix = ALLOC_MATRIX (priv->child_count, int); + + afr_selfheal_extract_xattr (this, replies, type, input_dirty, + input_matrix); + + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && 
!healed_sinks[i]) + pending[i] = 1; + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) + output_matrix[i][j] = 1; + else + output_matrix[i][j] = -input_matrix[i][j]; + } + } -typedef enum { - AFR_NODE_INVALID, - AFR_NODE_INNOCENT, - AFR_NODE_FOOL, - AFR_NODE_WISE, -} afr_node_type; + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } -typedef struct { - afr_node_type type; - int wisdom; -} afr_node_character; + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + xattr = afr_selfheal_output_xattr (this, type, output_dirty, + output_matrix, i); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "unable to allocate xdata for subvol %d", i); + continue; + } -static int -afr_sh_is_innocent (int32_t *array, int child_count) -{ - int i = 0; - int ret = 1; /* innocent until proven guilty */ + afr_selfheal_post_op (frame, this, inode, i, xattr); - for (i = 0; i < child_count; i++) { - if (array[i]) { - ret = 0; - break; - } - } + dict_unref (xattr); + } - return ret; + return 0; } -static int -afr_sh_is_fool (int32_t *array, int i, int child_count) -{ - return array[i]; /* fool if accuses itself */ +void +afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) +{ + int i = 0; + dict_t *xdata = NULL; + + if (dst == src) + return; + + for (i = 0; i < count; i++) { + dst[i].valid = src[i].valid; + dst[i].op_ret = src[i].op_ret; + dst[i].op_errno = src[i].op_errno; + dst[i].prestat = src[i].prestat; + dst[i].poststat = src[i].poststat; + dst[i].preparent = src[i].preparent; + dst[i].postparent = src[i].postparent; + dst[i].preparent2 = src[i].preparent2; + dst[i].postparent2 = src[i].postparent2; + if (src[i].xdata) + xdata = dict_ref (src[i].xdata); + else + xdata = NULL; + if (dst[i].xdata) + dict_unref (dst[i].xdata); + dst[i].xdata = xdata; + 
memcpy (dst[i].checksum, src[i].checksum, + MD5_DIGEST_LENGTH); + } } -static int -afr_sh_is_wise (int32_t *array, int i, int child_count) +int +afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, + int idx, dict_t *xdata) { - return !array[i]; /* wise if does not accuse itself */ -} + void *pending_raw = NULL; + int pending[3] = {0, }; + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) + return -1; -static int -afr_sh_all_nodes_innocent (afr_node_character *characters, - int child_count) -{ - int i = 0; - int ret = 1; + if (!pending_raw) + return -1; + + memcpy (pending, pending_raw, sizeof(pending)); - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_INNOCENT) { - ret = 0; - break; - } - } + dirty[subvol] = ntoh32 (pending[idx]); - return ret; + return 0; } -static int -afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +int +afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, + int idx, dict_t *xdata) { - int i = 0; - int ret = 0; + int i = 0; + void *pending_raw = NULL; + int pending[3] = {0, }; + afr_private_t *priv = NULL; - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - ret = 1; - break; - } - } + priv = this->private; - return ret; -} + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) + continue; + if (!pending_raw) + continue; -/* - * The 'wisdom' of a wise node is 0 if any other wise node accuses it. - * It is 1 if no other wise node accuses it. - * Only wise nodes with wisdom 1 are sources. - * - * If no nodes with wisdom 1 exist, a split-brain has occurred. 
- */ + memcpy (pending, pending_raw, sizeof(pending)); -static void -afr_sh_compute_wisdom (int32_t *pending_matrix[], - afr_node_character characters[], int child_count) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - if (characters[i].type == AFR_NODE_WISE) { - characters[i].wisdom = 1; - - for (j = 0; j < child_count; j++) { - if ((characters[j].type == AFR_NODE_WISE) - && pending_matrix[j][i]) { - - characters[i].wisdom = 0; - } - } - } - } + matrix[subvol][i] = ntoh32 (pending[idx]); + } + + return 0; } -static int -afr_sh_wise_nodes_conflict (afr_node_character *characters, - int child_count) +int +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - int i = 0; - int ret = 1; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; + + idx = afr_index_for_transaction_type (type); - for (i = 0; i < child_count; i++) { - if ((characters[i].type == AFR_NODE_WISE) - && characters[i].wisdom == 1) { + priv = this->private; - /* There is atleast one bona-fide wise node */ - ret = 0; - break; - } - } + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].xdata) + continue; + + xdata = replies[i].xdata; - return ret; + afr_selfheal_fill_dirty (this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); + } + + return 0; } -static int -afr_sh_mark_wisest_as_sources (int sources[], - afr_node_character *characters, - int child_count) -{ - int nsources = 0; - int i = 0; - for (i = 0; i < child_count; i++) { - if (characters[i].wisdom == 1) { - sources[i] = 1; - nsources++; - } - } +/* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. + * + * locked_on[] is the array representing servers which have been locked and + * from which xattrs have been fetched for analysis. + * + * The output of the function is by filling the arrays sources[] and sinks[]. 
+ * + * sources[i] is set if i'th server is an eligible source for a selfheal. + * + * sinks[i] is set if i'th server needs to be healed. + * + * if sources[0..N] are all set, there is no need for a selfheal. + * + * if sinks[0..N] are all set, the inode is in split brain. + * + */ - return nsources; -} +int +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + int *dirty = NULL; + int **matrix = NULL; + char *accused = NULL; + + priv = this->private; + + dirty = alloca0 (priv->child_count * sizeof (int)); + accused = alloca0 (priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); + + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); + + /* Next short list all accused to exclude them from being sources */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + accused[j] = 1; + } + } -static void -afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix, - afr_node_character *characters, - int32_t child_count) -{ - int i = 0; - int j = 0; - int witness = 0; - - GF_ASSERT (witnesses); - GF_ASSERT (pending_matrix); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - witness = 0; - for (j = 0; j < child_count; j++) { - if (i == j) - continue; - witness += pending_matrix[i][j]; - } - witnesses[i] = witness; - } -} + /* Short list all non-accused as sources */ + memset (sources, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + } -static int32_t -afr_find_biggest_witness_among_fools (int32_t *witnesses, - afr_node_character 
*characters, - int32_t child_count) -{ - int i = 0; - int biggest_witness = -1; - int biggest_witness_idx = -1; - int biggest_witness_cnt = -1; - - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (biggest_witness < witnesses[i]) { - biggest_witness = witnesses[i]; - biggest_witness_idx = i; - biggest_witness_cnt = 1; + /* Everyone accused by sources are sinks */ + memset (sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; } + } - if (biggest_witness == witnesses[i]) - biggest_witness_cnt++; - } + /* If any source has 'dirty' bit, pick first + 'dirty' source and make everybody else sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && dirty[i]) { + for (j = 0; j < priv->child_count; j++) { + if (j != i) { + sources[j] = 0; + sinks[j] = 1; + } + } + break; + } + } - if (biggest_witness_cnt != 1) - return -1; + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + } - return biggest_witness_idx; + return 0; } + int -afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, - afr_node_character *characters, - int32_t child_count, int32_t witness) +afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) { - int i = 0; - int nsources = 0; - - GF_ASSERT (sources); - GF_ASSERT (witnesses); - GF_ASSERT (characters); - GF_ASSERT (child_count > 0); - - for (i = 0; i < child_count; i++) { - if (characters[i].type != AFR_NODE_FOOL) - continue; - - if (witness == witnesses[i]) { - sources[i] = 1; - nsources++; - } - } - 
return nsources; -} + afr_local_t *local = NULL; + int i = -1; + local = frame->local; + i = (long) cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); + + syncbarrier_wake (&local->barrier); -int -afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) -{ - if (idx >= 0 && idx < child_count) { - sources[idx] = 1; - return 1; - } return 0; } -static int -afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, - int child_count) +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on) { - int idx = -1; - int i = -1; - int child = -1; - uint64_t max_size = 0; - uint64_t min_size = 0; - int num_children = 0; + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; + local = frame->local; + priv = frame->this->private; - child = success_children[i]; - if (bufs[child].ia_size > max_size) { - max_size = bufs[child].ia_size; - idx = child; - } - - if ((num_children == 0) || (bufs[child].ia_size < min_size)) { - min_size = bufs[child].ia_size; - } + xattr_req = dict_new (); + if (!xattr_req) + return NULL; - num_children++; + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return NULL; } - /* If sizes are same for all of them, finding sources will have to - * happen with pending changelog. 
So return -1 - */ - if ((num_children > 1) && (min_size == max_size)) - return -1; - return idx; -} + inode = inode_new (parent->table); + if (!inode) { + dict_destroy (xattr_req); + return NULL; + } + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref (inode); -static int -afr_find_newest_file (struct iatt *bufs, int32_t *success_children, - int child_count) -{ - int idx = -1; - int i = -1; - int child = -1; - uint64_t max_ctime = 0; + AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; + afr_replies_copy (replies, local->replies, priv->child_count); - child = success_children[i]; - if (bufs[child].ia_ctime > max_ctime) { - max_ctime = bufs[child].ia_ctime; - idx = child; - } - } + loc_wipe (&loc); + dict_unref (xattr_req); - return idx; + return inode; } -static int -afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, - afr_node_character *characters, - int32_t *success_children, - int child_count, struct iatt *bufs) -{ - int32_t biggest_witness = 0; - int nsources = 0; - int32_t *witnesses = NULL; - - GF_ASSERT (child_count > 0); - - biggest_witness = afr_find_largest_file_size (bufs, success_children, - child_count); - if (biggest_witness != -1) - goto found; - - witnesses = GF_CALLOC (child_count, sizeof (*witnesses), - gf_afr_mt_int32_t); - if (NULL == witnesses) { - nsources = -1; - goto out; - } - - afr_compute_witness_of_fools (witnesses, pending_matrix, characters, - child_count); - biggest_witness = afr_find_biggest_witness_among_fools (witnesses, - characters, - child_count); - if (biggest_witness != -1) - goto found; - - biggest_witness = afr_find_newest_file (bufs, success_children, - child_count); - -found: - nsources = afr_mark_fool_as_source_by_idx (sources, child_count, - biggest_witness); -out: - GF_FREE (witnesses); - return nsources; -} - 
int -afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count, uint32_t uid) +afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on) { - int i = 0; - int nsources = 0; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (-1 == success_children[i]) - break; - - child = success_children[i]; - if (uid == bufs[child].ia_uid) { - sources[child] = 1; - nsources++; - } - } - return nsources; -} + loc_t loc = {0, }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; -int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, - unsigned int child_count) -{ - int i = 0; - int smallest = -1; - int child = 0; - - for (i = 0; i < child_count; i++) { - if (-1 == success_children[i]) - break; - child = success_children[i]; - if ((smallest == -1) || - (bufs[child].ia_uid < bufs[smallest].ia_uid)) { - smallest = child; - } - } - return smallest; -} + local = frame->local; + priv = frame->this->private; -static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int smallest = 0; - - smallest = afr_get_child_with_lowest_uid (bufs, success_children, - child_count); - if (smallest < 0) { - nsources = -1; - goto out; - } - nsources = afr_mark_child_as_source_by_uid (sources, bufs, - success_children, child_count, - bufs[smallest].ia_uid); -out: - return nsources; -} + xattr_req = dict_new (); + if (!xattr_req) + return -ENOMEM; -int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, - struct iatt *bufs) -{ - afr_private_t *priv = NULL; - int i = 0; - int child = -1; - int read_child = -1; - - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (read_child < 0) - 
read_child = child; - else if (bufs[read_child].ia_size < bufs[child].ia_size) - read_child = child; - } - return read_child; -} + if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { + dict_destroy (xattr_req); + return -ENOMEM; + } -int -afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, - int child_count, int32_t *sources) -{ - int nsources = 0; - int i = 0; - int child = 0; - gf_boolean_t sink_exists = _gf_false; - gf_boolean_t source_exists = _gf_false; - int source = -1; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (!bufs[child].ia_size) { - sink_exists = _gf_true; - continue; - } - if (!source_exists) { - source_exists = _gf_true; - source = child; - continue; - } - if (bufs[source].ia_size != bufs[child].ia_size) { - nsources = -1; - goto out; - } - } - if (!source_exists && !sink_exists) { - nsources = -1; - goto out; - } - - if (!source_exists || !sink_exists) - goto out; - - for (i = 0; i < child_count; i++) { - child = success_children[i]; - if (child < 0) - break; - if (bufs[child].ia_size) { - sources[child] = 1; - nsources++; - } - } -out: - return nsources; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, gfid); -char * -afr_get_character_str (afr_node_type type) -{ - char *character = NULL; - - switch (type) { - case AFR_NODE_INNOCENT: - character = "innocent"; - break; - case AFR_NODE_FOOL: - character = "fool"; - break; - case AFR_NODE_WISE: - character = "wise"; - break; - default: - character = "invalid"; - break; - } - return character; -} + AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); -afr_node_type -afr_find_child_character_type (int32_t *pending_row, int32_t child, - unsigned int child_count) -{ - afr_node_type type = AFR_NODE_INVALID; + afr_replies_copy (replies, local->replies, priv->child_count); - GF_ASSERT ((child >= 0) && (child < child_count)); + loc_wipe (&loc); + dict_unref (xattr_req); - 
if (afr_sh_is_innocent (pending_row, child_count)) - type = AFR_NODE_INNOCENT; - else if (afr_sh_is_fool (pending_row, child, child_count)) - type = AFR_NODE_FOOL; - else if (afr_sh_is_wise (pending_row, child, child_count)) - type = AFR_NODE_WISE; - return type; + return 0; } int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type, - int32_t *subvol_status, gf_boolean_t ignore_ignorant) +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - int nsources = -1; - unsigned char *ignorant_subvols = NULL; - unsigned int child_count = 0; - - priv = this->private; - child_count = priv->child_count; - - if (afr_get_children_count (success_children, priv->child_count) == 0) - goto out; - - if (!ignore_ignorant) { - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), - child_count, gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; - } - - afr_build_pending_matrix (priv->pending_key, pending_matrix, - ignorant_subvols, xattr, type, - priv->child_count); - - if (!ignore_ignorant) - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - priv->child_count); - sh_type = afr_self_heal_type_for_transaction (type); - if (AFR_SELF_HEAL_INVALID == sh_type) - goto out; - - afr_sh_print_pending_matrix (pending_matrix, this); - - nsources = afr_mark_sources (this, sources, pending_matrix, bufs, - sh_type, success_children, subvol_status); -out: - GF_FREE (ignorant_subvols); - return nsources; -} + afr_private_t *priv = NULL; -void -afr_find_character_types (afr_node_character *characters, - int32_t **pending_matrix, int32_t *success_children, - unsigned int child_count) -{ - afr_node_type type = AFR_NODE_INVALID; - int child = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - child 
= success_children[i]; - if (child == -1) - break; - type = afr_find_child_character_type (pending_matrix[child], - child, child_count); - characters[child].type = type; - } -} + priv = frame->this->private; -void -afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, - unsigned int child_count) -{ - int i = 0; - for (i = 0; i < child_count; i++) { - if (success_children[i] == -1) - break; - sources[success_children[i]] = 1; - } + return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, + priv->child_up); } -/** - * mark_sources: Mark all 'source' nodes and return number of source - * nodes found - * - * A node (a row in the pending matrix) belongs to one of - * three categories: - * - * M is the pending matrix. - * - * 'innocent' - M[i] is all zeroes - * 'fool' - M[i] has i'th element = 1 (self-reference) - * 'wise' - M[i] has i'th element = 0, others are 1 or 0. - * - * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is - * needed. - * - * A 'wise' node can be a source. If two 'wise' nodes conflict, it is - * a split-brain. If one wise node refers to the other but the other doesn't - * refer back, the referrer is a source. - * - * All fools are sinks, unless there are no 'wise' nodes. In that case, - * one of the fools is made a source. 
- */ + int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, - struct iatt *bufs, afr_self_heal_type type, - int32_t *success_children, int32_t *subvol_status) +afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int nsources = -1; - unsigned int child_count = 0; - afr_private_t *priv = NULL; - - priv = this->private; - child_count = priv->child_count; - characters = GF_CALLOC (sizeof (afr_node_character), - child_count, gf_afr_mt_afr_node_character); - if (!characters) - goto out; - - this = THIS; - - /* start clean */ - memset (sources, 0, sizeof (*sources) * child_count); - nsources = 0; - afr_find_character_types (characters, pending_matrix, success_children, - child_count); - if (afr_sh_all_nodes_innocent (characters, child_count)) { - switch (type) { - case AFR_SELF_HEAL_METADATA: - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - success_children, - child_count, - sources); - break; - case AFR_SELF_HEAL_DATA: - nsources = afr_sh_mark_zero_size_file_as_sink (bufs, - success_children, - child_count, - sources); - if ((nsources < 0) && subvol_status) - *subvol_status |= SPLIT_BRAIN; - break; - default: - break; - } - goto out; - } - - if (afr_sh_wise_nodes_exist (characters, child_count)) { - afr_sh_compute_wisdom (pending_matrix, characters, child_count); - - if (afr_sh_wise_nodes_conflict (characters, child_count)) { - if (subvol_status) - *subvol_status |= SPLIT_BRAIN; - nsources = -1; - } else { - nsources = afr_sh_mark_wisest_as_sources (sources, - characters, - child_count); - } - } else { - if (subvol_status) - *subvol_status |= ALL_FOOLS; - nsources = afr_mark_biggest_of_fools_as_source (sources, - pending_matrix, - characters, - success_children, - child_count, bufs); - } + afr_local_t *local = NULL; + int i = 0; -out: - if (nsources == 0) - 
afr_mark_success_children_sources (sources, success_children, - child_count); - GF_FREE (characters); + local = frame->local; + i = (long) cookie; - gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); - return nsources; -} + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], unsigned char success[], - int child_count, afr_transaction_type type) -{ - int tgt = 0; - int src = 0; - int value = 0; - - afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, - xattr, type, priv->child_count); - - /* - * The algorithm here has two parts. First, for each subvol indexed - * as tgt, we try to figure out what count everyone should have for it. - * If the self-heal succeeded, that's easy; the value is zero. - * Otherwise, the value is the maximum of the succeeding nodes' counts. - * Once we know the value, we loop through (possibly for a second time) - * setting each count to the difference so that when we're done all - * succeeding nodes will have the same count for tgt. - */ - for (tgt = 0; tgt < priv->child_count; ++tgt) { - value = 0; - if (!success[tgt]) { - /* Find the maximum. */ - for (src = 0; src < priv->child_count; ++src) { - if (!success[src]) { - continue; - } - if (delta_matrix[src][tgt] > value) { - value = delta_matrix[src][tgt]; - } - } - } - /* Force everyone who succeeded to the chosen value. 
*/ - for (src = 0; src < priv->child_count; ++src) { - if (success[src]) { - delta_matrix[src][tgt] = value - - delta_matrix[src][tgt]; - } - else { - delta_matrix[src][tgt] = 0; - } - } - } + syncbarrier_wake (&local->barrier); + + return 0; } int -afr_sh_delta_to_xattr (xlator_t *this, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type) -{ - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; - int32_t *local_pending = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - for (i = 0; i < child_count; i++) { - if (!xattr[i]) - continue; - - local_pending = NULL; - for (j = 0; j < child_count; j++) { - pending = GF_CALLOC (sizeof (int32_t), 3, - gf_afr_mt_int32_t); - - if (!pending) { - gf_log (this->name, GF_LOG_ERROR, - "failed to allocate pending entry " - "for %s[%d] on %s", - priv->pending_key[j], type, - priv->children[i]->name); - continue; - } - /* 3 = data+metadata+entry */ - - k = afr_index_for_transaction_type (type); - - pending[k] = hton32 (delta_matrix[i][j]); - - if (j == i) { - local_pending = pending; - continue; - } - ret = dict_set_bin (xattr[i], priv->pending_key[j], - pending, - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - GF_FREE (pending); - } - } - if (local_pending) { - ret = dict_set_bin (xattr[i], priv->pending_key[i], - local_pending, - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value."); - GF_FREE (local_pending); - } - } - } - return 0; +afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this, + unsigned char *locked_on) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + 
count++; + } else { + locked_on[i] = 0; + } + } + + return count; } int -afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_reset (frame, this); - - if (local->unhealable) { - gf_log (this->name, GF_LOG_DEBUG, - "split brain found, aborting selfheal of %s", - local->loc.path); - } - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - sh->completion_cbk (frame, this); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - afr_self_heal_metadata (frame, this); - } - - return 0; -} + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -static int -afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - local = frame->local; - int_lock = &local->internal_lock; + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); - int_lock->lock_cbk = afr_sh_missing_entries_done; - afr_unlock (frame, this); + loc_wipe (&loc); - return 0; + return afr_selfheal_locked_fill (frame, this, locked_on); } + int -afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) -{ - int ret = -ENOMEM; - sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf), - gf_afr_mt_iatt); - if (!sh->buf) - goto out; - sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), - gf_afr_mt_iatt); - if (!sh->parentbufs) - goto out; - sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), - gf_afr_mt_int); - if (!sh->child_errno) - goto out; - sh->success_children = afr_children_create 
(child_count); - if (!sh->success_children) - goto out; - sh->fresh_children = afr_children_create (child_count); - if (!sh->fresh_children) - goto out; - sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), - gf_afr_mt_dict_t); - if (!sh->xattr) - goto out; - ret = 0; -out: - return ret; -} +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = {0,}; + struct gf_flock flock = {0, }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; + + AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLK, &flock, NULL); + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_uninodelk (frame, this, inode, dom, off, + size, locked_on); + + AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, + &loc, F_SETLKW, &flock, NULL); + break; + } + } -void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent, - loc_t *loc) -{ - int child_index = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == 0) { - sh->buf[child_index] = *buf; - sh->parentbufs[child_index] = *postparent; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - sh->xattr[child_index] = dict_ref (xattr); - } else { - gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" - " %s => -1 
(%s)", loc->path, - priv->children[child_index]->name, - strerror (op_errno)); - local->self_heal.child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - return; -} + loc_wipe (&loc); -gf_boolean_t -afr_valid_ia_type (ia_type_t ia_type) -{ - switch (ia_type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - case IA_IFDIR: - return _gf_true; - default: - return _gf_false; - } - return _gf_false; + return afr_selfheal_locked_fill (frame, this, locked_on); } + int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, call_frame_t **impunge_frame) +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t op_errno = 0; - afr_private_t *priv = NULL; - int ret = 0; - call_frame_t *new_frame = NULL; - - op_errno = ENOMEM; - priv = this->private; - new_frame = copy_frame (frame); - if (!new_frame) { - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); - - local = frame->local; - new_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = active_source; - impunge_local->child_up = memdup (local->child_up, - sizeof (*local->child_up) * - priv->child_count); - if (!impunge_local->child_up) - goto out; - - impunge_local->pending = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!impunge_local->pending) - goto out; - - ret = afr_sh_common_create (impunge_sh, priv->child_count); - if (ret) { - op_errno = -ret; - goto out; - } - op_errno = 0; - *impunge_frame = new_frame; -out: - if (op_errno && new_frame) - AFR_STACK_DESTROY (new_frame); - return -op_errno; -} + loc_t loc = {0,}; + struct gf_flock flock = {0, }; -void -afr_sh_missing_entry_call_impunge_recreate 
(call_frame_t *frame, xlator_t *this, - struct iatt *buf, - struct iatt *postparent, - afr_impunge_done_cbk_t impunge_done) -{ - call_frame_t *impunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int ret = 0; - unsigned int enoent_count = 0; - afr_private_t *priv = NULL; - int i = 0; - int32_t op_errno = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (!enoent_count) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - goto out; - } - sh->impunge_done = impunge_done; - ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); - if (ret) - goto out; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - loc_copy (&impunge_local->loc, &local->loc); - ret = afr_build_parent_loc (&impunge_sh->parent_loc, - &impunge_local->loc, &op_errno); - if (ret) { - ret = -op_errno; - goto out; - } - impunge_local->call_count = enoent_count; - impunge_sh->entrybuf = sh->buf[sh->source]; - impunge_sh->parentbuf = sh->parentbufs[sh->source]; - for (i = 0; i < priv->child_count; i++) { - if (!impunge_local->child_up[i]) { - impunge_sh->child_errno[i] = ENOTCONN; - continue; - } - if (sh->child_errno[i] != ENOENT) { - impunge_sh->child_errno[i] = EEXIST; - continue; - } - } - for (i = 0; i < priv->child_count; i++) { - if (sh->child_errno[i] != ENOENT) - continue; - afr_sh_entry_impunge_create (impunge_frame, this, i); - enoent_count--; - } - GF_ASSERT (!enoent_count); - return; -out: - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " - "reason: %s", local->loc.path, strerror (-ret)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - afr_sh_missing_entries_finish (frame, this); -} -int -afr_sh_create_entry_cbk 
(call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_finish (frame, this); - return 0; -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -static int -sh_missing_entries_create (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int type = 0; - struct iatt *buf = NULL; - struct iatt *postparent = NULL; - - local = frame->local; - sh = &local->self_heal; - - buf = &sh->buf[sh->source]; - postparent = &sh->parentbufs[sh->source]; - - type = buf->ia_type; - if (!afr_valid_ia_type (type)) { - gf_log (this->name, GF_LOG_ERROR, - "%s: unknown file type: 0%o", local->loc.path, type); - afr_set_local_for_unhealable (local); - afr_sh_missing_entries_finish (frame, this); - goto out; - } - - afr_sh_missing_entry_call_impunge_recreate (frame, this, - buf, postparent, - afr_sh_create_entry_cbk); -out: - return 0; -} + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; -void -afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t ia_type = IA_INVAL; - int32_t nsources = 0; - loc_t *loc = NULL; - int32_t subvol_status = 0; - afr_transaction_type txn_type = AFR_DATA_TRANSACTION; - gf_boolean_t split_brain = _gf_false; - int read_child = -1; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - loc = &local->loc; - - if (op_ret < 0) { - if (op_errno == EIO) { - afr_set_local_for_unhealable (local); - } - // EIO can happen if finding the fresh parent dir failed - goto out; - } - - //now No chance for the ia_type to conflict - ia_type = sh->buf[sh->success_children[0]].ia_type; - 
txn_type = afr_transaction_type_get (ia_type); - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, txn_type, - &subvol_status, _gf_false); - if (nsources < 0) { - gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - if (subvol_status & SPLIT_BRAIN) { - split_brain = _gf_true; - switch (txn_type) { - case AFR_DATA_TRANSACTION: - nsources = 1; - sh->sources[sh->success_children[0]] = 1; - break; - case AFR_ENTRY_TRANSACTION: - read_child = afr_get_no_xattr_dir_read_child - (this, - sh->success_children, - sh->buf); - sh->sources[read_child] = 1; - nsources = 1; - break; - default: - op_errno = EIO; - goto out; - } - } else { - op_errno = EIO; - goto out; - } - } - - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - sh->source = sh->fresh_children[0]; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - op_errno = EIO; - goto out; - } - - if (sh->gfid_sh_success_cbk) - sh->gfid_sh_success_cbk (frame, this); - sh->type = sh->buf[sh->source].ia_type; - if (uuid_is_null (loc->inode->gfid)) - uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); - if (split_brain) { - afr_sh_missing_entries_finish (frame, this); - } else { - sh_missing_entries_create (frame, this); - } - return; -out: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; -} + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk, + dom, &loc, F_SETLK, &flock, NULL); -static int -afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = 
NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &sh->lookup_loc); - call_count = afr_frame_return (frame); - - if (call_count) - goto out; - op_ret = -1; - if (!sh->success_count) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); - gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " - "reason %s", sh->lookup_loc.path, - strerror (op_errno)); - goto done; - } - - if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && - (afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, - sh->lookup_loc.path, this->name))) { - op_errno = EIO; - gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " - "for %s", sh->lookup_loc.path); - goto done; - } - - if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && - (afr_gfid_missing_count (this->name, sh->success_children, - sh->buf, priv->child_count, - sh->lookup_loc.path))) { - op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " - "for %s", sh->lookup_loc.path); - goto done; - } - op_ret = 0; - -done: - sh->lookup_done (frame, this, op_ret, op_errno); -out: - return 0; + loc_wipe (&loc); + + return 0; } + int -afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, - int32_t op_ret, int32_t op_errno) +afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->post_remove_call); - if ((op_ret == -1) && (op_errno != ENOENT)) { - gf_log (this->name, GF_LOG_ERROR, - "purge entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - LOCK (&frame->lock); - { - afr_sh_set_error (sh, EIO); - 
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - UNLOCK (&frame->lock); - } - call_count = afr_frame_return (frame); - if (call_count == 0) - sh->post_remove_call (frame, this); - return 0; -} + loc_t loc = {0,}; -void -afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *parentbuf, - afr_expunge_done_cbk_t expunge_done) -{ - call_frame_t *expunge_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *expunge_sh = NULL; - int32_t op_errno = 0; - int ret = 0; - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - - local = frame->local; - sh = &local->self_heal; - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - loc_copy (&expunge_local->loc, &local->loc); - ret = afr_build_parent_loc (&expunge_sh->parent_loc, - &expunge_local->loc, &op_errno); - if (ret) { - ret = -op_errno; - goto out; - } - sh->expunge_done = expunge_done; - afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, - parentbuf); - return; -out: - gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", - local->loc.path, strerror (op_errno)); - expunge_done (frame, this, child_index, -1, op_errno); -} + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -void -afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children, - int32_t *fresh_children, - unsigned int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) { - if (afr_is_child_present (success_children, child_count, i) && - !afr_is_child_present (fresh_children, child_count, i)) { - sh->child_errno[i] = ENOENT; - GF_ASSERT (sh->xattr[i]); - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - } -} + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, 
ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); -int -afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_missing_entries_finish (frame, this); - } else { - if (afr_gfid_missing_count (this->name, sh->fresh_children, - sh->buf, priv->child_count, - local->loc.path)) { - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_done, - sh->sh_gfid_req, - AFR_LOOKUP_FAIL_CONFLICTS| - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } else { - //No need to set gfid so goto missing entries lookup done - //Behave as if you have done the lookup - afr_sh_remove_stale_lookup_info (sh, - sh->success_children, - sh->fresh_children, - priv->child_count); - afr_children_copy (sh->success_children, - sh->fresh_children, - priv->child_count); - afr_sh_missing_entries_lookup_done (frame, this, 0, 0); - } - } - return 0; + loc_wipe (&loc); + + return afr_selfheal_locked_fill (frame, this, locked_on); } -gf_boolean_t -afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv, - int child) + +int +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_self_heal_t *sh = NULL; + loc_t loc = {0,}; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; - sh = &local->self_heal; + priv = this->private; + local = frame->local; - if (local->child_up[child] && - (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count, - child)) - && (sh->child_errno[child] != ENOENT)) - return _gf_true; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - return _gf_false; -} + AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); -gf_boolean_t 
-afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv, - int child) -{ - afr_self_heal_t *sh = NULL; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_selfheal_locked_fill (frame, this, locked_on); + afr_selfheal_unentrylk (frame, this, inode, dom, name, + locked_on); - sh = &local->self_heal; + AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, + &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; + } + } - if (local->child_up[child] && - (!afr_is_child_present (sh->fresh_children, priv->child_count, - child)) - && (sh->child_errno[child] != ENOENT)) - return _gf_true; + loc_wipe (&loc); - return _gf_false; + return afr_selfheal_locked_fill (frame, this, locked_on); } -void -afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, - gf_boolean_t purge_condition (afr_local_t *local, - afr_private_t *priv, - int child)) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (purge_condition (local, priv, i)) - call_count++; - } - - if (call_count == 0) { - sh->post_remove_call (frame, this); - goto out; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!purge_condition (local, priv, i)) - continue; - gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " - "on %s", local->loc.path, priv->children[i]->name); - afr_sh_call_entry_expunge_remove (frame, this, - (long) i, &sh->buf[i], - &sh->parentbufs[i], - afr_sh_remove_entry_cbk); - } -out: - return; -} -void -afr_sh_purge_entry (call_frame_t *frame, xlator_t *this) +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = 
NULL; + loc_t loc = {0,}; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, + dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - local = frame->local; - sh = &local->self_heal; - sh->post_remove_call = afr_sh_missing_entries_finish; + loc_wipe (&loc); - afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition); + return 0; } -void -afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_is_pending_set (xlator_t *this, dict_t *xdata, int type) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; + int idx = -1; + afr_private_t *priv = NULL; + void *pending_raw = NULL; + int *pending_int = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + priv = this->private; + idx = afr_index_for_transaction_type (type); - sh->post_remove_call = afr_sh_purge_stale_entries_done; + if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { + if (pending_raw) { + pending_int = pending_raw; - for (i = 0; i < priv->child_count; i++) { - if (afr_is_child_present (sh->fresh_children, - priv->child_count, i)) - continue; + if (ntoh32 (pending_int[idx])) + return _gf_true; + } + } - if ((!local->child_up[i]) || sh->child_errno[i] != 0) - continue; + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr (xdata, priv->pending_key[i], + &pending_raw)) + continue; + if (!pending_raw) + continue; + pending_int = pending_raw; - GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) || - uuid_is_null (sh->buf[i].ia_gfid)); + if (ntoh32 (pending_int[idx])) + return _gf_true; + } - if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) || - (uuid_compare (sh->buf[i].ia_gfid, - sh->entrybuf.ia_gfid))) - continue; + return _gf_false; +} - afr_children_add_child (sh->fresh_children, i, - priv->child_count); - } - afr_sh_purge_entry_common (frame, this, - 
afr_sh_purge_stale_entry_condition); +gf_boolean_t +afr_is_data_set (xlator_t *this, dict_t *xdata) +{ + return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION); } -void -afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, - struct iatt *save, - unsigned int child_count) +gf_boolean_t +afr_is_metadata_set (xlator_t *this, dict_t *xdata) { - int i = 0; - int child = 0; - gf_boolean_t saved = _gf_false; - - GF_ASSERT (save); - //if iatt buf with gfid exists sets it - for (i = 0; i < child_count; i++) { - child = children[i]; - if (child == -1) - break; - *save = bufs[child]; - saved = _gf_true; - if (!uuid_is_null (save->ia_gfid)) - break; - } - GF_ASSERT (saved); + return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION); } -void -afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh, - unsigned int child_count) +gf_boolean_t +afr_is_entry_set (xlator_t *this, dict_t *xdata) { - afr_children_intersection_get (sh->success_children, - sh->fresh_parent_dirs, - sh->sources, child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, child_count); - memset (sh->sources, 0, sizeof (*sh->sources) * child_count); + return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION); } + void -afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) +afr_inode_link (inode_t *inode, struct iatt *iatt) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t fresh_child_enoents = 0; - int32_t fresh_parent_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) - goto fail; - afr_get_children_of_fresh_parent_dirs (sh, priv->child_count); - fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, - priv->child_count); - //we need the enoent count of the subvols present in fresh_parent_dirs - fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs, - 
sh->child_errno, - priv->child_count, ENOENT); - if (fresh_child_enoents == fresh_parent_count) { - afr_sh_set_error (sh, ENOENT); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_purge_entry (frame, this); - } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, - priv->child_count, local->loc.path, - this->name)) { - afr_sh_save_child_iatts_from_policy (sh->fresh_children, - sh->buf, &sh->entrybuf, - priv->child_count); - afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf, - sh->fresh_children, - priv->child_count); - afr_sh_purge_stale_entry (frame, this); - } else { - op_errno = EIO; - afr_set_local_for_unhealable (local); - goto fail; - } - - return; - -fail: - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_missing_entries_finish (frame, this); - return; -} + inode_t *linked_inode = NULL; -static void -afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int enoent_count = 0; - int nsources = 0; - int source = -1; - int32_t subvol_status = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) - goto out; - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (enoent_count > 0) { - gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," - " in missing entry self-heal, aborting missing-entry " - "self-heal", - local->loc.path); - afr_sh_missing_entries_finish (frame, this); - return; - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_ENTRY_TRANSACTION, &subvol_status, - _gf_true); - if ((subvol_status & ALL_FOOLS) || - (subvol_status & SPLIT_BRAIN)) { - gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " - "merge", sh->parent_loc.path); - afr_mark_success_children_sources 
(sh->sources, - sh->success_children, - priv->child_count); - } else if (nsources < 0) { - gf_log (this->name, GF_LOG_ERROR, "No sources for dir " - "of %s, in missing entry self-heal, aborting " - "self-heal", local->loc.path); - op_errno = EIO; - goto out; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - if (source == -1) { - GF_ASSERT (0); - gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); - op_errno = EIO; - goto out; - } - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_parent_dirs, priv->child_count); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_children_lookup_done, NULL, 0, - NULL); - return; + linked_inode = inode_link (inode, NULL, NULL, iatt); -out: - afr_sh_set_error (sh, op_errno); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_finish (frame, this); - return; -} + uuid_copy (inode->gfid, iatt->ia_gfid); + inode->ia_type = iatt->ia_type; -void -afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) -{ - int i = 0; - - for (i = 0; i < child_count; i++) { - memset (&sh->buf[i], 0, sizeof (sh->buf[i])); - memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i])); - sh->child_errno[i] = 0; - } - memset (&sh->parentbuf, 0, sizeof (sh->parentbuf)); - sh->success_count = 0; - afr_reset_children (sh->success_children, child_count); - afr_reset_children (sh->fresh_children, child_count); - afr_reset_xattr (sh->xattr, child_count); - loc_wipe (&sh->lookup_loc); + if (linked_inode) { + inode_lookup (linked_inode); + inode_unref (linked_inode); + } } -/* afr self-heal state will be lost if this call is made - * please check the afr_sh_common_reset that is called in this function + +/* + * This function inspects the looked up replies (in an unlocked manner) + * and decides whether a locked verification and possible healing is + * required or not. It updates the three booleans for each type + * of healing. 
If the boolean flag gets set to FALSE, then we are sure + * no healing is required. If the boolean flag gets set to TRUE then + * we have to proceed with locked reinspection. */ + int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_done_cbk_t lookup_done , uuid_t gfid, - int32_t flags, dict_t *xdata) +afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *inode, uuid_t gfid, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xattr_req = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - local->call_count = call_count; - - xattr_req = dict_new(); - - if (xattr_req) { - afr_xattr_req_prepare (this, xattr_req, loc->path); - if (gfid) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s with gfid: %s", - loc->path, uuid_utoa (gfid)); - GF_ASSERT (!uuid_is_null (gfid)); - afr_set_dict_gfid (xattr_req, gfid); - } - } - - afr_sh_common_reset (sh, priv->child_count); - sh->lookup_done = lookup_done; - loc_copy (&sh->lookup_loc, loc); - sh->lookup_flags = flags; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - gf_log (this->name, GF_LOG_DEBUG, - "looking up %s on subvolume %s", - loc->path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, - afr_sh_common_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - loc, xattr_req); - - if (!--call_count) - break; - } - } - - if (xattr_req) - dict_unref (xattr_req); - - return 0; -} + afr_private_t *priv = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = {0, }; + struct afr_reply *replies = NULL; + int ret = -1; + priv = this->private; + replies = alloca0 (sizeof (*replies) * priv->child_count); -int 
-afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, - xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_missing_entries_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &sh->parent_loc, - afr_sh_find_fresh_parents, - NULL, AFR_LOOKUP_FAIL_CONFLICTS, - NULL); - } - - return 0; -} + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + if (ret) + return ret; -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *base_name, afr_lock_cbk_t lock_cbk) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + if (afr_is_data_set (this, replies[i].xdata)) + *data_selfheal = _gf_true; - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK; + if (afr_is_metadata_set (this, replies[i].xdata)) + *metadata_selfheal = _gf_true; - afr_set_lock_number (frame, this); + if (afr_is_entry_set (this, replies[i].xdata)) + *entry_selfheal = _gf_true; - int_lock->lk_basename = base_name; - int_lock->lk_loc = loc; - int_lock->lock_cbk = lock_cbk; - int_lock->domain = this->name; + valid_cnt ++; + if (valid_cnt == 1) { + first = replies[i].poststat; + continue; + } - int_lock->lockee_count = 0; - afr_init_entry_lockee (&int_lock->lockee[0], local, loc, - base_name, priv->child_count); - 
int_lock->lockee_count++; - afr_nonblocking_entrylk (frame, this); + if (!IA_EQUAL (first, replies[i].poststat, type)) { + gf_log (this->name, GF_LOG_ERROR, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_type, + (int) replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); + return -EIO; + } - return 0; -} + if (!IA_EQUAL (first, replies[i].poststat, uid)) { + gf_log (this->name, GF_LOG_DEBUG, + "UID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); -static int -afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, - afr_lock_cbk_t lock_cbk) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_internal_lock_t *int_lock = NULL; - int ret = -1; - int32_t op_errno = 0; - - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_TRACE, - "attempting to recreate missing entries for path=%s", - local->loc.path); - - ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); - if (ret) - goto out; - - afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, - lock_cbk); - return 0; -out: - int_lock = &local->internal_lock; - int_lock->lock_op_ret = -1; - lock_cbk (frame, this); - return 0; -} + *metadata_selfheal = _gf_true; + } -static int -afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + if (!IA_EQUAL (first, replies[i].poststat, gid)) { + gf_log (this->name, GF_LOG_DEBUG, + "GID mismatch %d vs %d on %s for gfid:%s", + (int) first.ia_uid, + (int) replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - local = frame->local; - sh = &local->self_heal; + *metadata_selfheal = _gf_true; + } - sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + if (!IA_EQUAL (first, replies[i].poststat, prot)) { + gf_log 
(this->name, GF_LOG_DEBUG, + "MODE mismatch %d vs %d on %s for gfid:%s", + (int) st_mode_from_ia (first.ia_prot, 0), + (int) st_mode_from_ia (replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + *metadata_selfheal = _gf_true; + } - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_missing_entry_sh_cbk); - return 0; -} + if (IA_ISREG(first.ia_type) && + !IA_EQUAL (first, replies[i].poststat, size)) { + gf_log (this->name, GF_LOG_DEBUG, + "SIZE mismatch %lld vs %lld on %s for gfid:%s", + (long long) first.ia_size, + (long long) replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa (replies[i].poststat.ia_gfid)); -afr_local_t* -afr_self_heal_local_init (afr_local_t *l, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; - int ret = 0; - - priv = this->private; - - sh = &l->self_heal; - - lc = mem_get0 (this->local_pool); - if (!lc) - goto out; - - shc = &lc->self_heal; - - shc->unwind = sh->unwind; - shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk; - shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal; - shc->do_gfid_self_heal = sh->do_gfid_self_heal; - shc->do_data_self_heal = sh->do_data_self_heal; - shc->do_metadata_self_heal = sh->do_metadata_self_heal; - shc->do_entry_self_heal = sh->do_entry_self_heal; - shc->force_confirm_spb = sh->force_confirm_spb; - shc->forced_merge = sh->forced_merge; - shc->background = sh->background; - shc->type = sh->type; - shc->data_sh_info = ""; - shc->metadata_sh_info = ""; - - uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) { - ret = loc_copy (&lc->loc, &l->loc); - if (ret < 0) - goto out; - } - - lc->child_up = memdup (l->child_up, - sizeof (*lc->child_up) * priv->child_count); - if (!lc->child_up) { - ret = -1; - goto out; - } - - if (l->xattr_req) - lc->xattr_req = dict_ref 
(l->xattr_req); - - if (l->cont.lookup.inode) - lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); - if (l->cont.lookup.xattr) - lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, gf_afr_mt_char); - if (!lc->internal_lock.locked_nodes) { - ret = -1; - goto out; - } - - ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], - this->name, priv->child_count); - if (ret) - goto out; + *data_selfheal = _gf_true; + } + } -out: - if (ret) { - afr_local_cleanup (lc, this); - lc = NULL; - } - return lc; -} + if (valid_cnt > 0) + afr_inode_link (inode, &first); -int -afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - afr_local_t * orig_frame_local = NULL; - afr_self_heal_t * orig_frame_sh = NULL; - char sh_type_str[256] = {0,}; - gf_loglevel_t loglevel = 0; - - priv = this->private; - local = bgsh_frame->local; - sh = &local->self_heal; - - if (local->unhealable) { - afr_set_split_brain (this, sh->inode, SPB, SPB); - } - - afr_self_heal_type_str_get (sh, sh_type_str, - sizeof(sh_type_str)); - if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { - loglevel = GF_LOG_ERROR; - } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { - loglevel = GF_LOG_INFO; - } else { - loglevel = GF_LOG_DEBUG; - } - - afr_log_self_heal_completion_status (local, loglevel); - - FRAME_SU_UNDO (bgsh_frame, afr_local_t); - - if (!sh->unwound && sh->unwind) { - orig_frame_local = sh->orig_frame->local; - orig_frame_sh = &orig_frame_local->self_heal; - orig_frame_sh->actual_sh_started = _gf_true; - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - is_self_heal_failed (sh, AFR_CHECK_ALL)); - } - - if (sh->background) { - LOCK (&priv->lock); - { - priv->background_self_heals_started--; - } - UNLOCK (&priv->lock); - } - - AFR_STACK_DESTROY 
(bgsh_frame); - - return 0; -} - -int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t op_errno = 0; - int ret = 0; - afr_self_heal_t *orig_sh = NULL; - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; - loc_t *loc = NULL; - - local = frame->local; - orig_sh = &local->self_heal; - priv = this->private; - - GF_ASSERT (local->loc.path); - - gf_log (this->name, GF_LOG_TRACE, - "performing self heal on %s (metadata=%d data=%d entry=%d)", - local->loc.path, - local->self_heal.do_metadata_self_heal, - local->self_heal.do_data_self_heal, - local->self_heal.do_entry_self_heal); - - op_errno = ENOMEM; - sh_frame = copy_frame (frame); - if (!sh_frame) - goto out; - afr_set_lk_owner (sh_frame, this, sh_frame->root); - afr_set_low_priority (sh_frame); - - sh_local = afr_self_heal_local_init (local, this); - if (!sh_local) - goto out; - sh_frame->local = sh_local; - sh = &sh_local->self_heal; - - sh->inode = inode_ref (inode); - sh->orig_frame = frame; - - sh->completion_cbk = afr_self_heal_completion_cbk; - - sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), - gf_afr_mt_char); - if (!sh->success) - goto out; - sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, - gf_afr_mt_int); - if (!sh->sources) - goto out; - sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), - priv->child_count, - gf_afr_mt_int); - if (!sh->locked_nodes) - goto out; - - sh->pending_matrix = afr_matrix_create (priv->child_count, - priv->child_count); - if (!sh->pending_matrix) - goto out; - - sh->delta_matrix = afr_matrix_create (priv->child_count, - priv->child_count); - if (!sh->delta_matrix) - goto out; - - sh->fresh_parent_dirs = afr_children_create (priv->child_count); - if (!sh->fresh_parent_dirs) - goto out; - ret = afr_sh_common_create (sh, priv->child_count); - if (ret) { - op_errno = -ret; - goto out; - } - - if 
(local->self_heal.background) { - LOCK (&priv->lock); - { - if (priv->background_self_heals_started - < priv->background_self_heal_count) { - priv->background_self_heals_started++; - - - } else { - local->self_heal.background = _gf_false; - sh->background = _gf_false; - } - } - UNLOCK (&priv->lock); - } - - if (!local->loc.parent) { - sh->do_missing_entry_self_heal = _gf_false; - sh->do_gfid_self_heal = _gf_false; - } - - sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; - - FRAME_SU_DO (sh_frame, afr_local_t); - if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { - afr_self_heal_missing_entries (sh_frame, this); - } else { - loc = &sh_local->loc; - if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { - if (!uuid_is_null (inode->gfid)) - GF_ASSERT (!uuid_compare (inode->gfid, - sh->sh_gfid_req)); - uuid_copy (loc->gfid, sh->sh_gfid_req); - } - gf_log (this->name, GF_LOG_TRACE, - "proceeding to metadata check on %s", - local->loc.path); - - afr_sh_missing_entries_done (sh_frame, this); - } - op_errno = 0; + if (valid_cnt < 2) + return -ENOTCONN; -out: - if (op_errno) { - orig_sh->unwind (frame, this, -1, op_errno, 1); - if (sh_frame) - AFR_STACK_DESTROY (sh_frame); - } - return 0; + return 0; } -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size) + +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid) { - GF_ASSERT (str && (size > strlen (" missing-entry gfid " - "meta-data data entry"))); + inode_table_t *table = NULL; + inode_t *inode = NULL; - if (self_heal_p->do_metadata_self_heal) { - snprintf (str, size, " meta-data"); - } + table = this->itable; + if (!table) + return NULL; - if (self_heal_p->do_data_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " data"); - } + inode = inode_find (table, gfid); + if (inode) + return inode; - if (self_heal_p->do_entry_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " entry"); - } + inode = inode_new (table); + if (!inode) + return 
NULL; - if (self_heal_p->do_missing_entry_self_heal) { - snprintf (str + strlen(str), size - strlen(str), - " missing-entry"); - } + uuid_copy (inode->gfid, gfid); - if (self_heal_p->do_gfid_self_heal) { - snprintf (str + strlen(str), size - strlen(str), " gfid"); - } + return inode; } -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type) -{ - afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; - - switch (type) { - case AFR_DATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_DATA; - break; - case AFR_METADATA_TRANSACTION: - sh_type = AFR_SELF_HEAL_METADATA; - break; - case AFR_ENTRY_TRANSACTION: - sh_type = AFR_SELF_HEAL_ENTRY; - break; - case AFR_ENTRY_RENAME_TRANSACTION: - GF_ASSERT (0); - break; - } - return sh_type; -} -int -afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +call_frame_t * +afr_frame_create (xlator_t *this) { - int ret = -1; - uuid_t pargfid = {0}; - - if (!child) - goto out; - - if (!uuid_is_null (parent->inode->gfid)) - uuid_copy (pargfid, parent->inode->gfid); - else if (!uuid_is_null (parent->gfid)) - uuid_copy (pargfid, parent->gfid); - - if (uuid_is_null (pargfid)) - goto out; - - if (strcmp (parent->path, "/") == 0) - ret = gf_asprintf ((char **)&child->path, "/%s", name); - else - ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, - name); + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int op_errno = 0; + pid_t pid = -1; - if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed while setting child path"); - } + frame = create_frame (this, this->ctx->pool); + if (!frame) + return NULL; - child->name = strrchr (child->path, '/'); - if (child->name) - child->name++; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + STACK_DESTROY (frame->root); + return NULL; + } - child->parent = inode_ref (parent->inode); - child->inode = inode_new (parent->inode->table); - uuid_copy (child->pargfid, pargfid); + syncopctx_setfspid (&pid); - if 
(!child->inode) { - ret = -1; - goto out; - } + frame->root->pid = pid; - ret = 0; -out: - if ((ret == -1) && child) - loc_wipe (child); + afr_set_lk_owner (frame, this, frame->root); - return ret; + return frame; } -int -afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, - afr_transaction_type type, afr_fxattrop_cbk_t cbk, - int (*finish)(call_frame_t *frame, xlator_t *this)) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - dict_t **erase_xattr = NULL; - int ret = -1; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, - sh->success, priv->child_count, type); - - erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, - gf_afr_mt_dict_t); - if (!erase_xattr) - goto out; - - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) { - call_count++; - erase_xattr[i] = dict_new (); - if (!erase_xattr[i]) - goto out; - } - } - - afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, - priv->child_count, type); - - gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", - lkowner_utoa (&frame->root->lk_owner)); - afr_sh_print_pending_matrix (sh->delta_matrix, this); - local->call_count = call_count; - if (call_count == 0) { - ret = 0; - finish (frame, this); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (!erase_xattr[i]) - continue; - - if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction - STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, - GF_XATTROP_ADD_ARRAY, erase_xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, erase_xattr[i], - NULL); - } - } - - ret = 0; -out: - if (erase_xattr) { - for (i = 0; i < priv->child_count; i++) { - if 
(erase_xattr[i]) { - dict_unref (erase_xattr[i]); - } - } - } - - GF_FREE (erase_xattr); - - if (ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - finish (frame, this); - } - - return 0; -} -void -afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) -{ - xlator_t *this = NULL; - afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); - afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; - this = THIS; - - if (!sh) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" - "Structure"); - goto out; - } - - switch (sh_type_in_action) { - case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: - sh_status->gfid_or_missing_entry_self_heal = status; - break; - case AFR_SELF_HEAL_METADATA: - sh_status->metadata_self_heal = status; - break; - case AFR_SELF_HEAL_DATA: - sh_status->data_self_heal = status; - break; - case AFR_SELF_HEAL_ENTRY: - sh_status->entry_self_heal = status; - break; - case AFR_SELF_HEAL_INVALID: - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" - "self heal type in action"); - break; - } -out: - return; -} +/* + * This is the entry point for healing a given GFID + */ -void -afr_set_local_for_unhealable (afr_local_t *local) +int +afr_selfheal (xlator_t *this, uuid_t gfid) { - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; + inode_t *inode = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; - local->unhealable = 1; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); -} + inode = afr_inode_find (this, gfid); + if (!inode) + goto out; -int -is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) -{ - afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; - afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; - afr_self_heal_status status = AFR_SELF_HEAL_FAILED; - xlator_t *this = NULL; - int sh_failed = 0; - - this = THIS; 
- - if (!sh) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " - "structure"); - sh_failed = 1; - goto out; - } - - if (type == AFR_CHECK_ALL) { - if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) - || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) - sh_failed = 1; - } else if (type == AFR_CHECK_SPECIFIC) { - sh_type_in_action = sh->sh_type_in_action; - switch (sh_type_in_action) { - case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: - status = sh_status.gfid_or_missing_entry_self_heal; - break; - case AFR_SELF_HEAL_METADATA: - status = sh_status.metadata_self_heal; - break; - case AFR_SELF_HEAL_ENTRY: - status = sh_status.entry_self_heal; - break; - case AFR_SELF_HEAL_DATA: - status = sh_status.data_self_heal; - break; - case AFR_SELF_HEAL_INVALID: - status = AFR_SELF_HEAL_NOT_ATTEMPTED; - break; - } - if (status == AFR_SELF_HEAL_FAILED) - sh_failed = 1; - - } + frame = afr_frame_create (this); + if (!frame) + goto out; -out: - return sh_failed; -} + ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid, + &data_selfheal, + &metadata_selfheal, + &entry_selfheal); + if (ret) + goto out; -char * -get_sh_completion_status (afr_self_heal_status status) -{ + if (data_selfheal) + afr_selfheal_data (frame, this, inode); - char *not_attempted = " is not attempted"; - char *failed = " failed"; - char *started = " is started"; - char *sync_begin = " is successfully completed"; - char *result = " has unknown status"; - - switch (status) - { - case AFR_SELF_HEAL_NOT_ATTEMPTED: - result = not_attempted; - break; - case AFR_SELF_HEAL_FAILED: - result = failed; - break; - case AFR_SELF_HEAL_STARTED: - result = started; - break; - case AFR_SELF_HEAL_SYNC_BEGIN: - result = sync_begin; - break; - } - - return result; + if (metadata_selfheal) + afr_selfheal_metadata (frame, this, inode); -} + if (entry_selfheal) + 
afr_selfheal_entry (frame, this, inode); -void -afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) -{ + inode_forget (inode, 1); +out: + if (inode) + inode_unref (inode); + if (frame) + AFR_STACK_DESTROY (frame); - char sh_log[4096] = {0}; - afr_self_heal_t *sh = &local->self_heal; - afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; - xlator_t *this = NULL; - size_t off = 0; - int data_sh = 0; - int metadata_sh = 0; - int print_log = 0; - - this = THIS; - - ADD_FMT_STRING (sh_log, off, "gfid or missing entry", - all_status.gfid_or_missing_entry_self_heal, print_log); - ADD_FMT_STRING_SYNC (sh_log, off, "metadata", - all_status.metadata_self_heal, print_log); - if (sh->background) { - ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", - all_status.data_self_heal, print_log); - } else { - ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", - all_status.data_self_heal, print_log); - } - ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, - print_log); - - if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && - strcmp (sh->data_sh_info, "") && sh->data_sh_info ) - data_sh = 1; - if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && - strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) - metadata_sh = 1; - - if (!print_log) - return; - - gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, - ((data_sh == 1) ? sh->data_sh_info : ""), - ((metadata_sh == 1) ? sh->metadata_sh_info : ""), - local->loc.path); + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h deleted file mode 100644 index 473264776..000000000 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - Copyright (c) 2008-2012 Red Hat, Inc. - This file is part of GlusterFS. 
- - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef __AFR_SELF_HEAL_COMMON_H__ -#define __AFR_SELF_HEAL_COMMON_H__ - -#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512)) -#define AFR_SH_MIN_PARTICIPANTS 2 - -typedef enum { - AFR_LOOKUP_FAIL_CONFLICTS = 1, - AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, -} afr_lookup_flags_t; - -int -afr_sh_select_source (int sources[], int child_count); - -int -afr_sh_source_count (int sources[], int child_count); - -void -afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); - -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc); - -int -afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, - unsigned char *ignorant_subvols, - dict_t *xattr[], afr_transaction_type type, - size_t child_count); - -void -afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, - int32_t *delta_matrix[], unsigned char success[], - int child_count, afr_transaction_type type); - -int -afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, - struct iatt *bufs, afr_self_heal_type type, - int32_t *success_children, int32_t *subvol_status); - -int -afr_sh_delta_to_xattr (xlator_t *this, - int32_t *delta_matrix[], dict_t *xattr[], - int child_count, afr_transaction_type type); - -void -afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, - size_t size); - -afr_self_heal_type -afr_self_heal_type_for_transaction (afr_transaction_type type); - -int -afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, - int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type, - int32_t *subvol_status, gf_boolean_t ignore_ignorant); -void -afr_sh_common_reset 
(afr_self_heal_t *sh, unsigned int child_count); - -void -afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent, - loc_t *loc); - -int -afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid, - int32_t flags, dict_t *xdata); -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf, - struct iatt *parentbuf); -int -afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *base_name, afr_lock_cbk_t lock_cbk); -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index); -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, - afr_lock_cbk_t lock_cbk); -afr_local_t * -afr_self_heal_local_init (afr_local_t *l, xlator_t *this); -int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, char *dom, - afr_lock_cbk_t success_handler, - afr_lock_cbk_t failure_handler); -void -afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno); -void -afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this); -typedef int -(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata); -int -afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name); -int -afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, - int active_source, call_frame_t **impunge_frame); -void -afr_sh_reset (call_frame_t *frame, xlator_t *this); - -void -afr_children_intersection_get (int32_t *set1, int32_t *set2, - int *intersection, unsigned int child_count); -int -afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, - struct iatt *bufs); -int -afr_sh_erase_pending (call_frame_t *frame, xlator_t 
*this, - afr_transaction_type type, afr_fxattrop_cbk_t cbk, - int (*finish)(call_frame_t *frame, xlator_t *this)); - -void -afr_set_local_for_unhealable (afr_local_t *local); - -int -is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); - -void -afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); - -void -afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); - -char* -afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); -#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 9de26ee56..c0385153f 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. + Copyright (c) 2013 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,1747 +8,609 @@ cases as published by the Free Software Foundation. 
*/ -#include -#include -#include -#include -#include -#include #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" -#include "afr-self-heal-algorithm.h" - -int -afr_sh_data_fail (call_frame_t *frame, xlator_t *this); - -static inline gf_boolean_t -afr_sh_data_proceed (unsigned int success_count) -{ - return (success_count >= AFR_SH_MIN_PARTICIPANTS); -} - -extern int -sh_loop_finish (call_frame_t *loop_frame, xlator_t *this); +#include "byte-order.h" -int -afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this); +enum { + AFR_SELFHEAL_DATA_FULL = 0, + AFR_SELFHEAL_DATA_DIFF, +}; -int -afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this); -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this); - -int -afr_sh_data_done (call_frame_t *frame, xlator_t *this) +#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) +static int +__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, uint32_t weak, uint8_t *strong, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + int i = (long) cookie; - local = frame->local; - sh = &local->self_heal; + local = frame->local; - sh->completion_cbk (frame, this); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (strong) + memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); - return 0; + syncbarrier_wake (&local->barrier); + return 0; } -int -afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t 
op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "flush failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_done (frame, this); - } - - return 0; -} - -int -afr_sh_data_close (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (!sh->healing_fd) { - //This happens when file is non-reg - afr_sh_data_done (frame, this); - return 0; - } - call_count = afr_set_elem_count_get (sh->success, - priv->child_count); - local->call_count = call_count; - - if (call_count == 0) { - afr_sh_data_done (frame, this); - return 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i]) - continue; - gf_log (this->name, GF_LOG_DEBUG, - "closing fd of %s on %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - sh->healing_fd, NULL); - - if (!--call_count) - break; - } - - return 0; -} - -int -afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (sh->sh_dom_lock_held) - afr_sh_data_unlock (frame, this, priv->sh_domain, - afr_sh_data_close); - else - afr_sh_data_close (frame, this); - return 0; -} - -int -afr_sh_data_setattr_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *statpre, - struct iatt *statpost, dict_t *xdata) +static int +attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, struct iatt *post, + dict_t *xdata) { + int i = (long) cookie; + afr_local_t *local = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = (long) cookie; + local = frame->local; - local = frame->local; - priv = this->private; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (pre) + local->replies[i].prestat = *pre; + if (post) + local->replies[i].poststat = *post; + if (xdata) + local->replies[i].xdata = dict_ref (xdata); - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "setattr failed on %s on subvolume %s: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - } - } - UNLOCK (&frame->lock); + syncbarrier_wake (&local->barrier); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_sh_data_finish (frame, this); - } - - return 0; + return 0; } -int -afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - call_count = afr_set_elem_count_get (sh->success, - priv->child_count); - local->call_count = call_count; - - if (call_count == 0) { - GF_ASSERT (0); - afr_sh_data_finish (frame, this); - return 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, 
stbuf, valid, NULL); - - if (!--call_count) - break; - } - - return 0; -} -int -afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) +static gf_boolean_t +__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this, + fd_t *fd, int source, + unsigned char *healed_sinks, + off_t offset, size_t size) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->source == child_index); - if (op_ret != -1) { - sh->buf[child_index] = *buf; - afr_sh_data_setattr (frame, this, buf); - } else { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " - "time-stamps after self-heal", local->loc.path); - afr_sh_data_fail (frame, this); - } - - return 0; -} + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + unsigned char *wind_subvols = NULL; + int i = 0; -/* - * If there are any writes after the self-heal is triggered then the - * stbuf stored in local->self_heal.buf[] will be invalid so we do one more - * stat on the source and then set the [am]times - */ -int -afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk, - (void *) (long) sh->source, - priv->children[sh->source], - priv->children[sh->source]->fops->fstat, - sh->healing_fd, NULL); - return 0; -} - -//Fun fact, lock_cbk is being used for both lock & unlock -int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, - afr_lock_cbk_t lock_cbk) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int ret = 0; - - local = frame->local; - int_lock = &local->internal_lock; - 
sh = &local->self_heal; - priv = this->private; - - if (strcmp (dom, this->name) == 0) { - sh->data_lock_held = _gf_false; - } else if (strcmp (dom, priv->sh_domain) == 0) { - sh->sh_dom_lock_held = _gf_false; - } else { - ret = -1; - goto out; - } - int_lock->lock_cbk = lock_cbk; - int_lock->domain = dom; - afr_unlock (frame, this); - -out: - if (ret) { - int_lock->lock_op_ret = -1; - int_lock->lock_cbk (frame, this); - } - return 0; -} - -int -afr_sh_data_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "finishing data selfheal of %s", local->loc.path); - - if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); - else - afr_sh_dom_unlock (frame, this); - - return 0; -} - -int -afr_sh_data_fail (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + priv = this->private; + local = frame->local; - local = frame->local; - sh = &local->self_heal; + wind_subvols = alloca0 (priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (i == source || healed_sinks[i]) + wind_subvols[i] = 1; + } - gf_log (this->name, GF_LOG_DEBUG, - "finishing failed data selfheal of %s", local->loc.path); + AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd, + offset, size, NULL); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_data_finish (frame, this); - return 0; -} + if (!local->replies[source].valid || local->replies[source].op_ret != 0) + return _gf_false; -int -afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int32_t child_index = (long) cookie; - - priv = this->private; - local = frame->local; - sh = 
&local->self_heal; - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change " - "log failed on %s for subvol %s, reason: %s", - local->loc.path, priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - if (sh->old_loop_frame) - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - afr_sh_data_fail (frame, this); - goto out; - } - if (!IA_ISREG (sh->type)) { - afr_sh_data_finish (frame, this); - goto out; - } - GF_ASSERT (sh->old_loop_frame); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, - afr_post_sh_big_lock_success, - afr_post_sh_big_lock_failure); - } -out: - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + if (memcmp (local->replies[source].checksum, + local->replies[i].checksum, + MD5_DIGEST_LENGTH)) + return _gf_false; + } -int -afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, - afr_sh_data_erase_pending_cbk, - afr_sh_data_finish); - return 0; + return _gf_true; } -int -afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int call_count = 0; - int child_index = (long) cookie; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - if (op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " - "%s - %s", local->loc.path, - priv->children[child_index]->name, strerror (op_errno)); - LOCK (&frame->lock); - { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - UNLOCK (&frame->lock); - if (sh->old_loop_frame) - sh_loop_finish (sh->old_loop_frame, this); - 
sh->old_loop_frame = NULL; - } - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - afr_sh_data_fail (frame, this); - else - afr_sh_data_erase_pending (frame, this); - } - return 0; -} -/* - * Before erasing xattrs, make sure the data is written to disk - */ -int -afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - int i = 0; - int call_count = 0; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - - call_count = sh->active_sinks; - if (call_count == 0) { - afr_sh_data_erase_pending (frame, this); - return 0; - } - - local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!sh->success[i] || sh->sources[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, - sh->healing_fd, 1, NULL); - } - - return 0; -} - -static struct afr_sh_algorithm * -sh_algo_from_name (xlator_t *this, char *name) +static int +__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct afr_reply *replies) { - int i = 0; + struct iovec *iovec = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; - if (name == NULL) - goto out; + priv = this->private; - while (afr_self_heal_algorithms[i].name) { - if (!strcmp (name, afr_self_heal_algorithms[i].name)) { - return &afr_self_heal_algorithms[i]; - } + ret = syncop_readv (priv->children[source], fd, size, offset, 0, + &iovec, &count, &iobref); + if (ret <= 0) + return ret; - i++; - } + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + /* + * TODO: Use fiemap() and discard() to heal holes + * in the future. 
+ * + * For now, + * + * - if the source had any holes at all, + * AND + * - if we are writing past the original file size + * of the sink + * AND + * - is NOT the last block of the source file. if + * the block contains EOF, it has to be written + * in order to set the file size even if the + * last block is 0-filled. + * AND + * - if the read buffer is filled with only 0's + * + * then, skip writing to this source. We don't depend + * on the write to happen to update the size as we + * have performed an ftruncate() upfront anyways. + */ +#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b))) + if (HAS_HOLES ((&replies[source].poststat)) && + offset > replies[i].poststat.ia_size && + !is_last_block (offset, size, + replies[source].poststat.ia_size) && + (iov_0filled (iovec, count) == 0)) + continue; + + ret = syncop_writev (priv->children[i], fd, iovec, count, + offset, iobref, 0); + if (ret != iov_length (iovec, count)) { + /* write() failed on this sink. unset the corresponding + member in sinks[] (which is healed_sinks[] in the + caller) so that this server does NOT get considered + as successfully healed. 
+ */ + healed_sinks[i] = 0; + } + } + if (iobref) + iobref_unref (iobref); -out: - return NULL; + return ret; } static int -sh_zero_byte_files_exist (afr_local_t *local, int child_count) -{ - int i = 0; - int ret = 0; - afr_self_heal_t *sh = NULL; - - sh = &local->self_heal; - for (i = 0; i < child_count; i++) { - if (!local->child_up[i] || sh->child_errno[i]) - continue; - if (sh->buf[i].ia_size == 0) { - ret = 1; - break; - } - } - - return ret; -} - - -struct afr_sh_algorithm * -afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this) -{ - afr_private_t * priv = NULL; - struct afr_sh_algorithm * algo = NULL; - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - algo = sh_algo_from_name (this, priv->data_self_heal_algorithm); - - if (algo == NULL) { - /* option not set, so fall back on heuristics */ - - if (sh_zero_byte_files_exist (local, priv->child_count) - || (sh->file_size <= (priv->data_self_heal_window_size * - this->ctx->page_size))) { - - /* - * If the file does not exist on one of the subvolumes, - * or a zero-byte file exists (created by entry self-heal) - * the entire content has to be copied anyway, so there - * is no benefit from using the "diff" algorithm. - * - * If the file size is about the same as page size, - * the entire file can be read and written with a few - * (pipelined) STACK_WINDs, which will be faster - * than "diff" which has to read checksums and then - * read and write. 
- */ - - algo = sh_algo_from_name (this, "full"); - - } else { - algo = sh_algo_from_name (this, "diff"); - } - } - - return algo; -} - - -int -afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - struct afr_sh_algorithm *sh_algo = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->algo_completion_cbk = afr_sh_data_fsync; - sh->algo_abort_cbk = afr_sh_data_fail; - - sh_algo = afr_sh_data_pick_algo (frame, this); +afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, off_t offset, + size_t size, int type, struct afr_reply *replies) +{ + int ret = -1; + int sink_count = 0; + afr_private_t *priv = NULL; + unsigned char *data_lock = NULL; + + priv = this->private; + sink_count = AFR_COUNT (healed_sinks, priv->child_count); + data_lock = alloca0 (priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, + offset, size, data_lock); + { + if (ret < sink_count) { + ret = -ENOTCONN; + goto unlock; + } - sh->algo = sh_algo; - sh_algo->fn (frame, this); + if (type == AFR_SELFHEAL_DATA_DIFF && + __afr_selfheal_data_checksums_match (frame, this, fd, source, + healed_sinks, offset, size)) { + ret = 0; + goto unlock; + } - return 0; + ret = __afr_selfheal_data_read_write (frame, this, fd, source, + healed_sinks, offset, size, + replies); + } +unlock: + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, + offset, size, data_lock); + return ret; } -int -afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - int call_count = 0; - int child_index = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if 
(op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "ftruncate of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "ftruncate of %s on subvolume %s completed", - local->loc.path, - priv->children[child_index]->name); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) - afr_sh_data_fail (frame, this); - else - afr_sh_data_sync_prepare (frame, this); - } - - return 0; -} -int -afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - afr_self_heal_t *sh = NULL; - int *sources = NULL; - int call_count = 0; - int i = 0; - - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sources = sh->sources; - call_count = sh->active_sinks; - - local->call_count = call_count; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ftruncate, - sh->healing_fd, sh->file_size, - NULL); + AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL); - if (!--call_count) - break; - } - - return 0; + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret != 0) + /* fsync() failed. Do NOT consider this server + as successfully healed. Mark it so. 
+ */ + healed_sinks[i] = 0; + return 0; } -int -afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this) -{ - afr_private_t *priv = NULL; - int ret = 0; - int i = 0; - - priv = this->private; - sh->source = afr_sh_select_source (sh->sources, priv->child_count); - if (sh->source < 0) { - ret = -1; - goto out; - } - - /* detect changes not visible through pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == sh->source || sh->child_errno[i]) - continue; - - if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source])) - sh->sources[i] = 0; - } - - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); -out: - return ret; -} -char* -afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) -{ - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *sizes_str = NULL; - size_t off = 0; - char *fmt_str = "%llu bytes on %s, "; - char *child_down = " %s,"; - char *child_unknown = " %s,"; - int down_child_present = 0; - int down_count = 0; - int unknown_count = 0; - int unknown_child_present = 0; - char *down_subvol_1 = " down subvolume is "; - char *unknown_subvol_1 = " unknown subvolume is "; - char *down_subvol_2 = " down subvolumes are "; - char *unknown_subvol_2 = " unknown subvolumes are "; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 1) { - len += snprintf (num, sizeof (num), fmt_str, - (unsigned long long) bufs[i].ia_size, - priv->children[i]->name); - } else if (local->child_up[i] == 0) { - len += snprintf (num, sizeof (num), child_down, - priv->children[i]->name); - if (!down_child_present) - down_child_present = 1; - down_count ++; - } else if (local->child_up[i] == -1) { - len += snprintf (num, sizeof (num), child_unknown, - priv->children[i]->name); 
- if (!unknown_child_present) - unknown_child_present = 1; - unknown_count++; - } - - } - - if (down_child_present) { - if (down_count > 1) - len += snprintf (num, sizeof (num), "%s", - down_subvol_2); - else - len += snprintf (num, sizeof (num), "%s", - down_subvol_1); - } - if (unknown_child_present) { - if (unknown_count > 1) - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_2); - else - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_1); - } - - len++;//for '\0' - - sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); - - if (!sizes_str) - return NULL; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 1) { - off += snprintf (sizes_str + off, len - off, fmt_str, - (unsigned long long) bufs[i].ia_size, - priv->children[i]->name); - } - } - - if (down_child_present) { - if (down_count > 1) { - off += snprintf (sizes_str + off, len - off, "%s", - down_subvol_2); - } else { - off += snprintf (sizes_str + off, len - off, "%s", - down_subvol_1); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 0) { - off += snprintf (sizes_str + off, len - off, child_down, - priv->children[i]->name); - } - } - - if (unknown_child_present) { - if (unknown_count > 1) { - off += snprintf (sizes_str + off, len - off, "%s", - unknown_subvol_2); - } else { - off += snprintf (sizes_str + off, len - off, "%s", - unknown_subvol_1); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == -1) { - off += snprintf (sizes_str + off, len - off, - child_unknown, - priv->children[i]->name); - - } - } - - return sizes_str; -} - -char* -afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +static int +afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *sinks_str = NULL; 
- char *temp_str = " to sinks "; - char *str_format = " %s,"; - char off = 0; - - priv = this->private; - - len += snprintf (num, sizeof (num), "%s", temp_str); - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { - len += snprintf (num, sizeof (num), str_format, - priv->children[i]->name); - } - } + loc_t loc = {0, }; - len ++; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc, + &replies[source].poststat, + (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL); - if (!sinks_str) - return NULL; - - off += snprintf (sinks_str + off, len - off, "%s", temp_str); - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { - off += snprintf (sinks_str + off, len - off, - str_format, - priv->children[i]->name); - } - } - - return sinks_str; + loc_wipe (&loc); + return 0; } +static int +afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + struct afr_reply *replies) +{ + afr_private_t *priv = NULL; + int i = 0; + off_t off = 0; + size_t block = 128 * 1024; + int type = AFR_SELFHEAL_DATA_FULL; + int ret = -1; + call_frame_t *iter_frame = NULL; + char *sinks_str = NULL; + char *p = NULL; + + priv = this->private; + + sinks_str = alloca0 (priv->child_count * 8); + p = sinks_str; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + p += sprintf (p, "%d ", i); + } -void -afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) -{ - char *pending_matrix_str = NULL; - char *sizes_str = NULL; - char *sinks_str = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - - pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, - this); - if (!pending_matrix_str) - pending_matrix_str = ""; - - sizes_str = 
afr_get_sizes_str (local, sh->buf, this); - if (!sizes_str) - sizes_str = ""; + gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. " + "source=%d sinks=%s", + uuid_utoa (fd->inode->gfid), source, sinks_str); - sinks_str = afr_get_sinks_str (this, local, sh); - if (!sinks_str) - sinks_str = ""; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] && i != source) + continue; + if (replies[i].poststat.ia_size) { + type = AFR_SELFHEAL_DATA_DIFF; + break; + } + } - gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " - "%s data %s", priv->children[sh->source]->name, sinks_str, - sizes_str, pending_matrix_str); + iter_frame = afr_copy_frame (frame); + if (!iter_frame) + return -ENOMEM; - if (pending_matrix_str && strcmp (pending_matrix_str, "")) - GF_FREE (pending_matrix_str); + for (off = 0; off < replies[source].poststat.ia_size; off += block) { + ret = afr_selfheal_data_block (iter_frame, this, fd, source, + healed_sinks, off, block, type, + replies); + if (ret < 0) + goto out; - if (sizes_str && strcmp (sizes_str, "")) - GF_FREE (sizes_str); -} + AFR_STACK_RESET (iter_frame); + } -void -afr_sh_data_fix (call_frame_t *frame, xlator_t *this) -{ - int source = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - sh->block_size = this->ctx->page_size; - sh->file_size = sh->buf[source].ia_size; - - if (FILE_HAS_HOLES (&sh->buf[source])) - sh->file_has_holes = 1; - - if (sh->background && sh->unwind && !sh->unwound) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); - sh->unwound = _gf_true; - } - - afr_sh_mark_source_sinks (frame, this); - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_INFO, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_data_finish (frame, this); - return; - } - 
- gf_log (this->name, GF_LOG_DEBUG, - "self-healing file %s from subvolume %s to %d other", - local->loc.path, priv->children[sh->source]->name, - sh->active_sinks); - - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_sh_data_trim_sinks (frame, this); -} + afr_selfheal_data_restore_time (frame, this, fd->inode, source, + healed_sinks, replies); -int -afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int ret = 0; - int *old_sources = NULL; - int tstamp_source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", - lkowner_utoa (&frame->root->lk_owner)); - if (sh->sync_done) { - //store sources before sync so that mtime can be set using the - //iatt buf from one of them. - old_sources = alloca (priv->child_count*sizeof (*old_sources)); - memcpy (old_sources, sh->sources, - priv->child_count * sizeof (*old_sources)); - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, - sh->sources, sh->success_children, - AFR_DATA_TRANSACTION, NULL, _gf_true); - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_DEBUG, - "Picking favorite child %s as authentic source to " - "resolve conflicting data of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - afr_sh_print_split_brain_log (sh->pending_matrix, this, - local->loc.path); - afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); - - afr_sh_data_fail (frame, this); - return 0; - } - - afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); - - ret = 
afr_sh_inode_set_read_ctx (sh, this); - if (ret) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_data_fail (frame, this); - return 0; - } - - if (sh->sync_done) { - /* Perform setattr from one of the old_sources if possible - * Because only they have the correct mtime, the new sources - * (i.e. old sinks) have mtime from last writev in sync. - */ - tstamp_source = sh->source; - for (i = 0; i < priv->child_count; i++) { - if (old_sources[i] && sh->sources[i]) - tstamp_source = i; - } - afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); - } else { - afr_set_data_sh_info_str (local, sh, this); - if (nsources == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_data_finish (frame, this); - return 0; - } - - if (sh->do_data_self_heal && - afr_data_self_heal_enabled (priv->data_self_heal)) - afr_sh_data_fix (frame, this); - else - afr_sh_data_finish (frame, this); - } - return 0; -} + ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks); -int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, - dict_t **xattr, - afr_transaction_type txn_type, - uuid_t gfid) -{ - afr_private_t *priv = NULL; - int read_child = -1; - int32_t **pending_matrix = NULL; - int32_t *sources = NULL; - int32_t *success_children = NULL; - struct iatt *bufs = NULL; - int32_t nsources = 0; - int32_t prev_read_child = -1; - int32_t config_read_child = -1; - int32_t subvol_status = 0; - - priv = this->private; - bufs = local->cont.lookup.bufs; - success_children = local->cont.lookup.success_children; - - pending_matrix = local->cont.lookup.pending_matrix; - sources = local->cont.lookup.sources; - memset (sources, 0, sizeof (*sources) * priv->child_count); - - nsources = afr_build_sources (this, xattr, bufs, pending_matrix, - sources, success_children, txn_type, - &subvol_status, _gf_false); - if (subvol_status & SPLIT_BRAIN) { - gf_log (this->name, GF_LOG_DEBUG, "%s: 
Possible split-brain", - local->loc.path); - switch (txn_type) { - case AFR_DATA_TRANSACTION: - local->cont.lookup.possible_spb = _gf_true; - nsources = 1; - sources[success_children[0]] = 1; - break; - case AFR_ENTRY_TRANSACTION: - read_child = afr_get_no_xattr_dir_read_child (this, - success_children, - bufs); - sources[read_child] = 1; - nsources = 1; - break; - default: - break; - } - } - if (nsources < 0) - goto out; - - prev_read_child = local->read_child_index; - config_read_child = priv->read_child; - read_child = afr_select_read_child_from_policy (success_children, - priv->child_count, - prev_read_child, - config_read_child, - sources, - priv->hash_mode, gfid); out: - gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", - read_child); - return read_child; + if (iter_frame) + AFR_STACK_DESTROY (iter_frame); + return ret; } -int -afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) + +static int +__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this, + fd_t *fd, unsigned char *healed_sinks, + struct afr_reply *replies, uint64_t size) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fstat of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->buf[child_index] = *buf; - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } else { - gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed " - "on %s, reason %s", local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { 
- /* Previous versions of glusterfs might have set - * the pending data xattrs which need to be erased - */ - if (!afr_sh_data_proceed (sh->success_count)) { - gf_log (this->name, GF_LOG_ERROR, "inspecting metadata " - "succeeded on < %d children, aborting " - "self-heal for %s", AFR_SH_MIN_PARTICIPANTS, - local->loc.path); - afr_sh_data_fail (frame, this); - goto out; - } - afr_sh_data_fxattrop_fstat_done (frame, this); - } -out: - return 0; -} + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *larger_sinks = 0; + int i = 0; + local = frame->local; + priv = this->private; -int -afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - int child = 0; - int32_t *fstat_children = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - fstat_children = memdup (sh->success_children, - sizeof (*fstat_children) * priv->child_count); - if (!fstat_children) { - afr_sh_data_fail (frame, this); - goto out; - } - call_count = sh->success_count; - local->call_count = call_count; - - memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); - afr_reset_children (sh->success_children, priv->child_count); - sh->success_count = 0; - for (i = 0; i < priv->child_count; i++) { - child = fstat_children[i]; - if (child == -1) - break; - STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk, - (void *) (long) child, - priv->children[child], - priv->children[child]->fops->fstat, - sh->healing_fd, NULL); - --call_count; - } - GF_ASSERT (!call_count); -out: - GF_FREE (fstat_children); - return 0; -} + larger_sinks = alloca0 (priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && replies[i].poststat.ia_size > size) + larger_sinks[i] = 1; + } -void -afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t 
*xattr) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int child_index = (long) cookie; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - gf_log (this->name, GF_LOG_TRACE, - "fxattrop of %s on %s succeeded", - local->loc.path, - priv->children[child_index]->name); - - sh->xattr[child_index] = dict_ref (xattr); - sh->success_children[sh->success_count] = child_index; - sh->success_count++; - } else { - gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s " - "failed on %s, reason %s", local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - sh->child_errno[child_index] = op_errno; - } - } - UNLOCK (&frame->lock); -} + AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL); -int -afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) -{ - int call_count = -1; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret, - op_errno, xattr); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - if (!afr_sh_data_proceed (sh->success_count)) { - gf_log (this->name, GF_LOG_ERROR, "%s, inspecting " - "change log succeeded on < %d children", - local->loc.path, AFR_SH_MIN_PARTICIPANTS); - afr_sh_data_fail (frame, this); - goto out; - } - afr_sh_data_fstat (frame, this); - } -out: - return 0; + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret == -1) + /* truncate() failed. Do NOT consider this server + as successfully healed. Mark it so. 
+ */ + healed_sinks[i] = 0; + return 0; } - -int -afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t **xattr_req; - int32_t *zero_pending = NULL; - int call_count = 0; - int i = 0; - int ret = 0; - int j; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - call_count = afr_up_children_count (local->child_up, - priv->child_count); - - local->call_count = call_count; - - xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *), - gf_afr_mt_dict_t); - if (!xattr_req) - goto out; +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can only happen if data was directly modified in the backend. + */ +static int +__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int healed_sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count); + + if (locked_count == healed_sinks_count || !sources_count) { + /* split brain */ + return -EIO; + } for (i = 0; i < priv->child_count; i++) { - xattr_req[i] = dict_new(); - if (!xattr_req[i]) { - ret = -1; - goto out; + if (!sources[i]) + continue; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + source = i; } } for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - zero_pending = GF_CALLOC (3, sizeof (*zero_pending), - gf_afr_mt_int32_t); - if (!zero_pending) { - ret = -1; - goto out; - } - ret = dict_set_dynptr 
(xattr_req[i], priv->pending_key[j], - zero_pending, - 3 * sizeof (*zero_pending)); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "Unable to set dict value"); - goto out; - } else { - zero_pending = NULL; - } + if (!sources[i]) + continue; + if (replies[i].poststat.ia_size < size) { + sources[i] = 0; + sinks[i] = 1; } } - afr_reset_xattr (sh->xattr, priv->child_count); - afr_reset_children (sh->success_children, priv->child_count); - memset (sh->child_errno, 0, - sizeof (*sh->child_errno) * priv->child_count); - sh->success_count = 0; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - sh->healing_fd, GF_XATTROP_ADD_ARRAY, - xattr_req[i], NULL); - - if (!--call_count) - break; - } - } - -out: - if (xattr_req) { - for (i = 0; i < priv->child_count; i++) - if (xattr_req[i]) - dict_unref(xattr_req[i]); - GF_FREE(xattr_req); - } - - if (ret) { - GF_FREE (zero_pending); - afr_sh_data_fail (frame, this); - } - - return 0; + return source; } -int -afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) +/* + * __afr_selfheal_data_prepare: + * + * This function inspects the on-disk xattrs and determines which subvols + * are sources and sinks. + * + * The return value is the index of the subvolume to be used as the source + * for self-healing, or -1 if no healing is necessary/split brain. 
+ */ +static int +__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; - local = frame->local; - sh = &local->self_heal; + priv = this->private; - sh->data_lock_held = _gf_true; - afr_sh_data_fxattrop (frame, this); - return 0; -} + ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, + replies); + if (ret) + return ret; -int -afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_DATA_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; - local = frame->local; - sh = &local->self_heal; + source = __afr_selfheal_data_finalize_source (this, sources, sinks, + healed_sinks, locked_on, + replies); + if (source < 0) + return -EIO; - sh->sh_dom_lock_held = _gf_true; - afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, - afr_sh_data_big_lock_success, - afr_sh_data_fail); - return 0; -} + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). -int -afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. 
+ */ + healed_sinks[i] = sinks[i] && locked_on[i]; - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; + return source; +} - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks " - "failed for %s. by %s", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - sh->data_lock_failure_handler (frame, this); - } else { +static int +__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + gf_boolean_t compat = _gf_false; + unsigned char *compat_lock = NULL; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + compat_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } - gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks " - "done for %s by %s. 
Proceding to self-heal", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); + ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock, + sources, sinks, healed_sinks, + locked_replies); + if (ret < 0) + goto unlock; + + source = ret; + + ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks, + locked_replies, + locked_replies[source].poststat.ia_size); + if (ret < 0) + goto unlock; + + ret = 0; + + /* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for + compatibility with older self-heal clients which do not + hold a lock in the @priv->sh_domain domain to guard + against concurrent ongoing self-heals + */ + afr_selfheal_inodelk (frame, this, fd->inode, this->name, + LLONG_MAX - 2, 1, compat_lock); + compat = _gf_true; + } +unlock: + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); + if (ret < 0) + goto out; - sh->data_lock_success_handler (frame, this); - } + ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks, + locked_replies); + if (ret) + goto out; - return 0; + ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, + healed_sinks, AFR_DATA_TRANSACTION, + locked_replies, data_lock); +out: + if (compat) + afr_selfheal_uninodelk (frame, this, fd->inode, this->name, + LLONG_MAX - 2, 1, compat_lock); + return ret; } -int -afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "failed for %s. 
by %s", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - - if (!sh->data_lock_block) { - sh->data_lock_failure_handler(frame, this); - } else { - int_lock->lock_cbk = - afr_sh_data_post_blocking_inodelk_cbk; - afr_blocking_lock (frame, this); - } - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks " - "done for %s by %s. Proceeding to self-heal", - local->loc.path, lkowner_utoa (&frame->root->lk_owner)); - sh->data_lock_success_handler (frame, this); - } - - return 0; -} -int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, - off_t start, off_t len) +static fd_t * +afr_selfheal_data_open (xlator_t *this, inode_t *inode) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK; + loc_t loc = {0,}; + int ret = 0; + fd_t *fd = NULL; - afr_set_lock_number (frame, this); + fd = fd_create (inode, 0); + if (!fd) + return NULL; - int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); - int_lock->domain = dom; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - inodelk->flock.l_start = start; - inodelk->flock.l_len = len; - inodelk->flock.l_type = F_WRLCK; - - afr_nonblocking_inodelk (frame, this); - - return 0; -} - -int -afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - GF_ASSERT (sh->old_loop_frame); - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - sh->data_lock_held = _gf_true; - sh->sync_done = _gf_true; - afr_sh_data_fxattrop (frame, this); - return 0; -} - -int -afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = 
NULL; - afr_self_heal_t *sh = NULL; + ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); + } - local = frame->local; - sh = &local->self_heal; + loc_wipe (&loc); - GF_ASSERT (sh->old_loop_frame); - sh_loop_finish (sh->old_loop_frame, this); - sh->old_loop_frame = NULL; - afr_sh_set_timestamps (frame, this); - return 0; + return fd; } int -afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, - char *dom, afr_lock_cbk_t success_handler, - afr_lock_cbk_t failure_handler) +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t * local = NULL; - afr_self_heal_t * sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->data_lock_success_handler = success_handler; - sh->data_lock_failure_handler = failure_handler; - sh->data_lock_block = block; - return afr_sh_data_lock_rec (frame, this, dom, start, len); -} + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + fd_t *fd = NULL; -int -afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. 
- In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "open of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_TRACE, - "open of %s succeeded on child %s", - local->loc.path, - priv->children[child_index]->name); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_data_fail (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, - afr_sh_dom_lock_success, afr_sh_data_fail); - } - - return 0; -} + priv = this->private; + fd = afr_selfheal_data_open (this, inode); + if (!fd) + return -EIO; -int -afr_sh_data_open (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - int call_count = 0; - fd_t *fd = NULL; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if(!local->child_up[i]) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - O_RDWR|O_LARGEFILE, fd, NULL); - - if (!--call_count) - break; - } - - return 0; -} + locked_on = alloca0 (priv->child_count); -void -afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_private_t 
*priv = NULL; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - int i = 0; - - if (op_ret < 0) { - afr_sh_data_fail (frame, this); - return; - } - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - for (i = 0; i < priv->child_count ; i++) { - if (1 == local->child_up[i]) - sh->success[i] = 1; - } - - afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION, - afr_sh_data_erase_pending_cbk, - afr_sh_data_finish); -} + ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } -int -afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - sh->data_lock_held = _gf_true; - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_non_reg_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - return 0; -} + ret = __afr_selfheal_data (frame, this, fd, locked_on); + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); -gf_boolean_t -afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) -{ - if (sh->force_confirm_spb) - return _gf_true; - if (sh->do_data_self_heal && - afr_data_self_heal_enabled (priv->data_self_heal)) - return _gf_true; - return _gf_false; -} + if (fd) + fd_unref (fd); -int -afr_self_heal_data (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; - int ret = -1; - - local = frame->local; - sh = &local->self_heal; - - sh->sh_type_in_action = AFR_SELF_HEAL_DATA; - - if (afr_can_start_data_self_heal (sh, priv)) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - ret = 
afr_inodelk_init (&local->internal_lock.inodelk[1], - priv->sh_domain, priv->child_count); - if (ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_data_done (frame, this); - return 0; - } - - if (IA_ISREG (sh->type)) { - afr_sh_data_open (frame, this); - } else { - afr_sh_data_lock (frame, this, 0, 0, _gf_true, - this->name, - afr_sh_non_reg_lock_success, - afr_sh_data_fail); - } - } else { - gf_log (this->name, GF_LOG_TRACE, - "not doing data self heal on %s", - local->loc.path); - afr_sh_data_done (frame, this); - } - - return 0; + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 00f1a9cb9..9605d69f4 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. + Copyright (c) 2013 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,2399 +8,624 @@ cases as published by the Free Software Foundation. 
*/ -#include -#include -#include -#include -#include -#include #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" -#include "inode.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include "afr-self-heal.h" #include "byte-order.h" - #include "afr-transaction.h" -#include "afr-self-heal.h" -#include "afr-self-heal-common.h" - -#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\ - do {\ - _local = _frame->local;\ - _sh = &_local->self_heal;\ - _sh_frame = _sh->sh_frame;\ - _sh_local = _sh_frame->local;\ - _sh_sh = &_sh_local->self_heal;\ - } while (0); - -int -afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, - int child_index); -int -afr_sh_entry_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - sh->completion_cbk (frame, this); - - return 0; -} - - -int -afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_entry_done; - afr_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "finishing entry selfheal of %s", local->loc.path); - - afr_sh_entry_unlock (frame, this); - - return 0; -} - - -int -afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - long i = 0; - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = 
NULL; - afr_local_t *orig_local = NULL; - call_frame_t *orig_frame = NULL; - afr_private_t *priv = NULL; - int32_t read_child = -1; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - - afr_children_add_child (sh->fresh_children, i, priv->child_count); - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to erase pending xattrs on %s (%s)", - local->loc.path, priv->children[i]->name, - strerror (op_errno)); - } - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (sh->source == -1) { - //this happens if the forced merge option is set - read_child = sh->fresh_children[0]; - } else { - read_child = sh->source; - } - afr_inode_set_read_ctx (this, sh->inode, read_child, - sh->fresh_children); - orig_frame = sh->orig_frame; - orig_local = orig_frame->local; - - if (sh->source != -1) { - orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink; - } - - afr_sh_entry_finish (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - if (sh->entries_skipped) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - goto out; - } - afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, - afr_sh_entry_erase_pending_cbk, - afr_sh_entry_finish); - return 0; -out: - afr_sh_entry_finish (frame, this); - return 0; -} - - - -static int -next_active_source (call_frame_t *frame, xlator_t *this, - int current_active_source) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int source = -1; - int next_active_source = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - source = sh->source; - - if (source != -1) { - if (current_active_source != source) - next_active_source = source; - goto out; - } - - /* - the next active sink 
becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_source)) { - - next_active_source = i; - break; - } - } -out: - return next_active_source; -} - static int -next_active_sink (call_frame_t *frame, xlator_t *this, - int current_active_sink) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int next_active_sink = -1; - int i = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - /* - the next active sink becomes the source for the - 'conservative decision' of merging all entries - */ - - for (i = 0; i < priv->child_count; i++) { - if ((sh->sources[i] == 0) - && (local->child_up[i] == 1) - && (i > current_active_sink)) { - - next_active_sink = i; - break; - } - } - - return next_active_sink; -} - -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this); +afr_selfheal_entry_delete (call_frame_t *frame, xlator_t *this, inode_t *dir, + const char *name, inode_t *inode, int child, + struct afr_reply *replies) +{ + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + int ret = 0; + loc_t loc = {0, }; + char g[64]; + + priv = this->private; + + subvol = priv->children[child]; + + loc.parent = inode_ref (dir); + uuid_copy (loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref (inode); + + if (replies[child].valid && replies[child].op_ret == 0) { + switch (replies[child].poststat.ia_type) { + case IA_IFDIR: + gf_log (this->name, GF_LOG_WARNING, + "expunging dir %s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + uuid_utoa_r (replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_rmdir (subvol, &loc, 1); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "expunging file %s/%s (%s) on %s", + uuid_utoa (dir->gfid), name, + 
uuid_utoa_r (replies[child].poststat.ia_gfid, g), + subvol->name); + ret = syncop_unlink (subvol, &loc); + break; + } + } + + loc_wipe (&loc); + + return ret; +} + + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, + int source, inode_t *dir, const char *name, + inode_t *inode, struct afr_reply *replies) +{ + int ret = 0; + loc_t loc = {0,}; + loc_t srcloc = {0,}; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + struct iatt *iatt = NULL; + char *linkname = NULL; + mode_t mode = 0; + struct iatt newent = {0,}; + + priv = this->private; + + xdata = dict_new(); + if (!xdata) + return -ENOMEM; + + loc.parent = inode_ref (dir); + uuid_copy (loc.pargfid, dir->gfid); + loc.name = name; + loc.inode = inode_ref (inode); + + ret = afr_selfheal_entry_delete (frame, this, dir, name, inode, dst, + replies); + if (ret) + goto out; + + ret = dict_set_static_bin (xdata, "gfid-req", + replies[source].poststat.ia_gfid, 16); + if (ret) + goto out; + + iatt = &replies[source].poststat; + + srcloc.inode = inode_ref (inode); + uuid_copy (srcloc.gfid, iatt->ia_gfid); + + mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type); + + switch (iatt->ia_type) { + case IA_IFDIR: + ret = syncop_mkdir (priv->children[dst], &loc, mode, xdata, 0); + break; + case IA_IFLNK: + ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0); + if (ret == 0) { + ret = syncop_link (priv->children[dst], &srcloc, &loc); + } else { + ret = syncop_readlink (priv->children[source], &srcloc, + &linkname, 4096); + if (ret <= 0) + goto out; + ret = syncop_symlink (priv->children[dst], &loc, linkname, + xdata, NULL); + } + break; + default: + ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1); + if (ret) + goto out; + ret = syncop_mknod (priv->children[dst], &loc, mode, + iatt->ia_rdev, xdata, &newent); + if (ret == 0 && iatt->ia_size && !newent.ia_size) { + /* New entry created. 
Mark @dst pending on all sources */ + ret = 1; + } + break; + } -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src); - -int -afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, - int active_src, int32_t op_ret, - int32_t op_errno) -{ - int call_count = 0; - - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_entry_expunge_subvol (frame, this, active_src); - - return 0; -} - -int -afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = (long) cookie; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - - if (op_ret != 0) { - gf_log (this->name, GF_LOG_ERROR, - "setattr on parent directory of %s on subvolume %s failed: %s", - expunge_local->loc.path, - priv->children[active_src]->name, strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int32_t valid = 0; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - - active_src = (long) cookie; - - if 
(op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "removed %s on %s", - expunge_local->loc.path, - priv->children[active_src]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "removing %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } - - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->setattr, - &expunge_sh->parent_loc, - &expunge_sh->parentbuf, - valid, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "expunging file %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->unlink, - &expunge_local->loc, 0, NULL); - - return 0; -} - - - -int -afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "expunging directory %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->rmdir, - &expunge_local->loc, 1, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, - int active_src, struct iatt *buf, - struct iatt *parentbuf) -{ - afr_private_t *priv = NULL; 
- afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int type = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - loc_t *loc = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - local = frame->local; - sh = &local->self_heal; - loc = &expunge_local->loc; - - type = buf->ia_type; - if (loc->parent && uuid_is_null (loc->parent->gfid)) - uuid_copy (loc->pargfid, parentbuf->ia_gfid); - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); - break; - case IA_IFDIR: - afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - expunge_local->loc.path, - priv->children[active_src]->name, type); - goto out; - break; - } - - return 0; -out: - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, -1, EINVAL); - - return 0; -} - - -int -afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - call_frame_t *frame = NULL; - int active_src = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = (long) cookie; - local = frame->local; - sh = &local->self_heal; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "lookup of %s on %s failed (%s)", - expunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - 
} - - afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf, - postparent); - - return 0; out: - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; + if (xdata) + dict_unref (xdata); + loc_wipe (&loc); + loc_wipe (&srcloc); + return ret; } -int -afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", - expunge_local->loc.path, priv->children[active_src]->name); - - STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, - (void *) (long) active_src, - priv->children[active_src], - priv->children[active_src]->fops->lookup, - &expunge_local->loc, NULL); - - return 0; -} - -int -afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, dict_t *x, - struct iatt *postparent) -{ - afr_private_t *priv = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int source = 0; - call_frame_t *frame = NULL; - int active_src = 0; - int need_expunge = 0; - afr_self_heal_t *sh = NULL; - afr_local_t *local = NULL; - - priv = this->private; - expunge_local = expunge_frame->local; - expunge_sh = &expunge_local->self_heal; - frame = expunge_sh->sh_frame; - active_src = expunge_sh->active_source; - source = (long) cookie; - local = frame->local; - sh = &local->self_heal; - - if (op_ret == -1 && op_errno == ENOENT) - need_expunge = 1; - else if (op_ret == -1) - goto out; - - if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) && - !uuid_is_null (buf->ia_gfid) && - (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) { - char uuidbuf1[64]; - char uuidbuf2[64]; - gf_log (this->name, GF_LOG_DEBUG, - "entry %s found on %s with 
mismatching gfid (%s/%s)", - expunge_local->loc.path, - priv->children[source]->name, - uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1), - uuid_utoa_r (buf->ia_gfid, uuidbuf2)); - need_expunge = 1; - } - - if (need_expunge) { - gf_log (this->name, GF_LOG_INFO, - "Entry %s is missing on %s and deleting from " - "replica's other bricks", - expunge_local->loc.path, - priv->children[source]->name); - - if (postparent) - expunge_sh->parentbuf = *postparent; - - afr_sh_entry_expunge_purge (expunge_frame, this, active_src); - - return 0; - } - -out: - if (op_ret == 0) { - gf_log (this->name, GF_LOG_TRACE, - "%s exists under %s", - expunge_local->loc.path, - priv->children[source]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "looking up %s under %s failed (%s)", - expunge_local->loc.path, - priv->children[source]->name, - strerror (op_errno)); - } - - AFR_STACK_DESTROY (expunge_frame); - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - -static gf_boolean_t -can_skip_entry_self_heal (char *name, loc_t *parent_loc) -{ - if (strcmp (name, ".") == 0) { - return _gf_true; - } else if (strcmp (name, "..") == 0) { - return _gf_true; - } else if (loc_is_root (parent_loc) && - (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { - return _gf_true; - } - return _gf_false; -} - -int -afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int ret = -1; - call_frame_t *expunge_frame = NULL; - afr_local_t *expunge_local = NULL; - afr_self_heal_t *expunge_sh = NULL; - int active_src = 0; - int source = 0; - int op_errno = 0; - char *name = NULL; - int op_ret = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - source = sh->source; - sh->expunge_done = afr_sh_entry_expunge_entry_done; - - name = entry->d_name; - if (can_skip_entry_self_heal (name, 
&local->loc)) { - op_ret = 0; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existence of %s under %s", - name, local->loc.path); - - expunge_frame = copy_frame (frame); - if (!expunge_frame) { - op_errno = ENOMEM; - goto out; - } - - AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); - - expunge_frame->local = expunge_local; - expunge_sh = &expunge_local->self_heal; - expunge_sh->sh_frame = frame; - expunge_sh->active_source = active_src; - expunge_sh->entrybuf = entry->d_stat; - loc_copy (&expunge_sh->parent_loc, &local->loc); - - ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc, - name); - if (ret != 0) { - op_errno = EINVAL; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "looking up %s on %s", expunge_local->loc.path, - priv->children[source]->name); - - STACK_WIND_COOKIE (expunge_frame, - afr_sh_entry_expunge_entry_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->lookup, - &expunge_local->loc, NULL); - - ret = 0; -out: - if (ret == -1) - sh->expunge_done (frame, this, active_src, op_ret, op_errno); - - return 0; -} - - -int -afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_expunge_all (frame, this); - 
return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_TRACE, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; - local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_expunge_entry (frame, this, entry); - } - - return 0; -} - -int -afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, - int active_src) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset, NULL); - - return 0; -} - - -int -afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +static int +afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int **changelog = NULL; + int idx = 0; - sh->offset = 0; + priv = this->private; - if (sh->source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s to expunge entries", - local->loc.path); - goto out; - } + idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - active_src = next_active_sink (frame, this, sh->active_source); - sh->active_source = active_src; + uuid_copy (inode->gfid, replies[source].poststat.ia_gfid); - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - goto out; - } + changelog = afr_matrix_create 
(priv->child_count, AFR_NUM_CHANGE_LOGS); - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - goto out; - } + xattr = dict_new(); + if (!xattr) + return -ENOMEM; - gf_log (this->name, GF_LOG_TRACE, - "expunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); + for (i = 0; i < priv->child_count; i++) { + if (!newentry[i]) + continue; + changelog[i][idx] = hton32(1); + } - afr_sh_entry_expunge_subvol (frame, this, active_src); + afr_set_pending_dict (priv, xattr, changelog); - return 0; -out: - afr_sh_entry_impunge_all (frame, this); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + afr_selfheal_post_op (frame, this, inode, i, xattr); + } + dict_unref (xattr); + return ret; } -int -afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - if (op_ret < 0) - sh->entries_skipped = _gf_true; - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_entry_impunge_subvol (frame, this); - - return 0; +static int +__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct afr_reply *replies) +{ + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + unsigned char *newentry = NULL; + + priv = this->private; + newentry = alloca0 (priv->child_count); + + if (!replies[source].valid) + return -EIO; + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + if (replies[source].op_ret == -1 && + replies[source].op_errno == ENOENT) { + ret = afr_selfheal_entry_delete (frame, this, fd->inode, + name, inode, i, replies); + } else { + if (!uuid_compare (replies[i].poststat.ia_gfid, + 
replies[source].poststat.ia_gfid)) + continue; + + ret = afr_selfheal_recreate_entry (frame, this, i, source, + fd->inode, name, inode, + replies); + if (ret > 0) { + newentry[i] = 1; + ret = 0; + } + } + if (ret < 0) + break; + } + + if (AFR_COUNT (newentry, priv->child_count)) + afr_selfheal_newentry_mark (frame, this, inode, source, replies, + sources, newentry); + return ret; } -void -afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, op_ret, op_errno); -} - -int -afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) +static int +__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, unsigned char *sources, + unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies) { - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - int child_index = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - child_index = (long) cookie; - - if (op_ret == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "setattr done for %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - } else { - gf_log (this->name, GF_LOG_INFO, - "setattr (%s) on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } - - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) { - afr_sh_entry_call_impunge_done (impunge_frame, this, - 0, op_errno); - } - - return 0; -} + int 
ret = 0; + afr_private_t *priv = NULL; + int i = 0; + int source = -1; -int -afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, - dict_t *xdata) -{ - int call_count = 0; - afr_local_t *setattr_local = NULL; - - setattr_local = setattr_frame->local; - if (op_ret != 0) { - gf_log (this->name, GF_LOG_INFO, - "setattr on parent directory (%s) failed: %s", - setattr_local->loc.path, strerror (op_errno)); - } - - call_count = afr_frame_return (setattr_frame); - if (call_count == 0) - AFR_STACK_DESTROY (setattr_frame); - return 0; -} + priv = this->private; -int -afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *setattr_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *setattr_frame = NULL; - int32_t valid = 0; - int32_t op_errno = 0; - int child_index = 0; - int call_count = 0; - int i = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - gf_log (this->name, GF_LOG_DEBUG, - "setting ownership of %s on %s to %d/%d", - impunge_local->loc.path, - priv->children[child_index]->name, - impunge_sh->entrybuf.ia_uid, - impunge_sh->entrybuf.ia_gid); - - setattr_frame = copy_frame (impunge_frame); - if (!setattr_frame) { - op_errno = ENOMEM; - goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out); - setattr_local = setattr_frame->local; - call_count = afr_errno_count (NULL, impunge_sh->child_errno, - priv->child_count, 0); - loc_copy (&setattr_local->loc, &impunge_sh->parent_loc); - impunge_local->call_count = call_count; - setattr_local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (impunge_sh->child_errno[i]) - continue; - valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - STACK_WIND_COOKIE (setattr_frame, - 
afr_sh_entry_impunge_parent_setattr_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->setattr, - &setattr_local->loc, - &impunge_sh->parentbuf, valid, NULL); - - valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_setattr_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->setattr, - &impunge_local->loc, - &impunge_sh->entrybuf, valid, NULL); - call_count--; - } - GF_ASSERT (!call_count); - return 0; -out: - if (setattr_frame) - AFR_STACK_DESTROY (setattr_frame); - afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno); - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + source = i; + break; + } + } -int -afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - int child_index = 0; - int call_count = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "%s: failed to perform xattrop on %s (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, strerror (op_errno)); - - LOCK (&impunge_frame->lock); - { - impunge_local->op_ret = -1; - impunge_local->op_errno = op_errno; - } - UNLOCK (&impunge_frame->lock); - } - - call_count = afr_frame_return (impunge_frame); - - if (call_count == 0) { - if (impunge_local->op_ret == 0) { - afr_sh_entry_impunge_setattr (impunge_frame, this); - } else { - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, impunge_local->op_errno); - } - } - return 0; -} + if (source == -1) { + /* entry got deleted in the mean time? 
*/ + return 0; + } -int -afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, - xlator_t *this) -{ - int active_src = 0; - dict_t *xattr = NULL; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int32_t op_errno = 0; - int32_t call_count = 0; - int32_t i = 0; - - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - impunge_local->op_ret = 0; - - afr_prepare_new_entry_pending_matrix (impunge_local->pending, - afr_is_errno_unset, - impunge_sh->child_errno, - &impunge_sh->entrybuf, - priv->child_count); - xattr = dict_new (); - if (!xattr) { - op_errno = ENOMEM; - goto out; - } - - afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, - LOCAL_LAST); - - for (i = 0; i < priv->child_count; i++) { - if ((impunge_sh->child_errno[i] == EEXIST) && - (impunge_local->child_up[i] == 1)) - - call_count++; - } - - impunge_local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - - if ((impunge_sh->child_errno[i] == EEXIST) - && (impunge_local->child_up[i] == 1)) { - - - STACK_WIND_COOKIE (impunge_frame, - afr_sh_entry_impunge_xattrop_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &impunge_local->loc, - GF_XATTROP_ADD_ARRAY, xattr, NULL); - if (!--call_count) - break; - } - } - - if (xattr) - dict_unref (xattr); - return 0; -out: - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, op_errno); - return 0; -} + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; -int -afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *stbuf, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - int call_count = 0; - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - 
afr_self_heal_t *impunge_sh = NULL; - int child_index = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - if (op_ret == -1) { - impunge_sh->child_errno[child_index] = op_errno; - gf_log (this->name, GF_LOG_ERROR, - "creation of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - } else { - impunge_sh->child_errno[child_index] = 0; - } - - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) { - if (!afr_errno_count (NULL, impunge_sh->child_errno, - priv->child_count, 0)) { - // new_file creation failed every where - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, op_errno); - goto out; - } - afr_sh_entry_impunge_perform_xattrop (impunge_frame, this); - } -out: - return 0; -} + if (replies[i].op_errno != ENOENT) + continue; -int -afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - int call_count = 0; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { - //For symlinks impunge is attempted un-conditionally - //So the file can already exist. 
- if ((op_ret < 0) && (op_errno == EEXIST)) - op_ret = 0; - } - - call_count = afr_frame_return (impunge_frame); - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - - return 0; -} + ret = afr_selfheal_recreate_entry (frame, this, i, source, + fd->inode, name, inode, + replies); + } -int -afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - loc_t *loc = NULL; - struct iatt *buf = NULL; - loc_t oldloc = {0}; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - loc = &impunge_local->loc; - buf = &impunge_sh->entrybuf; - - oldloc.inode = inode_ref (loc->inode); - uuid_copy (oldloc.gfid, buf->ia_gfid); - gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s", - loc->path, priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->link, - &oldloc, loc, NULL); - loc_wipe (&oldloc); - - return 0; + return ret; } -int -afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - if (op_ret < 0) { - afr_sh_entry_impunge_create_file (impunge_frame, this, - (long)cookie); - } else { - afr_sh_entry_impunge_hardlink (impunge_frame, this, - (long)cookie); - } - return 0; -} -int -afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame, - xlator_t *this, - int child_index, struct iatt *stbuf) +static int +__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + char *name, inode_t *inode, int source, + unsigned char *sources, unsigned char *healed_sinks, + unsigned char *locked_on, struct 
afr_reply *replies) { - afr_private_t *priv = NULL; - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - loc_t *loc = NULL; - dict_t *xattr_req = NULL; - loc_t oldloc = {0}; - int ret = -1; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - loc = &impunge_local->loc; - - xattr_req = dict_new (); - if (!xattr_req) - goto out; - oldloc.inode = inode_ref (loc->inode); - uuid_copy (oldloc.gfid, stbuf->ia_gfid); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->lookup, - &oldloc, xattr_req); - ret = 0; -out: - if (xattr_req) - dict_unref (xattr_req); - loc_wipe (&oldloc); - if (ret) - sh->impunge_done (frame, this, -1, ENOMEM); - return 0; -} + int ret = -1; -int -afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing file %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - dict = dict_new (); - if (!dict) - gf_log (this->name, GF_LOG_ERROR, "Out of memory"); - - GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - impunge_local->loc.path); - - /* - * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : - * - * Problem: - * While a brick is down in a replica pair, lets say the user creates - * one file(file-A) and a hard link to that file(h-file-A). After the - * brick comes back up, entry self-heal is attempted on parent dir of - * these two files. 
As part of readdir in self-heal it reads both the - * entries file-A and h-file-A for both of them it does name less lookup - * to check if there are any hardlinks already present in the - * destination brick. It finds that there are no hard links already - * present for files file-A, h-file-A. Self-heal does mknods for both - * file-A and h-file-A. This leads to file-A and h-file-A not being - * hardlinks anymore. - * - * Fix: (More like shrinking of race-window, the race itself is still - * present in posix-mknod). - * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then - * posix_mknod checks if there are already any gfid-links and does - * link() instead of mknod. There still can be a race where two - * posix_mknods same gfid see that - * gfid-link file is not present and proceeds with mknods and result in - * two different files with same gfid. - */ - ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", - impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mknod, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - makedev (ia_major (stbuf->ia_rdev), - ia_minor (stbuf->ia_rdev)), 0, dict); - - if (dict) - dict_unref (dict); - - return 0; + if (source < 0) + ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode, + sources, healed_sinks, + locked_on, replies); + else + ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode, + source, sources, healed_sinks, + locked_on, replies); + return ret; } - -int -afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - - int ret = 0; - - priv = this->private; - impunge_local 
= impunge_frame->local; - - dict = dict_new (); - if (!dict) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory"); - return 0; - } - - GF_ASSERT (!uuid_is_null (stbuf->ia_gfid)); - ret = afr_set_dict_gfid (dict, stbuf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", - impunge_local->loc.path); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing directory %s on %s", - impunge_local->loc.path, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->mkdir, - &impunge_local->loc, - st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type), - 0, dict); - - if (dict) - dict_unref (dict); - - return 0; +static int +afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *sources, + unsigned char *healed_sinks, char *name) +{ + afr_private_t *priv = NULL; + int ret = 0; + unsigned char *locked_on = NULL; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + + replies = alloca0 (priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, + name, locked_on); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name, + replies, locked_on); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode, + source, sources, healed_sinks, + locked_on, replies); + } +unlock: + afr_selfheal_unentrylk (frame, this, fd->inode, this->name, name, + locked_on); + if (inode) + inode_unref (inode); + return ret; } -int -afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, const char *linkname) +static int +afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, 
fd_t *fd, + int child, int source, unsigned char *sources, + unsigned char *healed_sinks) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - dict_t *dict = NULL; - struct iatt *buf = NULL; - int ret = 0; - - priv = this->private; - impunge_local = impunge_frame->local; - - buf = &impunge_local->cont.dir_fop.buf; - - dict = dict_new (); - if (!dict) { - afr_sh_entry_call_impunge_done (impunge_frame, this, - -1, ENOMEM); - goto out; - } - - GF_ASSERT (!uuid_is_null (buf->ia_gfid)); - ret = afr_set_dict_gfid (dict, buf->ia_gfid); - if (ret) - gf_log (this->name, GF_LOG_INFO, - "%s: dict set gfid failed", - impunge_local->loc.path); - - gf_log (this->name, GF_LOG_DEBUG, - "creating missing symlink %s -> %s on %s", - impunge_local->loc.path, linkname, - priv->children[child_index]->name); - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->symlink, - linkname, &impunge_local->loc, 0, dict); - - if (dict) - dict_unref (dict); -out: - return 0; -} + int ret = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + off_t offset = 0; + call_frame_t *iter_frame = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + priv = this->private; + subvol = priv->children[child]; -int -afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "unlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[child_index]->name, - 
strerror (op_errno)); - goto out; - } - - afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, - impunge_sh->linkname); - - return 0; -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + INIT_LIST_HEAD (&entries.list); - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); + iter_frame = afr_copy_frame (frame); + if (!iter_frame) + return -ENOMEM; - return 0; -} + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry (entry, &entries.list, list) { + offset = entry->d_off; + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; -int -afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) + continue; - priv = this->private; - impunge_local = impunge_frame->local; + ret = afr_selfheal_entry_dirent (iter_frame, this, fd, + source, sources, + healed_sinks, + entry->d_name); + AFR_STACK_RESET (iter_frame); - gf_log (this->name, GF_LOG_DEBUG, - "unlinking symlink %s with wrong target on %s", - impunge_local->loc.path, - priv->children[child_index]->name); + if (ret) + break; + } - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->unlink, - &impunge_local->loc, 0, NULL); + gf_dirent_free (&entries); + if (ret) + break; + } - return 0; + AFR_STACK_DESTROY (iter_frame); + return ret; } - -int -afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf, dict_t *xdata) +static int +afr_selfheal_entry_do (call_frame_t 
*frame, xlator_t *this, fd_t *fd, + int source, unsigned char *sources, + unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if ((op_ret == -1) && (!afr_inode_missing(op_errno))) { - gf_log (this->name, GF_LOG_INFO, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - /* symlink doesn't exist on the sink */ - - if ((op_ret == -1) && (afr_inode_missing(op_errno))) { - afr_sh_entry_impunge_symlink (impunge_frame, this, - child_index, impunge_sh->linkname); - return 0; - } - - - /* symlink exists on the sink, so check if targets match */ - - if (strcmp (linkname, impunge_sh->linkname) == 0) { - /* targets match, nothing to do */ - - goto out; - } else { - /* - * Hah! Sneaky wolf in sheep's clothing! 
- */ - afr_sh_entry_impunge_symlink_unlink (impunge_frame, this, - child_index); - return 0; - } + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); + priv = this->private; - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); + gf_log (this->name, GF_LOG_INFO, "performing entry selfheal on %s", + uuid_utoa (fd->inode->gfid)); - return 0; + for (i = 0; i < priv->child_count; i++) { + if (i != source && !healed_sinks[i]) + continue; + ret = afr_selfheal_entry_do_subvol (frame, this, fd, i, source, + sources, healed_sinks); + if (ret) + break; + } + return ret; } -int -afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this, - int child_index) +static int +__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; - priv = this->private; - impunge_local = impunge_frame->local; + priv = this->private; - gf_log (this->name, GF_LOG_DEBUG, - "checking symlink target of %s on %s", - impunge_local->loc.path, priv->children[child_index]->name); + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->readlink, - &impunge_local->loc, 4096, NULL); + if (locked_count == sinks_count || !sources_count) { + return -1; + } - return 0; -} - - -int -afr_sh_entry_impunge_readlink_cbk (call_frame_t 
*impunge_frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - const char *linkname, struct iatt *sbuf, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int child_index = -1; - int call_count = -1; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - - child_index = (long) cookie; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "readlink of %s on %s failed (%s)", - impunge_local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - goto out; - } - - impunge_sh->linkname = gf_strdup (linkname); - afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index); - - return 0; + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } -out: - LOCK (&impunge_frame->lock); - { - call_count = --impunge_local->call_count; - } - UNLOCK (&impunge_frame->lock); - - if (call_count == 0) - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - - return 0; + return source; } -int -afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, - int child_index, struct iatt *stbuf) +static int +__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies, int *source_p) { - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - int active_src = -1; - - priv = this->private; - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - active_src = impunge_sh->active_source; - impunge_local->cont.dir_fop.buf = *stbuf; - - STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, - (void *) (long) child_index, - 
priv->children[active_src], - priv->children[active_src]->fops->readlink, - &impunge_local->loc, 4096, NULL); - - return 0; -} + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; -int -afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t type = IA_INVAL; - int active_src = 0; - struct iatt *buf = NULL; - - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf, - &impunge_sh->parentbuf); - - buf = &impunge_sh->entrybuf; - type = buf->ia_type; - - switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - case IA_IFLNK: - afr_sh_entry_impunge_check_hardlink (impunge_frame, this, - child_index, buf); - break; - case IA_IFDIR: - afr_sh_entry_impunge_mkdir (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - sh->impunge_done (frame, this, -1, EINVAL); - break; - } - - return 0; -} + priv = this->private; -int -afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this, - int child_index) -{ - call_frame_t *frame = NULL; - afr_local_t *impunge_local = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *impunge_sh = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - ia_type_t type = IA_INVAL; - int active_src = 0; - struct iatt *buf = NULL; - - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - buf = &impunge_sh->entrybuf; - type = buf->ia_type; - 
- switch (type) { - case IA_IFSOCK: - case IA_IFREG: - case IA_IFBLK: - case IA_IFCHR: - case IA_IFIFO: - afr_sh_entry_impunge_mknod (impunge_frame, this, - child_index, buf); - break; - case IA_IFLNK: - afr_sh_entry_impunge_readlink (impunge_frame, this, - child_index, buf); - break; - default: - gf_log (this->name, GF_LOG_ERROR, - "%s has unknown file type on %s: 0%o", - impunge_local->loc.path, - priv->children[active_src]->name, type); - sh->impunge_done (frame, this, -1, EINVAL); - break; - } - - return 0; -} + ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid, + replies); + if (ret) + return ret; -gf_boolean_t -afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child, - unsigned int child_count) -{ - gf_boolean_t recreate = _gf_false; + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_ENTRY_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; - GF_ASSERT (impunge_sh->child_errno); + source = __afr_selfheal_entry_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; - if (child == impunge_sh->active_source) - goto out; + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). - if (IA_IFLNK == impunge_sh->entrybuf.ia_type) { - recreate = _gf_true; - goto out; - } + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. 
+ */ + healed_sinks[i] = sinks[i] && locked_on[i]; - if (impunge_sh->child_errno[child] == ENOENT) - recreate = _gf_true; -out: - return recreate; + return ret; } -unsigned int -afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources, - unsigned int child_count) -{ - int count = 0; - int i = 0; - - for (i = 0; i < child_count; i++) { - if (afr_sh_need_recreate (impunge_sh, i, child_count)) - count++; - } - - return count; -} -int -afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame, - xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - unsigned int recreate_count = 0; - int i = 0; - int active_src = 0; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - impunge_sh->entrybuf = impunge_sh->buf[active_src]; - impunge_sh->parentbuf = impunge_sh->parentbufs[active_src]; - recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources, - priv->child_count); - if (!recreate_count) { - afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0); - goto out; - } - impunge_local->call_count = recreate_count; - for (i = 0; i < priv->child_count; i++) { - if (!impunge_local->child_up[i]) { - impunge_sh->child_errno[i] = ENOTCONN; - continue; - } - if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) { - impunge_sh->child_errno[i] = EEXIST; - continue; - } - } - for (i = 0; i < priv->child_count; i++) { - if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) - continue; - (void)afr_sh_entry_impunge_create (impunge_frame, this, i); - recreate_count--; - } - GF_ASSERT (!recreate_count); +static int +__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + 
unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL, + data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_entry_prepare (frame, this, fd, data_lock, + sources, sinks, healed_sinks, + locked_replies, &source); + } +unlock: + afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL, + data_lock); + if (ret < 0) + goto out; + + ret = afr_selfheal_entry_do (frame, this, fd, source, sources, + healed_sinks, locked_replies); + if (ret) + goto out; + + ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, + healed_sinks, AFR_ENTRY_TRANSACTION, + locked_replies, data_lock); out: - return 0; + return ret; } -void -afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_private_t *priv = NULL; - afr_local_t *impunge_local = NULL; - afr_self_heal_t *impunge_sh = NULL; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - unsigned int gfid_miss_count = 0; - unsigned int children_up_count = 0; - uuid_t gfid = {0}; - int active_src = 0; - - priv = this->private; - AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh, - frame, local, sh); - active_src = impunge_sh->active_source; - - if (op_ret < 0) - goto done; - if (impunge_sh->child_errno[active_src]) { - op_ret = -1; - op_errno = impunge_sh->child_errno[active_src]; - goto done; - } - - gfid_miss_count = afr_gfid_missing_count (this->name, - impunge_sh->success_children, - impunge_sh->buf, 
priv->child_count, - impunge_local->loc.path); - children_up_count = afr_up_children_count (impunge_local->child_up, - priv->child_count); - if ((gfid_miss_count == children_up_count) && - (children_up_count < priv->child_count)) { - op_ret = -1; - op_errno = ENODATA; - gf_log (this->name, GF_LOG_ERROR, "Not all children are up, " - "gfid should not be assigned in this state for %s", - impunge_local->loc.path); - goto done; - } - - if (gfid_miss_count) { - afr_update_gfid_from_iatts (gfid, impunge_sh->buf, - impunge_sh->success_children, - priv->child_count); - if (uuid_is_null (gfid)) { - sh->entries_skipped = _gf_true; - gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry " - "self-heal because of gfid absence", - impunge_local->loc.path); - goto done; - } - afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, - afr_sh_entry_common_lookup_done, gfid, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } else { - afr_sh_entry_call_impunge_recreate (impunge_frame, this); - } - return; -done: - afr_sh_entry_call_impunge_done (impunge_frame, this, - op_ret, op_errno); - return; -} -int -afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, - gf_dirent_t *entry) +static fd_t * +afr_selfheal_data_opendir (xlator_t *this, inode_t *inode) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *impunge_sh = NULL; - int ret = -1; - call_frame_t *impunge_frame = NULL; - afr_local_t *impunge_local = NULL; - int active_src = 0; - int op_errno = 0; - int op_ret = -1; - - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - sh->impunge_done = afr_sh_entry_impunge_entry_done; - - if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { - op_ret = 0; - goto out; - } - - gf_log (this->name, GF_LOG_TRACE, - "inspecting existence of %s under %s", - entry->d_name, local->loc.path); - - ret = afr_impunge_frame_create (frame, this, active_src, - &impunge_frame); - if (ret) { - 
op_errno = -ret; - goto out; - } - - impunge_local = impunge_frame->local; - impunge_sh = &impunge_local->self_heal; - ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc, - entry->d_name); - loc_copy (&impunge_sh->parent_loc, &local->loc); - if (ret != 0) { - op_errno = ENOMEM; - goto out; - } - - afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc, - afr_sh_entry_common_lookup_done, NULL, - AFR_LOOKUP_FAIL_CONFLICTS, NULL); - - op_ret = 0; -out: - if (ret) { - if (impunge_frame) - AFR_STACK_DESTROY (impunge_frame); - sh->impunge_done (frame, this, op_ret, op_errno); - } + loc_t loc = {0,}; + int ret = 0; + fd_t *fd = NULL; - return 0; -} + fd = fd_create (inode, 0); + if (!fd) + return NULL; + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); -int -afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - gf_dirent_t *entries, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - gf_dirent_t *entry = NULL; - off_t last_offset = 0; - int active_src = 0; - int entry_count = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - active_src = sh->active_source; - - if (op_ret <= 0) { - if (op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "readdir of %s on subvolume %s failed (%s)", - local->loc.path, - priv->children[active_src]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } else { - gf_log (this->name, GF_LOG_TRACE, - "readdir of %s on subvolume %s complete", - local->loc.path, - priv->children[active_src]->name); - } - - afr_sh_entry_impunge_all (frame, this); - return 0; - } - - list_for_each_entry (entry, &entries->list, list) { - last_offset = entry->d_off; - entry_count++; - } - - gf_log (this->name, GF_LOG_DEBUG, - "readdir'ed %d entries from %s", - entry_count, priv->children[active_src]->name); - - sh->offset = last_offset; 
- local->call_count = entry_count; - - list_for_each_entry (entry, &entries->list, list) { - afr_sh_entry_impunge_entry (frame, this, entry); - } - - return 0; -} + ret = syncop_opendir (this, &loc, fd); + if (ret) { + fd_unref (fd); + fd = NULL; + } else { + fd_bind (fd); + } + loc_wipe (&loc); -int -afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int32_t active_src = 0; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - active_src = sh->active_source; - gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %zd", - local->loc.path, sh->offset); - - STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, - priv->children[active_src], - priv->children[active_src]->fops->readdirp, - sh->healing_fd, sh->block_size, sh->offset, NULL); - - return 0; + return fd; } -int -afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - int active_src = -1; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->offset = 0; - - active_src = next_active_source (frame, this, sh->active_source); - sh->active_source = active_src; - - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_entry_finish (frame, this); - return 0; - } - - if (active_src == -1) { - /* completed creating missing files on all subvolumes */ - afr_sh_entry_erase_pending (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "impunging entries of %s on %s to other sinks", - local->loc.path, priv->children[active_src]->name); - - afr_sh_entry_impunge_subvol (frame, this); - - return 0; -} - int -afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode) { - afr_local_t *local 
= NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - /* TODO: some of the open's might fail. - In that case, modify cleanup fn to send flush on those - fd's which are already open */ - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "opendir of %s failed on child %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { - afr_sh_entry_finish (frame, this); - return 0; - } - gf_log (this->name, GF_LOG_TRACE, - "fd for %s opened, commencing sync", - local->loc.path); - - sh->active_source = -1; - afr_sh_entry_expunge_all (frame, this); - } - - return 0; -} - - -int -afr_sh_entry_open (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - int call_count = 0; - - int source = -1; - int *sources = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + fd_t *fd = NULL; + int ret = 0; - fd_t *fd = NULL; + priv = this->private; - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - afr_self_heal_t *sh = NULL; + fd = afr_selfheal_data_opendir (this, inode); + if (!fd) + return -EIO; - local = frame->local; - sh = &local->self_heal; - priv = this->private; + locked_on = alloca0 (priv->child_count); - source = local->self_heal.source; - sources = local->self_heal.sources; + ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. 
+ */ + ret = -ENOTCONN; + goto unlock; + } - sh->block_size = priv->sh_readdir_size; - sh->offset = 0; + ret = __afr_selfheal_entry (frame, this, fd, locked_on); + } +unlock: + afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on); - call_count = sh->active_sinks; - if (source != -1) - call_count++; + if (fd) + fd_unref (fd); - local->call_count = call_count; - - fd = fd_create (local->loc.inode, frame->root->pid); - sh->healing_fd = fd; - - if (source != -1) { - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (source)", - local->loc.path, priv->children[source]->name); - - /* open source */ - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) source, - priv->children[source], - priv->children[source]->fops->opendir, - &local->loc, fd, NULL); - call_count--; - } - - /* open sinks */ - for (i = 0; i < priv->child_count; i++) { - if (sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_TRACE, - "opening directory %s on subvolume %s (sink)", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - &local->loc, fd, NULL); - - if (!--call_count) - break; - } - - return 0; -} - - -int -afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - afr_sh_mark_source_sinks (frame, this); - if (source != -1) - sh->success[source] = 1; - - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_TRACE, - "no active sinks for self-heal on dir %s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - if (source == -1 && sh->active_sinks < 2) { - gf_log (this->name, GF_LOG_TRACE, - "cannot sync with 0 sources and 1 sink on dir 
%s", - local->loc.path); - afr_sh_entry_finish (frame, this); - return 0; - } - - if (source != -1) - gf_log (this->name, GF_LOG_DEBUG, - "self-healing directory %s from subvolume %s to " - "%d other", - local->loc.path, priv->children[source]->name, - sh->active_sinks); - else - gf_log (this->name, GF_LOG_DEBUG, - "no active sources for %s found. " - "merging all entries as a conservative decision", - local->loc.path); - - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_sh_entry_open (frame, this); - - return 0; -} - - -void -afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int nsources = 0; - int32_t subvol_status = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_entry_finish (frame, this); - goto out; - } - - if (sh->forced_merge) { - sh->source = -1; - goto heal; - } - - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_ENTRY_TRANSACTION, &subvol_status, - _gf_true); - if ((subvol_status & ALL_FOOLS) || - (subvol_status & SPLIT_BRAIN)) { - gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " - "merge", local->loc.path); - source = -1; - memset (sh->sources, 0, - sizeof (*sh->sources) * priv->child_count); - } else if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_entry_finish (frame, this); - return; - } else { - source = afr_sh_select_source (sh->sources, priv->child_count); - } - - sh->source = source; - - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, 
priv->child_count); - if (sh->source >= 0) - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - -heal: - afr_sh_entry_sync_prepare (frame, this); -out: - return; -} - -int -afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - sh = &local->self_heal; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " - "failed for %s.", local->loc.path); - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_entry_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done " - "for %s. Proceeding to FOP", local->loc.path); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_entry_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } - - return 0; -} - -int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_self_heal_t *sh = NULL; - - priv = this->private; - local = frame->local; - sh = &local->self_heal; - - sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; - - if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - afr_sh_entrylk (frame, this, &local->loc, NULL, - afr_sh_post_nonblocking_entry_cbk); - } else { - gf_log (this->name, GF_LOG_TRACE, - "proceeding to completion on %s", - local->loc.path); - afr_sh_entry_done (frame, this); - } - - return 0; + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index fd5da6cfd..b31a33237 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. 
+ Copyright (c) 2013 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,763 +8,274 @@ cases as published by the Free Software Foundation. */ -#include -#include -#include -#include -#include -#include #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif -#include "glusterfs.h" #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "stack.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" - -#include "afr-transaction.h" #include "afr-self-heal.h" -#include "afr-self-heal-common.h" - - -int -afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_reset (frame, this); - if (IA_ISDIR (sh->type)) { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to entry check on %s", - local->loc.path); - afr_self_heal_entry (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "proceeding to data check on %s", - local->loc.path); - afr_self_heal_data (frame, this); - } - - return 0; -} - -int -afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_cbk = afr_sh_metadata_done; - afr_unlock (frame, this); - - return 0; -} - -int -afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) -{ - afr_sh_inode_unlock (frame, this); - - return 0; -} - -int -afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_metadata_finish (frame, this); - return 0; -} - -int 
-afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xattr, dict_t *xdata) -{ - afr_local_t *local = NULL; - int call_count = 0; - long i = 0; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - sh = &local->self_heal; - i = (long)cookie; - - if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && - (!IA_ISDIR (sh->buf[sh->source].ia_type))) { - afr_children_add_child (sh->fresh_children, i, - priv->child_count); - } - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && - (!IA_ISDIR (sh->buf[sh->source].ia_type))) { - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - } - afr_sh_metadata_finish (frame, this); - } - - return 0; -} - -int -afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) -{ - afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION, - afr_sh_metadata_erase_pending_cbk, - afr_sh_metadata_finish); - return 0; -} - - -int -afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int child_index = 0; - - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - gf_log (this->name, GF_LOG_INFO, - "setting attributes failed for %s on %s (%s)", - local->loc.path, - priv->children[child_index]->name, - strerror (op_errno)); - - sh->success[child_index] = 0; - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if (local->xattr_req) { - dict_unref (local->xattr_req); - local->xattr_req = NULL; - } - afr_sh_metadata_erase_pending (frame, this); - } - - return 0; -} - - -int 
-afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); - - return 0; -} - - -int -afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata); - - return 0; -} - -int -afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - dict_t *xdata) -{ - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (op_ret < 0) { - afr_sh_metadata_sync_cbk (frame, cookie, - this, -1, op_errno, xdata); - goto out; - } - - i = (long) cookie; - - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, local->xattr_req, 0, NULL); - - out: - return 0; -} - -inline void -afr_prune_special_keys (dict_t *xattr_dict) -{ - dict_del (xattr_dict, GF_SELINUX_XATTR_KEY); -} - -inline void -afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv) -{ - int i = 0; - - for (; i < priv->child_count; i++) { - dict_del (xattr_dict, priv->pending_key[i]); - } -} - -int -afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) -{ - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (op_ret < 0) { - afr_sh_metadata_sync_cbk (frame, cookie, - this, -1, op_errno, xdata); - goto out; - } - - afr_prune_pending_keys (xattr, priv); - - afr_prune_special_keys (xattr); - - i = (long) cookie; +#include "byte-order.h" - /* send removexattr in bulk via xdata */ - STACK_WIND_COOKIE (frame, 
afr_sh_removexattr_cbk, - cookie, - priv->children[i], - priv->children[i]->fops->removexattr, - &local->loc, "", xattr); - out: - return 0; -} +#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE) int -afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *locked_replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - int active_sinks = 0; - int call_count = 0; - int i = 0; - - struct iatt stbuf = {0,}; - int32_t valid = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - active_sinks = sh->active_sinks; - - /* - * 2 calls per sink - setattr, setxattr - */ - if (xattr) { - call_count = active_sinks * 2; - local->xattr_req = dict_ref (xattr); - } else - call_count = active_sinks; - - local->call_count = call_count; - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; - - stbuf.ia_uid = sh->buf[source].ia_uid; - stbuf.ia_gid = sh->buf[source].ia_gid; - - stbuf.ia_type = sh->buf[source].ia_type; - stbuf.ia_prot = sh->buf[source].ia_prot; - - valid = GF_SET_ATTR_MODE | - GF_SET_ATTR_UID | GF_SET_ATTR_GID | - GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; - - for (i = 0; i < priv->child_count; i++) { - if (call_count == 0) { - break; - } - if (sh->sources[i] || !local->child_up[i]) - continue; - - gf_log (this->name, GF_LOG_DEBUG, - "self-healing metadata of %s from %s to %s", - local->loc.path, priv->children[source]->name, - priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid, NULL); - - 
call_count--; - - if (!xattr) - continue; - - STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->getxattr, - &local->loc, NULL, NULL); - call_count--; - } - - return 0; + int ret = -1; + loc_t loc = {0,}; + dict_t *xattr = NULL; + dict_t *old_xattr = NULL; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + loc.inode = inode_ref (inode); + uuid_copy (loc.gfid, inode->gfid); + + gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s", + uuid_utoa (inode->gfid)); + + ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL); + if (ret < 0) { + loc_wipe (&loc); + return -EIO; + } + + afr_filter_xattrs (xattr); + dict_del (xattr, GF_SELINUX_XATTR_KEY); + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + ret = syncop_setattr (priv->children[i], &loc, + &locked_replies[source].poststat, + AFR_HEAL_ATTR, NULL, NULL); + if (ret) + healed_sinks[i] = 0; + + old_xattr = NULL; + ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0); + if (old_xattr) { + dict_del (old_xattr, GF_SELINUX_XATTR_KEY); + afr_filter_xattrs (old_xattr); + ret = syncop_removexattr (priv->children[i], &loc, "", + old_xattr); + } + + ret = syncop_setxattr (priv->children[i], &loc, xattr, 0); + if (ret) + healed_sinks[i] = 0; + } + + loc_wipe (&loc); + if (xattr) + dict_unref (xattr); + + return 0; } -int -afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +/* + * Look for mismatching uid/gid or mode even if xattrs don't say so, and + * pick one arbitrarily as winner. 
+ */ + +static int +__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - source = sh->source; - - if (op_ret == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", - local->loc.path, priv->children[source]->name, - strerror (op_errno)); - - afr_sh_metadata_sync (frame, this, NULL); - } else { - afr_prune_pending_keys (xattr, priv); - afr_sh_metadata_sync (frame, this, xattr); - } - - return 0; + int i = 0; + afr_private_t *priv = NULL; + struct iatt first = {0, }; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); + + if (locked_count == sinks_count || !sources_count) { + if (!priv->metadata_splitbrain_forced_heal) { + return -EIO; + } + /* Metadata split brain, select one subvol + arbitrarily */ + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i] && sinks[i]) { + sources[i] = 1; + sinks[i] = 0; + break; + } + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (source == -1) { + source = i; + first = replies[i].poststat; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!IA_EQUAL (first, replies[i].poststat, type) || + !IA_EQUAL (first, replies[i].poststat, uid) || + !IA_EQUAL (first, replies[i].poststat, gid) || + !IA_EQUAL (first, replies[i].poststat, prot)) { + sources[i] = 0; + sinks[i] = 1; + } + } + + return source; } -static void -afr_set_metadata_sh_info_str (afr_local_t *local, 
afr_self_heal_t *sh, - xlator_t *this) -{ - afr_private_t *priv = NULL; - int i = 0; - char num[1024] = {0}; - size_t len = 0; - char *string = NULL; - size_t off = 0; - char *source_child = " from source %s to"; - char *format = " %s, "; - char *string_msg = " metadata self heal"; - char *pending_matrix_str = NULL; - int down_child_present = 0; - int unknown_child_present = 0; - char *down_subvol_1 = " down subvolume is "; - char *unknown_subvol_1 = " unknown subvolume is"; - char *down_subvol_2 = " down subvolumes are "; - char *unknown_subvol_2 = " unknown subvolumes are "; - int down_count = 0; - int unknown_count = 0; - - priv = this->private; - - pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, - this); - - if (!pending_matrix_str) - pending_matrix_str = ""; - - len += snprintf (num, sizeof (num), "%s", string_msg); - - for (i = 0; i < priv->child_count; i++) { - if ((sh->source == i) && (local->child_up[i] == 1)) { - len += snprintf (num, sizeof (num), source_child, - priv->children[i]->name); - } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - } else if (local->child_up[i] == 0) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - if (!down_child_present) - down_child_present = 1; - down_count++; - } else if (local->child_up[i] == -1) { - len += snprintf (num, sizeof (num), format, - priv->children[i]->name); - if (!unknown_child_present) - unknown_child_present = 1; - unknown_count++; - } - } - - if (down_child_present) { - if (down_count > 1) { - len += snprintf (num, sizeof (num), "%s", - down_subvol_2); - } else { - len += snprintf (num, sizeof (num), "%s", - down_subvol_1); - } - } - if (unknown_child_present) { - if (unknown_count > 1) { - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_2); - } else { - len += snprintf (num, sizeof (num), "%s", - unknown_subvol_1); - } - } - - len ++; - - string = GF_CALLOC 
(len, sizeof (char), gf_common_mt_char); - if (!string) - return; - - off += snprintf (string + off, len - off, "%s", string_msg); - for (i=0; i < priv->child_count; i++) { - if ((sh->source == i) && (local->child_up[i] == 1)) - off += snprintf (string + off, len - off, source_child, - priv->children[i]->name); - } - - for (i = 0; i < priv->child_count; i++) { - if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); - } - - if (down_child_present) { - if (down_count > 1) { - off += snprintf (string + off, len - off, "%s", - down_subvol_2); - } else { - off += snprintf (string + off, len - off, "%s", - down_subvol_1); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == 0) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); - } - - if (unknown_child_present) { - if (unknown_count > 1) { - off += snprintf (string + off, len - off, "%s", - unknown_subvol_2); - } else { - off += snprintf (string + off, len - off, "%s", - unknown_subvol_1); - } - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] == -1) - off += snprintf (string + off, len - off, format, - priv->children[i]->name); - } - - gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, - pending_matrix_str); - - if (pending_matrix_str && strcmp (pending_matrix_str, "")) - GF_FREE (pending_matrix_str); - - if (string && strcmp (string, "")) - GF_FREE (string); -} -int -afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +static int +__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int source = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - 
source = sh->source; - - afr_sh_mark_source_sinks (frame, this); - if (sh->active_sinks == 0) { - gf_log (this->name, GF_LOG_DEBUG, - "no active sinks for performing self-heal on file %s", - local->loc.path); - afr_sh_metadata_finish (frame, this); - return 0; - } - - gf_log (this->name, GF_LOG_TRACE, - "syncing metadata of %s from subvolume %s to %d active sinks", - local->loc.path, priv->children[source]->name, - sh->active_sinks); - - sh->actual_sh_started = _gf_true; - afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); - afr_set_metadata_sh_info_str (local, sh, this); - STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, - priv->children[source], - priv->children[source]->fops->getxattr, - &local->loc, NULL, NULL); - - return 0; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, + replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_METADATA_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_metadata_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) + return -EIO; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. 
+ */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return source; } -void -afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno) +static int +__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int nsources = 0; - int source = 0; - int i = 0; - - local = frame->local; - sh = &local->self_heal; - priv = this->private; - - if (op_ret < 0) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); - afr_sh_set_error (sh, op_errno); - afr_sh_metadata_finish (frame, this); - goto out; - } - nsources = afr_build_sources (this, sh->xattr, sh->buf, - sh->pending_matrix, sh->sources, - sh->success_children, - AFR_METADATA_TRANSACTION, NULL, _gf_false); - if ((nsources == -1) - && (priv->favorite_child != -1) - && (sh->child_errno[priv->favorite_child] == 0)) { - - gf_log (this->name, GF_LOG_WARNING, - "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", - priv->children[priv->favorite_child]->name, - local->loc.path); - - sh->sources[priv->favorite_child] = 1; - - nsources = afr_sh_source_count (sh->sources, - priv->child_count); - } - - if (nsources == -1) { - afr_sh_print_split_brain_log (sh->pending_matrix, this, - local->loc.path); - afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW); - afr_sh_metadata_fail (frame, this); - goto out; - } - - afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - goto out; - } - - source = afr_sh_select_source (sh->sources, priv->child_count); - - if (source == -1) { - gf_log (this->name, GF_LOG_DEBUG, - "No active sources found."); - - afr_sh_metadata_finish (frame, this); - goto out; - } - - sh->source = source; - - /* detect changes not visible through 
pending flags -- JIC */ - for (i = 0; i < priv->child_count; i++) { - if (i == source || sh->child_errno[i]) - continue; - - if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - - if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) - sh->sources[i] = 0; - } - - if ((!IA_ISREG (sh->buf[source].ia_type)) && - (!IA_ISDIR (sh->buf[source].ia_type))) { - afr_reset_children (sh->fresh_children, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - afr_inode_set_read_ctx (this, sh->inode, sh->source, - sh->fresh_children); - } - - if (sh->do_metadata_self_heal && priv->metadata_self_heal) - afr_sh_metadata_sync_prepare (frame, this); - else - afr_sh_metadata_finish (frame, this); + afr_private_t *priv = NULL; + int ret = -1; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *data_lock = NULL; + unsigned char *healed_sinks = NULL; + struct afr_reply *locked_replies = NULL; + int source = -1; + + priv = this->private; + + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + data_lock = alloca0 (priv->child_count); + + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, inode, this->name, + LLONG_MAX - 1, 0, data_lock); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock, + sources, sinks, healed_sinks, + locked_replies); + if (ret < 0) + goto unlock; + + source = ret; + ret = 0; + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, this->name, + LLONG_MAX -1, 0, data_lock); + if (ret < 0) + goto out; + + ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks, + locked_replies); + if (ret) + goto out; + + ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks, + healed_sinks, 
AFR_METADATA_TRANSACTION, + locked_replies, data_lock); out: - return; -} - -int -afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame, - xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " - "inodelks failed for %s.", local->loc.path); - gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal " - "failed for %s.", local->loc.path); - afr_sh_metadata_done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata " - "inodelks done for %s. Proceeding to FOP", - local->loc.path); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_metadata_fix, NULL, - AFR_LOOKUP_FAIL_CONFLICTS | - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } - - return 0; + return ret; } -int -afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->domain = this->name; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->transaction_lk_type = AFR_SELFHEAL_LK; - int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; - - afr_set_lock_number (frame, this); - - inodelk->flock.l_start = LLONG_MAX - 1; - inodelk->flock.l_len = 0; - inodelk->flock.l_type = F_WRLCK; - int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; - - afr_nonblocking_inodelk (frame, this); - - return 0; -} - -gf_boolean_t -afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv) -{ - if (sh->force_confirm_spb) - return _gf_true; - if (sh->do_metadata_self_heal && priv->metadata_self_heal) - return _gf_true; - return _gf_false; -} int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t 
*inode) { - afr_local_t *local = NULL; - afr_private_t *priv = this->private; - afr_self_heal_t *sh = &local->self_heal; - - local = frame->local; - sh = &local->self_heal; - sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; - - if (afr_can_start_metadata_self_heal (sh, priv)) { - afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); - afr_sh_metadata_lock (frame, this); - } else { - afr_sh_metadata_done (frame, this); - } - - return 0; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + + ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + { + if (ret < 2) { + /* Either less than two subvols available, or another + selfheal (from another server) is in progress. Skip + for now in any case there isn't anything to do. + */ + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_metadata (frame, this, inode, locked_on); + } +unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); + + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c new file mode 100644 index 000000000..ce80b8da3 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-name.c @@ -0,0 +1,457 @@ +/* + Copyright (c) 2013 Red Hat, Inc. + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "afr.h" +#include "afr-self-heal.h" + + +int +__afr_selfheal_assign_gfid (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies, int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + int ret = 0; + loc_t loc = {0, }; + + priv = this->private; + + uuid_copy (parent->gfid, pargfid); + + xdata = dict_new (); + if (!xdata) { + return -ENOMEM; + } + + ret = dict_set_static_bin (xdata, "gfid-req", + replies[gfid_idx].poststat.ia_gfid, 16); + if (ret) { + dict_destroy (xdata); + return -ENOMEM; + } + + loc.parent = inode_ref (parent); + loc.inode = inode_ref (inode); + uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].op_ret == 0 || replies[i].op_errno != ENODATA) + continue; + + ret = syncop_lookup (priv->children[i], &loc, xdata, 0, 0, 0); + } + + loc_wipe (&loc); + dict_unref (xdata); + + return ret; +} + + +int +__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies, int gfid_idx) +{ + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + uuid_copy (parent->gfid, pargfid); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[gfid_idx].poststat.ia_gfid) == 0) + continue; + + ret |= afr_selfheal_recreate_entry (frame, this, i, gfid_idx, + parent, bname, inode, replies); + } + + return ret; +} + + +int +__afr_selfheal_name_expunge (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + struct afr_reply *replies) +{ + loc_t loc = {0, }; + int i = 0; + afr_private_t *priv = NULL; + char g[64]; + int ret = 0; + + priv = 
this->private; + + loc.parent = inode_ref (parent); + uuid_copy (loc.pargfid, pargfid); + loc.name = bname; + loc.inode = inode_ref (inode); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret) + continue; + + switch (replies[i].poststat.ia_type) { + case IA_IFDIR: + gf_log (this->name, GF_LOG_WARNING, + "expunging dir %s/%s (%s) on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g), + priv->children[i]->name); + ret |= syncop_rmdir (priv->children[i], &loc, 1); + break; + default: + gf_log (this->name, GF_LOG_WARNING, + "expunging file %s/%s (%s) on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g), + priv->children[i]->name); + ret |= syncop_unlink (priv->children[i], &loc); + break; + } + } + + loc_wipe (&loc); + + return ret; + +} + + +int +__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, int source, + unsigned char *locked_on, struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uuid_t gfid = {0, }; + int gfid_idx = -1; + gf_boolean_t source_is_empty = _gf_true; + gf_boolean_t need_heal = _gf_false; + int first_idx = -1; + char g1[64],g2[64]; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + need_heal = _gf_true; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + need_heal = _gf_true; + } + + if (!need_heal) + return 0; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (!replies[i].op_ret && (source == -1 || sources[i])) { + source_is_empty = _gf_false; + break; + } + } + + if (source_is_empty) { + return 
__afr_selfheal_name_expunge (frame, this, parent, pargfid, + bname, inode, replies); + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (uuid_is_null (replies[i].poststat.ia_gfid)) + continue; + + if (uuid_is_null (gfid)) { + uuid_copy (gfid, replies[i].poststat.ia_gfid); + gfid_idx = i; + continue; + } + + if (sources[i] || source == -1) { + if (gfid_idx != -1 && + (sources[gfid_idx] || source == -1) && + uuid_compare (gfid, replies[i].poststat.ia_gfid)) { + gf_log (this->name, GF_LOG_WARNING, + "GFID mismatch for /%s " + "%s on %s and %s on %s", + uuid_utoa (pargfid), bname, + uuid_utoa_r (replies[i].poststat.ia_gfid, g1), + priv->children[i]->name, + uuid_utoa_r (replies[gfid_idx].poststat.ia_gfid, g2), + priv->children[gfid_idx]->name); + return -1; + } + + uuid_copy (gfid, replies[i].poststat.ia_gfid); + gfid_idx = i; + continue; + } + } + + if (gfid_idx == -1) + return -1; + + __afr_selfheal_assign_gfid (frame, this, parent, pargfid, bname, inode, + replies, gfid_idx); + + return __afr_selfheal_name_impunge (frame, this, parent, pargfid, + bname, inode, replies, gfid_idx); +} + + +int +__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *locked_on, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + int locked_count = 0; + int sources_count = 0; + int sinks_count = 0; + + priv = this->private; + + locked_count = AFR_COUNT (locked_on, priv->child_count); + sources_count = AFR_COUNT (sources, priv->child_count); + sinks_count = AFR_COUNT (sinks, priv->child_count); + + if (locked_count == sinks_count || !sources_count) { + return -1; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; + } + } + + return source; +} + + +int +__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, unsigned char *locked_on, + unsigned char *sources, 
unsigned char *sinks, + unsigned char *healed_sinks, struct afr_reply *replies, + int *source_p) +{ + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + int i = 0; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies); + if (ret) + return ret; + + ret = afr_selfheal_find_direction (frame, this, replies, + AFR_ENTRY_TRANSACTION, + locked_on, sources, sinks); + if (ret) + return ret; + + source = __afr_selfheal_name_finalize_source (this, sources, sinks, + locked_on, replies); + if (source < 0) { + /* If source is < 0 (typically split-brain), we perform a + conservative merge of entries rather than erroring out */ + } + *source_p = source; + + for (i = 0; i < priv->child_count; i++) + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. 
+ */ + healed_sinks[i] = sinks[i] && locked_on[i]; + + return ret; +} + + +int +afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent, + uuid_t pargfid, const char *bname) +{ + afr_private_t *priv = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *locked_on = NULL; + int source = -1; + struct afr_reply *replies = NULL; + int ret = -1; + inode_t *inode = NULL; + + priv = this->private; + + locked_on = alloca0 (priv->child_count); + sources = alloca0 (priv->child_count); + sinks = alloca0 (priv->child_count); + healed_sinks = alloca0 (priv->child_count); + + replies = alloca0 (priv->child_count * sizeof(*replies)); + + ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname, + locked_on); + { + if (ret < 2) { + ret = -ENOTCONN; + goto unlock; + } + + ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid, + locked_on, sources, sinks, + healed_sinks, replies, + &source); + if (ret) + goto unlock; + + inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, + replies, locked_on); + if (!inode) { + ret = -ENOMEM; + goto unlock; + } + + ret = __afr_selfheal_name_do (frame, this, parent, pargfid, bname, + inode, sources, sinks, healed_sinks, + source, locked_on, replies); + } +unlock: + afr_selfheal_unentrylk (frame, this, parent, this->name, bname, + locked_on); + if (inode) + inode_unref (inode); + + return ret; +} + + +int +afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this, + inode_t *parent, uuid_t pargfid, + const char *bname, gf_boolean_t *need_heal) +{ + afr_private_t *priv = NULL; + int i = 0; + struct afr_reply *replies = NULL; + inode_t *inode = NULL; + int first_idx = -1; + + priv = this->private; + + replies = alloca0 (sizeof (*replies) * priv->child_count); + + inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname, + replies, priv->child_up); + if (!inode) + return -ENOMEM; + + for (i = 0; i < 
priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (first_idx == -1) { + first_idx = i; + continue; + } + + if (replies[i].op_ret != replies[first_idx].op_ret) + *need_heal = _gf_true; + + if (uuid_compare (replies[i].poststat.ia_gfid, + replies[first_idx].poststat.ia_gfid)) + *need_heal = _gf_true; + } + + if (inode) + inode_unref (inode); + return 0; +} + +int +afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname) +{ + inode_t *parent = NULL; + call_frame_t *frame = NULL; + int ret = -1; + gf_boolean_t need_heal = _gf_false; + + parent = afr_inode_find (this, pargfid); + if (!parent) + goto out; + + frame = afr_frame_create (this); + if (!frame) + goto out; + + ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid, + bname, &need_heal); + if (ret) + goto out; + + if (need_heal) + afr_selfheal_name_do (frame, this, parent, pargfid, bname); +out: + if (parent) + inode_unref (parent); + if (frame) + AFR_STACK_DESTROY (frame); + + return ret; +} diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 7c9bc8111..a1b972ac3 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. + Copyright (c) 2013 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,36 +8,160 @@ cases as published by the Free Software Foundation. 
*/ -#ifndef __AFR_SELF_HEAL_H__ -#define __AFR_SELF_HEAL_H__ -#include +#ifndef _AFR_SELFHEAL_H +#define _AFR_SELFHEAL_H -#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type) -#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type)) -#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid)) -#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size) -#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size) +/* Perform fop on all UP subvolumes and wait for all callbacks to return */ + +#define AFR_ONALL(frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + __count++; \ + } \ + syncbarrier_wait (&__local->barrier, __count); \ + } while (0) + + +/* Perform fop on all subvolumes represented by list[] array and wait + for all callbacks to return */ + +#define AFR_ONLIST(list, frame, rfn, fop, args ...) do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0, __count = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!list[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + __count++; \ + } \ + syncbarrier_wait (&__local->barrier, __count); \ + } while (0) + + +#define AFR_SEQ(frame, rfn, fop, args ...) 
do { \ + afr_local_t *__local = frame->local; \ + afr_private_t *__priv = frame->this->private; \ + int __i = 0; \ + \ + afr_replies_wipe (__local, __priv); \ + \ + for (__i = 0; __i < __priv->child_count; __i++) { \ + if (!__priv->child_up[__i]) continue; \ + STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \ + __priv->children[__i], \ + __priv->children[__i]->fops->fop, args); \ + syncbarrier_wait (&__local->barrier, 1); \ + } \ + } while (0) + + +#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \ + int __i; \ + __ptr = alloca0 (n * sizeof(type *)); \ + for (__i = 0; __i < n; __i++) __ptr[__i] = alloca0 (n * sizeof(type)); \ + __ptr;}) + + +#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0) + + +int +afr_selfheal (xlator_t *this, uuid_t gfid); + +int +afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name); + +int +afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); + +int +afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode); int -afr_self_heal_entry (call_frame_t *frame, xlator_t *this); +afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode); + + +int +afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); + +int +afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on); int -afr_self_heal_data (call_frame_t *frame, xlator_t *this); +afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on); int -afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); +afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); int -afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr); 
+afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on); + +int +afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies); + +inode_t * +afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on); int -afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); +afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks); int -afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, - dict_t **xattr, - afr_transaction_type txn_type, - uuid_t gfid); -#endif /* __AFR_SELF_HEAL_H__ */ +afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix); + +int +afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, afr_transaction_type type, + struct afr_reply *replies, unsigned char *locked_on); + +int +afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst, + int source, inode_t *dir, const char *name, + inode_t *inode, struct afr_reply *replies); + +int +afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr); + +call_frame_t * +afr_frame_create (xlator_t *this); + +inode_t * +afr_inode_find (xlator_t *this, uuid_t gfid); + +#endif /* !_AFR_SELFHEAL_H */ diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 9e5c1b3e7..4bfe909bc 100644 --- 
a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. + Copyright (c) 2013 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,157 +8,672 @@ cases as published by the Free Software Foundation. */ + #ifndef _CONFIG_H #define _CONFIG_H #include "config.h" #endif + #include "afr.h" -#include "syncop.h" +#include "afr-self-heal.h" #include "afr-self-heald.h" -#include "afr-self-heal-common.h" #include "protocol-common.h" -#include "event-history.h" - -typedef enum { - STOP_CRAWL_ON_SINGLE_SUBVOL = 1, - STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL = 2 -} afr_crawl_flags_t; - -typedef enum { - HEAL = 1, - INFO, - STATISTICS_TO_BE_HEALED, -} shd_crawl_op; - -typedef struct shd_dump { - dict_t *dict; - xlator_t *this; - int child; -} shd_dump_t; - -typedef struct shd_event_ { - int child; - char *path; -} shd_event_t; - -typedef struct shd_pos_ { - int child; - xlator_t *this; - afr_child_pos_t pos; -} shd_pos_t; - -typedef int -(*afr_crawl_done_cbk_t) (int ret, call_frame_t *sync_frame, void *crawl_data); + +#define SHD_INODE_LRU_LIMIT 2048 +#define AFR_EH_HEALED_LIMIT 1024 +#define AFR_EH_HEAL_FAIL_LIMIT 1024 +#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 +#define AFR_STATISTICS_HISTORY_SIZE 50 + + +#define ASSERT_LOCAL(this, healer) \ + if (!afr_shd_is_subvol_local(this, healer->subvol)) { \ + healer->local = _gf_false; \ + if (safe_break (healer)) { \ + break; \ + } else { \ + continue; \ + } \ + } else { \ + healer->local = _gf_true; \ + } + + +#define NTH_INDEX_HEALER(this, n) &((((afr_private_t *)this->private))->shd.index_healers[n]) +#define NTH_FULL_HEALER(this, n) &((((afr_private_t *)this->private))->shd.full_healers[n]) + +int afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p); + +char * +afr_subvol_name (xlator_t *this, int subvol) +{ + afr_private_t *priv = NULL; + + priv = 
this->private; + if (subvol < 0 || subvol > priv->child_count) + return NULL; + + return priv->children[subvol]->name; +} + + +void +afr_destroy_crawl_event_data (void *data) +{ + return; +} + void -afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, - process_entry_cbk_t process_entry, void *op_data, - gf_boolean_t exclusive, int crawl_flags, - afr_crawl_done_cbk_t crawl_done); +afr_destroy_shd_event_data (void *data) +{ + shd_event_t *shd_event = data; -static int -_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data); + if (!shd_event) + return; + GF_FREE (shd_event->path); + + return; +} + + +gf_boolean_t +afr_shd_is_subvol_local (xlator_t *this, int subvol) +{ + char *pathinfo = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int ret = 0; + gf_boolean_t is_local = _gf_false; + loc_t loc = {0, }; + + priv = this->private; + + loc.inode = this->itable->root; + uuid_copy (loc.gfid, loc.inode->gfid); + + ret = syncop_getxattr (priv->children[subvol], &loc, &xattr, + GF_XATTR_PATHINFO_KEY); + if (ret) + return _gf_false; + if (!xattr) + return _gf_false; + + ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret) + return _gf_false; + + afr_local_pathinfo (pathinfo, &is_local); + + gf_log (this->name, GF_LOG_DEBUG, "subvol %s is %slocal", + priv->children[subvol]->name, is_local? "" : "not "); + + return is_local; +} -/* For calling straight through (e.g. already in a synctask). */ -int -afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos); -/* For deferring through a new synctask. 
*/ int -afr_syncop_find_child_position (void *data); +__afr_shd_healer_wait (struct subvol_healer *healer) +{ + afr_private_t *priv = NULL; + struct timespec wait_till = {0, }; + int ret = 0; + + priv = healer->this->private; -static int -_loc_assign_gfid_path (loc_t *loc) +disabled_loop: + wait_till.tv_sec = time (NULL) + 60; + + while (!healer->rerun) { + ret = pthread_cond_timedwait (&healer->cond, + &healer->mutex, + &wait_till); + if (ret == ETIMEDOUT) + break; + } + + ret = healer->rerun; + healer->rerun = 0; + + if (!priv->shd.enabled) + goto disabled_loop; + + return ret; +} + + +int +afr_shd_healer_wait (struct subvol_healer *healer) { - int ret = -1; - char gfid_path[64] = {0}; - - if (loc->inode && !uuid_is_null (loc->inode->gfid)) { - ret = inode_path (loc->inode, NULL, (char**)&loc->path); - } else if (!uuid_is_null (loc->gfid)) { - snprintf (gfid_path, sizeof (gfid_path), "", - uuid_utoa (loc->gfid)); - loc->path = gf_strdup (gfid_path); - if (loc->path) - ret = 0; - } - return ret; + int ret = 0; + + pthread_mutex_lock (&healer->mutex); + { + ret = __afr_shd_healer_wait (healer); + } + pthread_mutex_unlock (&healer->mutex); + + return ret; } -void -_destroy_crawl_event_data (void *data) + +gf_boolean_t +safe_break (struct subvol_healer *healer) { - shd_crawl_event_t *crawl_event = NULL; + gf_boolean_t ret = _gf_false; - if (!data) - goto out; + pthread_mutex_lock (&healer->mutex); + { + if (healer->rerun) + goto unlock; + + healer->running = _gf_false; + ret = _gf_true; + } +unlock: + pthread_mutex_unlock (&healer->mutex); + + return ret; +} - crawl_event = (shd_crawl_event_t *)data; - GF_FREE (crawl_event->start_time_str); - GF_FREE (crawl_event->end_time_str); +inode_t * +afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid) +{ + inode_t *inode = NULL; + int ret = 0; + loc_t loc = {0, }; + struct iatt iatt = {0, }; + + inode = inode_find (this->itable, gfid); + if (inode) + goto out; + + loc.inode = inode_new (this->itable); + if 
(!loc.inode) + goto out; + uuid_copy (loc.gfid, gfid); + + ret = syncop_lookup (subvol, &loc, NULL, &iatt, NULL, NULL); + if (ret < 0) + goto out; + + inode = inode_link (loc.inode, NULL, NULL, &iatt); + if (inode) + inode_lookup (inode); out: - return; + loc_wipe (&loc); + return inode; } -void -_destroy_shd_event_data (void *data) + +fd_t * +afr_shd_index_opendir (xlator_t *this, int child) { - shd_event_t *event = NULL; - if (!data) - goto out; - event = (shd_event_t*)data; - GF_FREE (event->path); + fd_t *fd = NULL; + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + loc_t rootloc = {0, }; + inode_t *inode = NULL; + int ret = 0; + dict_t *xattr = NULL; + void *index_gfid = NULL; + + priv = this->private; + subvol = priv->children[child]; + + rootloc.inode = inode_ref (this->itable->root); + uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, &rootloc, &xattr, + GF_XATTROP_INDEX_GFID); + if (ret || !xattr) { + errno = -ret; + goto out; + } + + ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); + if (ret) + goto out; + + gf_log (this->name, GF_LOG_DEBUG, "index-dir gfid for %s: %s", + subvol->name, uuid_utoa (index_gfid)); + + inode = afr_shd_inode_find (this, subvol, index_gfid); + if (!inode) + goto out; + fd = fd_anonymous (inode); out: - return; + loc_wipe (&rootloc); + if (xattr) + dict_unref (xattr); + return fd; +} + + +int +afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name) +{ + loc_t loc = {0, }; + int ret = 0; + + loc.parent = inode_ref (inode); + loc.name = name; + + ret = syncop_unlink (subvol, &loc); + + loc_wipe (&loc); + return ret; +} + + +int +afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent, + const char *bname) +{ + int ret = -1; + + ret = afr_selfheal_name (THIS, parent, bname); + + return ret; +} + +int +afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid) +{ + int ret = 0; + eh_t *eh = NULL; + afr_private_t *priv = NULL; + 
afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + xlator_t *subvol = NULL; + xlator_t *this = NULL; + crawl_event_t *crawl_event = NULL; + + this = healer->this; + priv = this->private; + shd = &priv->shd; + crawl_event = &healer->crawl_event; + + subvol = priv->children[child]; + + ret = afr_selfheal (this, gfid); + + if (ret == -EIO) { + eh = shd->split_brain; + crawl_event->split_brain_count++; + } else if (ret < 0) { + eh = shd->heal_failed; + crawl_event->heal_failed_count++; + } else if (ret == 0) { + eh = shd->healed; + crawl_event->healed_count++; + } + + afr_shd_gfid_to_path (this, subvol, gfid, &path); + if (!path) + return ret; + + if (eh) { + shd_event = GF_CALLOC (1, sizeof(*shd_event), + gf_afr_mt_shd_event_t); + if (!shd_event) { + GF_FREE (path); + return ret; + } + + shd_event->child = child; + shd_event->path = path; + + if (eh_save_history (eh, shd_event) < 0) { + GF_FREE (shd_event); + GF_FREE (path); + } + } + return ret; } + + void -shd_cleanup_event (void *event) +afr_shd_sweep_prepare (struct subvol_healer *healer) { - shd_event_t *shd_event = event; + crawl_event_t *event = NULL; - if (!shd_event) - goto out; - GF_FREE (shd_event->path); - GF_FREE (shd_event); + event = &healer->crawl_event; + + event->healed_count = 0; + event->split_brain_count = 0; + event->heal_failed_count = 0; + + time (&event->start_time); + event->end_time = 0; +} + + +void +afr_shd_sweep_done (struct subvol_healer *healer) +{ + crawl_event_t *event = NULL; + crawl_event_t *history = NULL; + afr_self_heald_t *shd = NULL; + + event = &healer->crawl_event; + shd = &(((afr_private_t *)healer->this->private)->shd); + + time (&event->end_time); + history = memdup (event, sizeof (*event)); + event->start_time = 0; + + if (!history) + return; + + if (eh_save_history (shd->statistics[healer->subvol], history) < 0) + GF_FREE (history); +} + + +int +afr_shd_index_sweep (struct subvol_healer *healer) +{ + xlator_t *this = NULL; + int child = 
-1;
+        fd_t *fd = NULL;
+        xlator_t *subvol = NULL;
+        afr_private_t *priv = NULL;
+        off_t offset = 0;
+        gf_dirent_t entries;
+        gf_dirent_t *entry = NULL;
+        uuid_t gfid;
+        int ret = 0;
+        int count = 0;
+
+        this = healer->this;
+        child = healer->subvol;
+        priv = this->private;
+        subvol = priv->children[child];
+
+        fd = afr_shd_index_opendir (this, child);
+        if (!fd) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unable to opendir index-dir on %s", subvol->name);
+                return -errno;
+        }
+
+        INIT_LIST_HEAD (&entries.list);
+
+        /* Read the index directory in batches. A negative readdir result
+           falls through an empty list and ends the sweep with that error. */
+        while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+                if (ret > 0)
+                        ret = 0;
+                list_for_each_entry (entry, &entries.list, list) {
+                        offset = entry->d_off;
+
+                        if (!priv->shd.enabled) {
+                                ret = -EBUSY;
+                                break;
+                        }
+
+                        if (!strcmp (entry->d_name, ".") ||
+                            !strcmp (entry->d_name, ".."))
+                                continue;
+
+                        gf_log (this->name, GF_LOG_DEBUG, "got entry: %s",
+                                entry->d_name);
+
+                        /* Skip names that are not gfids without storing the
+                           parse status in ret. Otherwise a non-gfid name at
+                           the end of a batch would abort the whole sweep
+                           and be returned as an error. */
+                        if (uuid_parse (entry->d_name, gfid))
+                                continue;
+
+                        ret = afr_shd_selfheal (healer, child, gfid);
+                        if (ret == 0)
+                                count++;
+
+                        /* the gfid no longer exists: drop its stale index
+                           entry and carry on */
+                        if (ret == -ENOENT || ret == -ESTALE) {
+                                afr_shd_index_purge (subvol, fd->inode,
+                                                     entry->d_name);
+                                ret = 0;
+                        }
+                }
+
+                gf_dirent_free (&entries);
+                if (ret)
+                        break;
+        }
+
+        if (fd)
+                fd_unref (fd);
+        /* on success report how many gfids were healed in this sweep */
+        if (!ret)
+                ret = count;
+        return ret;
+}
+
+
+int
+afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode)
+{
+        fd_t *fd = NULL;
+        xlator_t *this = NULL;
+        xlator_t *subvol = NULL;
+        afr_private_t *priv = NULL;
+        off_t offset = 0;
+        gf_dirent_t entries;
+        gf_dirent_t *entry = NULL;
+        int ret = 0;
+
+        this = healer->this;
+        priv = this->private;
+        subvol = priv->children[healer->subvol];
+
+        fd = fd_anonymous (inode);
+        if (!fd)
+                return -errno;
+
+        INIT_LIST_HEAD (&entries.list);
+
+        while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, &entries))) {
+                if (ret < 0)
+                        break;
+
+                ret = gf_link_inodes_from_dirent (this, fd->inode, &entries);
+                if (ret)
+                        break;
+
+                list_for_each_entry (entry, &entries.list,
list) { + offset = entry->d_off; + + if (!priv->shd.enabled) { + ret = -EBUSY; + break; + } + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + afr_shd_selfheal_name (healer, healer->subvol, + inode->gfid, entry->d_name); + + afr_shd_selfheal (healer, healer->subvol, + entry->d_stat.ia_gfid); + + if (entry->d_stat.ia_type == IA_IFDIR) { + ret = afr_shd_full_sweep (healer, entry->inode); + if (ret) + break; + } + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + return ret; +} + + +void * +afr_shd_index_healer (void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int ret = 0; + + healer = data; + THIS = this = healer->this; + + for (;;) { + afr_shd_healer_wait (healer); + + ASSERT_LOCAL(this, healer); + + do { + gf_log (this->name, GF_LOG_DEBUG, + "starting index sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + + afr_shd_sweep_prepare (healer); + + ret = afr_shd_index_sweep (healer); + + afr_shd_sweep_done (healer); + /* + As long as at least one gfid was + healed, keep retrying. We may have + just healed a directory and thereby + created entries for other gfids which + could not be healed thus far. + */ + + gf_log (this->name, GF_LOG_DEBUG, + "finished index sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + /* + Give a pause before retrying to avoid a busy loop + in case the only entry in index is because of + an ongoing I/O. 
+ */ + sleep (1); + } while (ret > 0); + } + + return NULL; +} + + +void * +afr_shd_full_healer (void *data) +{ + struct subvol_healer *healer = NULL; + xlator_t *this = NULL; + int run = 0; + + healer = data; + THIS = this = healer->this; + + for (;;) { + pthread_mutex_lock (&healer->mutex); + { + run = __afr_shd_healer_wait (healer); + if (!run) + healer->running = _gf_false; + } + pthread_mutex_unlock (&healer->mutex); + + if (!run) + break; + + ASSERT_LOCAL(this, healer); + + gf_log (this->name, GF_LOG_INFO, + "starting full sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + + afr_shd_sweep_prepare (healer); + + afr_shd_full_sweep (healer, this->itable->root); + + afr_shd_sweep_done (healer); + + gf_log (this->name, GF_LOG_INFO, + "finished full sweep on subvol %s", + afr_subvol_name (this, healer->subvol)); + } + + return NULL; +} + + +int +afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer) +{ + int ret = 0; + + ret = pthread_mutex_init (&healer->mutex, NULL); + if (ret) + goto out; + + ret = pthread_cond_init (&healer->cond, NULL); + if (ret) + goto out; + + healer->this = this; + healer->running = _gf_false; + healer->rerun = _gf_false; + healer->local = _gf_false; out: - return; + return ret; } + int -afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count) +afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer, + void *(threadfn)(void *)) { - int i = 0; - int ret = -1; - for (i = 0; i < child_count; i++) { - if (shd->pos[i] == AFR_POS_LOCAL) { - ret = i; - break; - } - } - return ret; + int ret = 0; + + pthread_mutex_lock (&healer->mutex); + { + if (healer->running) { + pthread_cond_signal (&healer->cond); + } else { + ret = gf_thread_create (&healer->thread, NULL, + threadfn, healer); + if (ret) + goto unlock; + healer->running = 1; + } + + healer->rerun = 1; + } +unlock: + pthread_mutex_unlock (&healer->mutex); + + return ret; } -static int -_build_index_loc (xlator_t *this, loc_t *loc, char *name, 
loc_t *parent) + +int +afr_shd_full_healer_spawn (xlator_t *this, int subvol) { - int ret = 0; + return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol), + afr_shd_full_healer); +} - uuid_copy (loc->pargfid, parent->inode->gfid); - loc->path = ""; - loc->name = name; - loc->parent = inode_ref (parent->inode); - if (!loc->parent) { - loc->path = NULL; - loc_wipe (loc); - ret = -1; - } - return ret; + +int +afr_shd_index_healer_spawn (xlator_t *this, int subvol) +{ + return afr_shd_healer_spawn (this, NTH_INDEX_HEALER (this, subvol), + afr_shd_index_healer); } + int -_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, - shd_crawl_event_t *shd_event, struct timeval *tv) +afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output, + crawl_event_t *crawl_event) { int ret = 0; uint64_t count = 0; @@ -167,23 +682,25 @@ _add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, uint64_t healed_count = 0; uint64_t split_brain_count = 0; uint64_t heal_failed_count = 0; - char *start_time_str = NULL; + char *start_time_str = 0; char *end_time_str = NULL; char *crawl_type = NULL; int progress = -1; + int child = -1; - healed_count = shd_event->healed_count; - split_brain_count = shd_event->split_brain_count; - heal_failed_count = shd_event->heal_failed_count; - start_time_str = shd_event->start_time_str; - end_time_str = shd_event->end_time_str; - crawl_type = shd_event->crawl_type; + child = crawl_event->child; + healed_count = crawl_event->healed_count; + split_brain_count = crawl_event->split_brain_count; + heal_failed_count = crawl_event->heal_failed_count; + crawl_type = crawl_event->crawl_type; - if (!start_time_str) { - ret = -1; - goto out; - } + if (!crawl_event->start_time) + goto out; + + start_time_str = gf_strdup (ctime (&crawl_event->start_time)); + if (crawl_event->end_time) + end_time_str = gf_strdup (ctime (&crawl_event->end_time)); ret = dict_get_int32 (output, this->name, &xl_id); if (ret) { @@ -194,90 +711,100 @@ 
_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); ret = dict_get_uint64 (output, key, &count); + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, xl_id, child, count); ret = dict_set_uint64(output, key, healed_count); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "healed_count to outout"); + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_healed_count to outout"); goto out; - } + } + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, xl_id, child, count); ret = dict_set_uint64 (output, key, split_brain_count); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "split_brain_count to outout"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_split_brain_count to outout"); goto out; } + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, xl_id, child, count); - ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); + ret = dict_set_str (output, key, crawl_type); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "crawl_type to output"); + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_type to output"); goto out; } + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, xl_id, child, count); ret = dict_set_uint64 (output, key, heal_failed_count); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "healed_failed_count to outout"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_healed_failed_count to outout"); goto out; } + snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, xl_id, child, count); - ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); - - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "crawl_start_time to outout"); + ret = dict_set_dynstr 
(output, key, start_time_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_start_time to outout"); goto out; - } + } else { + start_time_str = NULL; + } + + if (!end_time_str) + progress = 1; + else + progress = 0; snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, xl_id, child, count); - if (!end_time_str) - end_time_str = "Could not determine the end time"; - ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "crawl_end_time to outout"); + end_time_str = gf_strdup ("Could not determine the end time"); + ret = dict_set_dynstr (output, key, end_time_str); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_crawl_end_time to outout"); goto out; - } + } else { + end_time_str = NULL; + } + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, xl_id, child, count); - if (shd_event->crawl_inprogress == _gf_true) - progress = 1; - else - progress = 0; - ret = dict_set_int32 (output, key, progress); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" - "inprogress to outout"); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not add statistics_inprogress to outout"); goto out; } - snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); - ret = dict_set_uint64 (output, key, count + 1); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Could not increment the " - "counter."); + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Could not increment the counter."); goto out; - } + } out: + GF_FREE (start_time_str); + GF_FREE (end_time_str); return ret; } + int -_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, - struct timeval *tv, gf_boolean_t dyn) +afr_shd_dict_add_path (xlator_t *this, dict_t *output, int 
child, char *path, + struct timeval *tv) { - //subkey not used for now int ret = -1; uint64_t count = 0; char key[256] = {0}; @@ -293,681 +820,323 @@ _add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, ret = dict_get_uint64 (output, key, &count); snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count); - if (dyn) - ret = dict_set_dynstr (output, key, path); - else - ret = dict_set_str (output, key, path); + ret = dict_set_dynstr (output, key, path); + if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output", path); goto out; } - if (!tv) - goto inc_count; - snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, - child, count); - ret = dict_set_uint32 (output, key, tv->tv_sec); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", - path); - goto out; - } + if (tv) { + snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id, + child, count); + ret = dict_set_uint32 (output, key, tv->tv_sec); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time", + path); + goto out; + } + } -inc_count: snprintf (key, sizeof (key), "%d-%d-count", xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Could not increment count"); goto out; } - ret = 0; -out: - return ret; -} - -int -_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child, - char **fpath, gf_boolean_t *missing) -{ - dict_t *xattr = NULL; - char *path = NULL; - int ret = -1; - ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY); - if (ret < 0) { - if ((-ret == ENOENT || -ret == ESTALE) && missing) - *missing = _gf_true; - ret = -1; - goto out; - } - ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to get path for " - "gfid %s", uuid_utoa (child->gfid)); - goto out; - } - path = gf_strdup (path); - if (!path) { - ret = -1; - goto out; - } ret = 0; out: - if (!ret) - 
*fpath = path; - if (xattr) - dict_unref (xattr); return ret; } + int -_add_event_to_dict (circular_buffer_t *cb, void *data) +afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p) { - int ret = 0; - shd_dump_t *dump_data = NULL; - shd_event_t *shd_event = NULL; - - dump_data = data; - shd_event = cb->data; - if (shd_event->child != dump_data->child) - goto out; - ret = _add_path_to_dict (dump_data->this, dump_data->dict, - dump_data->child, shd_event->path, &cb->tv, - _gf_false); -out: - return ret; + loc_t loc = {0,}; + char *path = NULL; + dict_t *xattr = NULL; + int ret = 0; + + uuid_copy (loc.gfid, gfid); + loc.inode = inode_new (this->itable); + + ret = syncop_getxattr (subvol, &loc, &xattr, GFID_TO_PATH_KEY); + loc_wipe (&loc); + if (ret) + return ret; + + ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); + if (ret || !path) + return -EINVAL; + + *path_p = gf_strdup (path); + if (!*path_p) + return -ENOMEM; + return 0; } + int -_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) +afr_shd_gather_index_entries (xlator_t *this, int child, dict_t *output) { - int ret = 0; - shd_dump_t *dump_data = NULL; - shd_crawl_event_t *shd_event = NULL; - - dump_data = data; - shd_event = cb->data; - ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, - dump_data->child, shd_event, &cb->tv); - return ret; + fd_t *fd = NULL; + xlator_t *subvol = NULL; + afr_private_t *priv = NULL; + off_t offset = 0; + gf_dirent_t entries; + gf_dirent_t *entry = NULL; + uuid_t gfid; + int ret = 0; + int count = 0; + char *path = NULL; + + priv = this->private; + subvol = priv->children[child]; + + fd = afr_shd_index_opendir (this, child); + if (!fd) { + gf_log (this->name, GF_LOG_WARNING, + "unable to opendir index-dir on %s", subvol->name); + return -errno; + } + + INIT_LIST_HEAD (&entries.list); + + while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { + if (ret > 0) + ret = 0; + list_for_each_entry 
(entry, &entries.list, list) { + offset = entry->d_off; + + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + + gf_log (this->name, GF_LOG_DEBUG, "got entry: %s", + entry->d_name); + + ret = uuid_parse (entry->d_name, gfid); + if (ret) + continue; + + path = NULL; + ret = afr_shd_gfid_to_path (this, subvol, gfid, &path); + + if (ret == -ENOENT || ret == -ESTALE) { + afr_shd_index_purge (subvol, fd->inode, + entry->d_name); + ret = 0; + continue; + } + + ret = afr_shd_dict_add_path (this, output, child, path, + NULL); + } + + gf_dirent_free (&entries); + if (ret) + break; + } + + if (fd) + fd_unref (fd); + if (!ret) + ret = count; + return ret; } + int -_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) +afr_add_shd_event (circular_buffer_t *cb, void *data) { - shd_dump_t dump_data = {0}; - - dump_data.this = this; - dump_data.dict = dict; - dump_data.child = child; - eh_dump (eh, &dump_data, _add_event_to_dict); - return 0; + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + shd_event_t *shd_event = NULL; + char *path = NULL; + + output = data; + priv = this->private; + shd = &priv->shd; + shd_event = cb->data; + + if (!shd->index_healers[shd_event->child].local) + return 0; + + path = gf_strdup (shd_event->path); + if (!path) + return -ENOMEM; + + afr_shd_dict_add_path (this, output, shd_event->child, path, + &cb->tv); + return 0; } - int -_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) +afr_add_crawl_event (circular_buffer_t *cb, void *data) { - shd_dump_t dump_data = {0}; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - - priv = this->private; - shd = &priv->shd; + dict_t *output = NULL; + xlator_t *this = THIS; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + crawl_event_t *crawl_event = NULL; - dump_data.this = this; - dump_data.dict = dict; - dump_data.child = child; - eh_dump (shd->statistics[child], 
&dump_data, - _add_crawl_event_statistics_to_dict); - return 0; + output = data; + priv = this->private; + shd = &priv->shd; + crawl_event = cb->data; -} + if (!shd->index_healers[crawl_event->child].local) + return 0; -void -_remove_stale_index (xlator_t *this, xlator_t *readdir_xl, - loc_t *parent, char *fname) -{ - int ret = 0; - loc_t index_loc = {0}; + afr_shd_dict_add_crawl_event (this, output, crawl_event); - ret = _build_index_loc (this, &index_loc, fname, parent); - if (ret) - goto out; - gf_log (this->name, GF_LOG_DEBUG, "Removing stale index " - "for %s on %s", index_loc.name, readdir_xl->name); - ret = syncop_unlink (readdir_xl, &index_loc); - if((ret < 0) && (-ret != ENOENT)) { - gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index " - "on %s - %s",index_loc.name, readdir_xl->name, - strerror (-ret)); - } - index_loc.path = NULL; - loc_wipe (&index_loc); -out: - return; + return 0; } + int -_count_hard_links_under_base_indices_dir (xlator_t *this, - afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, loc_t *childloc, - loc_t *parentloc, struct iatt *iattr) +afr_selfheal_daemon_init (xlator_t *this) { - xlator_t *readdir_xl = crawl_data->readdir_xl; - struct iatt parent = {0}; - int ret = 0; - dict_t *output = NULL; - int xl_id = 0; - char key[256] = {0}; - int child = -1; - uint64_t hardlinks = 0; - - output = crawl_data->op_data; - child = crawl_data->child; - - ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); - if (ret) { - ret = -1; - goto out; - } - - ret = dict_get_int32 (output, this->name, &xl_id); - if (ret) + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = -1; + int i = 0; + + priv = this->private; + shd = &priv->shd; + + this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); + if (!this->itable) + goto out; + + shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->index_healers) + goto out; + + for (i = 0; i < 
priv->child_count; i++) { + shd->index_healers[i].subvol = i; + ret = afr_shd_healer_init (this, &shd->index_healers[i]); + if (ret) + goto out; + } + + shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers), + priv->child_count, + gf_afr_mt_subvol_healer_t); + if (!shd->full_healers) + goto out; + for (i = 0; i < priv->child_count; i++) { + shd->full_healers[i].subvol = i; + ret = afr_shd_healer_init (this, &shd->full_healers[i]); + if (ret) + goto out; + } + + shd->healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->healed) + goto out; + + shd->heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->heal_failed) + goto out; + + shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + afr_destroy_shd_event_data); + if (!shd->split_brain) + goto out; + + shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!shd->statistics) goto out; - snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); - ret = dict_get_uint64 (output, key, &hardlinks); - - /*Removing the count of base_entry under indices/base_indicies and - * entry under indices/xattrop */ - hardlinks = hardlinks + iattr->ia_nlink - 2; - ret = dict_set_uint64 (output, key, hardlinks); - if (ret) - goto out; + for (i = 0; i < priv->child_count ; i++) { + shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + afr_destroy_crawl_event_data); + if (!shd->statistics[i]) + goto out; + shd->full_healers[i].crawl_event.child = i; + shd->full_healers[i].crawl_event.crawl_type = "FULL"; + shd->index_healers[i].crawl_event.child = i; + shd->index_healers[i].crawl_event.crawl_type = "INDEX"; + } + ret = 0; out: - return ret; + return ret; } + int -_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, - loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +afr_selfheal_childup (xlator_t *this, int subvol) { - dict_t *output = 
NULL; - xlator_t *readdir_xl = NULL; - int ret = -1; - char *path = NULL; - gf_boolean_t missing = _gf_false; - char gfid_str[64] = {0}; + afr_shd_index_healer_spawn (this, subvol); - if (uuid_is_null (childloc->gfid)) - goto out; - - output = crawl_data->op_data; - readdir_xl = crawl_data->readdir_xl; - - ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path, - &missing); - if (ret == 0) { - ret = _add_path_to_dict (this, output, crawl_data->child, path, - NULL, _gf_true); - } else if (missing) { - _remove_stale_index (this, readdir_xl, parentloc, - uuid_utoa_r (childloc->gfid, gfid_str)); - } - -out: - if (ret && path) - GF_FREE (path); - return ret; + return 0; } -void -_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, - int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, - afr_crawl_data_t *crawl_data) + +int64_t +afr_shd_get_index_count (xlator_t *this, int i) { - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - eh_t *eh = NULL; - char *path = NULL; - char gfid_str[64] = {0}; - shd_event_t *event = NULL; - int32_t sh_failed = 0; - gf_boolean_t split_brain = 0; - int32_t actual_sh_done = 0; - shd_crawl_event_t **shd_crawl_event = NULL; - - priv = this->private; - shd = &priv->shd; - if (crawl_data->crawl == INDEX) { - if ((op_ret < 0) && (op_errno == ENOENT)) { - _remove_stale_index (this, crawl_data->readdir_xl, - parent, uuid_utoa_r (child->gfid, - gfid_str)); - goto out; - } - ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl, - child, &path, NULL); - if (ret) - goto out; - } else { - path = gf_strdup (child->path); - if (!path) { - ret = -1; - goto out; - } - } + afr_private_t *priv = NULL; + xlator_t *subvol = NULL; + uint64_t count = 0; + loc_t rootloc = {0, }; + dict_t *xattr = NULL; + int ret = -1; + + priv = this->private; + subvol = priv->children[i]; + + rootloc.inode = inode_ref (this->itable->root); + uuid_copy (rootloc.gfid, rootloc.inode->gfid); + + ret = syncop_getxattr (subvol, 
&rootloc, &xattr, + GF_XATTROP_INDEX_COUNT); + loc_wipe (&rootloc); + + if (ret < 0) + return -1; + + ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, &count); + if (ret) + return -1; + return count; +} - if (xattr_rsp) { - ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed); - ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); - } - shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); - - split_brain = afr_is_split_brain (this, child->inode); - if ((op_ret < 0 && op_errno == EIO) || split_brain) { - eh = shd->split_brain; - shd_crawl_event[crawl_data->child]->split_brain_count += 1; - } else if ((op_ret < 0) || sh_failed) { - eh = shd->heal_failed; - shd_crawl_event[crawl_data->child]->heal_failed_count += 1; - } else if (actual_sh_done == 1) { - eh = shd->healed; - shd_crawl_event[crawl_data->child]->healed_count += 1; - } - ret = -1; +int +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) +{ + gf_xl_afr_op_t op = GF_AFR_OP_INVALID; + int ret = 0; + int xl_id = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + struct subvol_healer *healer = NULL; + int i = 0; + char key[64]; + int op_ret = 0; + int64_t cnt = 0; - if (eh != NULL) { - event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t); - if (!event) - goto out; - event->child = crawl_data->child; - event->path = path; + priv = this->private; + shd = &priv->shd; - ret = eh_save_history (eh, event); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "%s:Failed to save " - "to event history, (%d, %s)", path, op_ret, - strerror (op_errno)); - - goto out; - } - } else { - gf_log (this->name, GF_LOG_DEBUG, "%s:Self heal already done ", - path); - - } - ret = 0; -out: - if (ret && path) - GF_FREE (path); - return; -} - -int -_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr) -{ - inode_t *link_inode = NULL; - int ret = -1; - - link_inode = inode_link (loc->inode, NULL, NULL, iattr); - if (link_inode == NULL) { - gf_log 
(this->name, GF_LOG_ERROR, "inode link failed " - "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); - goto out; - } - inode_unref (loc->inode); - loc->inode = link_inode; - ret = 0; -out: - return ret; -} - -int -_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, - loc_t *child, loc_t *parent, struct iatt *iattr) -{ - struct iatt parentbuf = {0}; - int ret = 0; - dict_t *xattr_rsp = NULL; - dict_t *xattr_req = NULL; - - xattr_req = dict_new (); - if (!xattr_req) { - errno = ENOMEM; - ret = -1; - goto out; - } - - ret = dict_set_int32 (xattr_req, "attempt-self-heal", 1); - - gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); - - ret = syncop_lookup (this, child, xattr_req, - iattr, &xattr_rsp, &parentbuf); - _crawl_post_sh_action (this, parent, child, ret, -ret, xattr_rsp, - crawl_data); - if (ret < 0) - ret = -1; - if (xattr_rsp) - dict_unref (xattr_rsp); - if (ret == 0) - ret = _link_inode_update_loc (this, child, iattr); - -out: - if (xattr_req) - dict_unref(xattr_req); - return ret; -} - -static int -afr_crawl_done (int ret, call_frame_t *sync_frame, void *data) -{ - GF_FREE (data); - STACK_DESTROY (sync_frame->root); - return 0; -} - -int -_get_heal_op_flags (shd_crawl_op op, afr_crawl_type_t crawl) -{ - int crawl_flags = 0; - - if (HEAL == op) { - crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL; - - if (crawl == INDEX) - crawl_flags |= STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL; - } - - return crawl_flags; -} - -void -_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl) -{ - afr_start_crawl (this, child, crawl, _self_heal_entry, - NULL, _gf_true, _get_heal_op_flags (HEAL, crawl), - afr_crawl_done); -} - -gf_boolean_t -_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - gf_boolean_t proceed = _gf_false; - char *msg = NULL; - - priv = this->private; - shd = &priv->shd; - if (!shd->enabled) { - msg = "Self-heal daemon 
is not enabled"; - gf_log (this->name, GF_LOG_DEBUG, "%s", msg); - goto out; - } - - if (!priv->child_up[child]) { - gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , " - "subvol went down", priv->children[child]->name); - msg = "Brick is Not connected"; - goto out; - } - - if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) { - if (afr_up_children_count (priv->child_up, - priv->child_count) < 2) { - gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as " - "< 2 children are up"); - msg = "< 2 bricks in replica are running"; - goto out; - } - } - - if (crawl_flags & STOP_INDEX_CRAWL_ON_PENDING_FULL_CRAWL) { - if (shd->pending[child] == FULL) { - gf_log (this->name, GF_LOG_INFO, "Stopping index " - "self-heal as Full self-heal is pending on %s", - priv->children[child]->name); - msg = "Full crawl is pending"; - goto out; - } - } - - proceed = _gf_true; -out: - if (reason) - *reason = msg; - return proceed; -} - -int -_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, - shd_crawl_op op, dict_t *output) -{ - afr_private_t *priv = NULL; - char *status = NULL; - char *subkey = NULL; - char key[256] = {0}; - shd_pos_t pos_data = {0}; - int op_ret = -1; - int xl_id = -1; - int i = 0; - int ret = 0; - int crawl_flags = 0; - - priv = this->private; - crawl_flags = _get_heal_op_flags (op, crawl); - - if (output) { - ret = dict_get_int32 (output, this->name, &xl_id); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Invalid input, " - "translator-id is not available"); - goto out; - } - } - pos_data.this = this; - subkey = "status"; - for (i = 0; i < priv->child_count; i++) { - if (_crawl_proceed (this, i, crawl_flags, &status)) { - pos_data.child = i; - /* - * We're already in a synctask in this case, so we - * don't need to defer through a second (and in fact - * that can cause deadlock). Just call straight - * through instead. 
- */ - ret = afr_find_child_position(pos_data.this, - pos_data.child, - &pos_data.pos); - if (ret) { - status = "Not able to find brick location"; - } else if (pos_data.pos == AFR_POS_REMOTE) { - status = "brick is remote"; - } else { - op_ret = 0; - if (op == HEAL) { - status = "Started self-heal"; - _do_self_heal_on_subvol (this, i, - crawl); - } else if (output && (op == INFO)) { - status = ""; - afr_start_crawl (this, i, INDEX, - _add_summary_to_dict, - output, _gf_false, 0, - NULL); - } else if (output && - (op == STATISTICS_TO_BE_HEALED)) { - status = ""; - afr_start_crawl (this, i, - INDEX_TO_BE_HEALED, - _count_hard_links_under_base_indices_dir, - output, _gf_false, - 0, NULL); - } - } - if (output) { - snprintf (key, sizeof (key), "%d-%d-%s", xl_id, - i, subkey); - ret = dict_set_str (output, key, status); - } - if (!op_ret && (crawl == FULL)) - break; - } - if (output) { - snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i, - subkey); - ret = dict_set_str (output, key, status); - } - } -out: - return op_ret; -} - -int -_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, - dict_t *output) -{ - return _do_crawl_op_on_local_subvols (this, crawl, HEAL, output); -} - -int -_get_index_summary_on_local_subvols (xlator_t *this, dict_t *output) -{ - return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); -} - -void -afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int i = 0; - priv = this->private; - shd= &priv->shd; - for (i = 0; i < priv->child_count; i++) { - if (shd->pos[i] != AFR_POS_LOCAL) - continue; - _add_statistics_to_dict (this, dict, i); - } - - return ; -} - -static void -reset_crawl_event (shd_crawl_event_t *crawl_event) -{ - crawl_event->healed_count = 0; - crawl_event->split_brain_count = 0; - crawl_event->heal_failed_count = 0; - GF_FREE (crawl_event->start_time_str); - crawl_event->start_time_str = NULL; - 
crawl_event->end_time_str = NULL; - crawl_event->crawl_type = NULL; - crawl_event->crawl_inprogress = _gf_false; - return; -} - -static void -afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst) -{ - dst->healed_count = src->healed_count; - dst->split_brain_count = src->split_brain_count; - dst->heal_failed_count = src->heal_failed_count; - dst->start_time_str = gf_strdup (src->start_time_str); - dst->end_time_str = "Crawl is already in progress"; - dst->crawl_type = src->crawl_type; - dst->crawl_inprogress = _gf_true; - return; -} - -static int -afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict) -{ - shd_crawl_event_t *evnt = NULL; - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int i = 0; - priv = this->private; - shd = &priv->shd; - - evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t), - gf_afr_mt_shd_crawl_event_t); - if (!evnt) { - ret = -1; - goto out; - } - LOCK (&priv->lock); - { - for (i = 0; i < priv->child_count; i++) { - if (shd->pos[i] != AFR_POS_LOCAL) - continue; - - reset_crawl_event (evnt); - - if (!shd->crawl_events[i]) { - continue; - } - - afr_copy_crawl_event_struct (shd->crawl_events[i], - evnt); - _add_crawl_stats_to_dict (this, dict, i, evnt, NULL); - - } - } - UNLOCK (&priv->lock); - reset_crawl_event (evnt); - GF_FREE (evnt); - -out: - return ret; -} - -static int -_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) -{ - int ret = 0; - afr_fill_completed_crawl_statistics_to_dict (this, dict); - ret = afr_fill_crawl_statistics_of_running_crawl (this, dict); - return ret; -} -int -_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int i = 0; - - priv = this->private; - shd = &priv->shd; - - for (i = 0; i < priv->child_count; i++) { - if (shd->pos[i] != AFR_POS_LOCAL) - continue; - _add_eh_to_dict (this, eh, dict, i); - } - return 0; -} - -int -afr_xl_op 
(xlator_t *this, dict_t *input, dict_t *output) -{ - gf_xl_afr_op_t op = GF_AFR_OP_INVALID; - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int xl_id = 0; - - priv = this->private; - shd = &priv->shd; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == -1) + goto out; ret = dict_get_int32 (input, "xl-op", (int32_t*)&op); if (ret) @@ -980,856 +1149,108 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) goto out; switch (op) { case GF_AFR_OP_HEAL_INDEX: - ret = _do_self_heal_on_local_subvols (this, INDEX, output); + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->index_healers[i]; + snprintf (key, 64, "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + ret = dict_set_str (output, key, + "Brick is not connected"); + } else if (AFR_COUNT (priv->child_up, + priv->child_count) < 2) { + ret = dict_set_str (output, key, + "< 2 bricks in replica are up"); + } else if (!afr_shd_is_subvol_local (this, healer->subvol)) { + ret = dict_set_str (output, key, + "Brick is remote"); + } else { + ret = dict_set_str (output, key, + "Started self-heal"); + afr_shd_index_healer_spawn (this, i); + op_ret = 0; + } + } break; case GF_AFR_OP_HEAL_FULL: - ret = _do_self_heal_on_local_subvols (this, FULL, output); + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + healer = &shd->full_healers[i]; + snprintf (key, 64, "%d-%d-status", xl_id, i); + + if (!priv->child_up[i]) { + ret = dict_set_str (output, key, + "Brick is not connected"); + } else if (AFR_COUNT (priv->child_up, + priv->child_count) < 2) { + ret = dict_set_str (output, key, + "< 2 bricks in replica are up"); + } else if (!afr_shd_is_subvol_local (this, healer->subvol)) { + ret = dict_set_str (output, key, + "Brick is remote"); + } else { + ret = dict_set_str (output, key, + "Started self-heal"); + afr_shd_full_healer_spawn (this, i); + op_ret = 0; + } + } break; case GF_AFR_OP_INDEX_SUMMARY: - 
(void)_get_index_summary_on_local_subvols (this, output); - ret = 0; + for (i = 0; i < priv->child_count; i++) + if (shd->index_healers[i].local) + afr_shd_gather_index_entries (this, i, output); break; case GF_AFR_OP_HEALED_FILES: - ret = _add_local_subvols_eh_to_dict (this, shd->healed, output); + eh_dump (shd->healed, output, afr_add_shd_event); break; case GF_AFR_OP_HEAL_FAILED_FILES: - ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed, - output); + eh_dump (shd->heal_failed, output, afr_add_shd_event); break; case GF_AFR_OP_SPLIT_BRAIN_FILES: - ret = _add_local_subvols_eh_to_dict (this, shd->split_brain, - output); + eh_dump (shd->split_brain, output, afr_add_shd_event); break; case GF_AFR_OP_STATISTICS: - ret = _add_local_subvols_crawl_statistics_to_dict (this, output); + for (i = 0; i < priv->child_count; i++) { + eh_dump (shd->statistics[i], output, + afr_add_crawl_event); + afr_shd_dict_add_crawl_event (this, output, + &shd->index_healers[i].crawl_event); + afr_shd_dict_add_crawl_event (this, output, + &shd->full_healers[i].crawl_event); + } break; case GF_AFR_OP_STATISTICS_HEAL_COUNT: case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: - ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, - STATISTICS_TO_BE_HEALED, - output); + op_ret = -1; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i]) { + snprintf (key, 64, "%d-%d-status", xl_id, i); + ret = dict_set_str (output, key, + "Brick is not connected"); + } else { + snprintf (key, 64, "%d-%d-hardlinks", xl_id, i); + cnt = afr_shd_get_index_count (this, i); + if (cnt >= 0) { + ret = dict_set_uint64 (output, key, cnt); + } + op_ret = 0; + } + } + +// ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, +// STATISTICS_TO_BE_HEALED, +// output); break; + default: gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); break; } out: dict_del (output, this->name); - return ret; -} - -void -afr_poll_self_heal (void *data) -{ - afr_private_t *priv = NULL; - 
afr_self_heald_t *shd = NULL; - struct timespec timeout = {0}; - xlator_t *this = NULL; - long child = (long)data; - gf_timer_t *old_timer = NULL; - gf_timer_t *new_timer = NULL; - shd_pos_t pos_data = {0}; - int ret = 0; - - this = THIS; - priv = this->private; - shd = &priv->shd; - - if (shd->pos[child] == AFR_POS_UNKNOWN) { - pos_data.this = this; - pos_data.child = child; - ret = synctask_new (this->ctx->env, - afr_syncop_find_child_position, - NULL, NULL, &pos_data); - if (!ret) - shd->pos[child] = pos_data.pos; - } - if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) - _do_self_heal_on_subvol (this, child, INDEX); - timeout.tv_sec = shd->timeout; - timeout.tv_nsec = 0; - //notify and previous timer should be synchronized. - LOCK (&priv->lock); - { - old_timer = shd->timer[child]; - if (shd->pos[child] == AFR_POS_REMOTE) - goto unlock; - shd->timer[child] = gf_timer_call_after (this->ctx, timeout, - afr_poll_self_heal, - data); - new_timer = shd->timer[child]; - } -unlock: - UNLOCK (&priv->lock); - - if (old_timer) - gf_timer_call_cancel (this->ctx, old_timer); - if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) { - gf_log (this->name, GF_LOG_WARNING, - "Could not create self-heal polling timer for %s", - priv->children[child]->name); - } - return; -} - -static int -afr_handle_child_up (int ret, call_frame_t *sync_frame, void *data) -{ - afr_self_heald_t *shd = NULL; - shd_pos_t *pos_data = data; - afr_private_t *priv = NULL; - - if (ret) - goto out; - - priv = pos_data->this->private; - shd = &priv->shd; - shd->pos[pos_data->child] = pos_data->pos; - if (pos_data->pos != AFR_POS_REMOTE) - afr_poll_self_heal ((void*)(long)pos_data->child); - _do_self_heal_on_local_subvols (THIS, INDEX, NULL); -out: - GF_FREE (data); - return 0; -} - -void -afr_proactive_self_heal (void *data) -{ - xlator_t *this = NULL; - long child = (long)data; - shd_pos_t *pos_data = NULL; - int ret = 0; - - this = THIS; - - //Position of brick could have changed and it could be 
local now. - //Compute the position again - pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t); - if (!pos_data) - goto out; - pos_data->this = this; - pos_data->child = child; - ret = synctask_new (this->ctx->env, afr_syncop_find_child_position, - afr_handle_child_up, NULL, pos_data); - if (ret) - goto out; -out: - return; -} - -static int -get_pathinfo_host (char *pathinfo, char *hostname, size_t size) -{ - char *start = NULL; - char *end = NULL; - int ret = -1; - int i = 0; - - if (!pathinfo) - goto out; - - start = strchr (pathinfo, ':'); - if (!start) - goto out; - end = strrchr (pathinfo, ':'); - if (start == end) - goto out; - - memset (hostname, 0, size); - i = 0; - while (++start != end) - hostname[i++] = *start; - ret = 0; -out: - return ret; -} - -int -afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) -{ - int ret = 0; - char pathinfohost[1024] = {0}; - char localhost[1024] = {0}; - xlator_t *this = THIS; - - *local = _gf_false; - ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost)); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s", - pathinfo); - goto out; - } - - ret = gethostname (localhost, sizeof (localhost)); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, " - "reason: %s", strerror (errno)); - goto out; - } - - if (!strcmp (localhost, pathinfohost)) - *local = _gf_true; -out: - return ret; -} - -int -afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, - loc_t *dirloc) -{ - afr_private_t *priv = NULL; - dict_t *xattr = NULL; - void *index_gfid = NULL; - void *base_indices_holder_vgfid = NULL; - loc_t rootloc = {0}; - struct iatt iattr = {0}; - struct iatt parent = {0}; - int ret = 0; - xlator_t *readdir_xl = crawl_data->readdir_xl; - - priv = this->private; - if (crawl_data->crawl == FULL) { - afr_build_root_loc (this, dirloc); - } else if (crawl_data->crawl == INDEX) { - afr_build_root_loc (this, &rootloc); - ret = syncop_getxattr 
(readdir_xl, &rootloc, &xattr, - GF_XATTROP_INDEX_GFID); - if (ret < 0) { - ret = -1; - goto out; - } - ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "failed to get index " - "dir gfid on %s", readdir_xl->name); - goto out; - } - if (!index_gfid) { - gf_log (this->name, GF_LOG_ERROR, "index gfid empty " - "on %s", readdir_xl->name); - ret = -1; - goto out; - } - uuid_copy (dirloc->gfid, index_gfid); - dirloc->path = ""; - dirloc->inode = inode_new (priv->root_inode->table); - ret = syncop_lookup (readdir_xl, dirloc, NULL, - &iattr, NULL, &parent); - if (ret < 0) { - if (-ret != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, "lookup " - "failed on index dir on %s - (%s)", - readdir_xl->name, strerror (-ret)); - } - ret = -1; - goto out; - } - ret = _link_inode_update_loc (this, dirloc, &iattr); - if (ret) - goto out; - } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { - afr_build_root_loc (this, &rootloc); - ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, - GF_BASE_INDICES_HOLDER_GFID); - if (ret < 0) { - ret = -1; - goto out; - } - ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, - &base_indices_holder_vgfid); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "index gfid empty " - "on %s", readdir_xl->name); - ret = -1; - goto out; - } - if (!base_indices_holder_vgfid) { - gf_log (this->name, GF_LOG_ERROR, "Base indices holder" - "virtual gfid is null on %s", readdir_xl->name); - ret = -1; - goto out; - } - uuid_copy (dirloc->gfid, base_indices_holder_vgfid); - dirloc->path = ""; - dirloc->inode = inode_new (priv->root_inode->table); - ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, - &parent); - if (ret < 0) { - if (-ret != ENOENT) { - gf_log (this->name, GF_LOG_ERROR, "lookup " - "failed for base_indices_holder dir" - " on %s - (%s)", readdir_xl->name, - strerror (-ret)); - - } else { - gf_log (this->name, GF_LOG_ERROR, "base_indices" - "_holder is not yet 
created."); - } - ret = -1; - goto out; - } - ret = _link_inode_update_loc (this, dirloc, &iattr); - if (ret) - goto out; - } - ret = 0; -out: - if (xattr) - dict_unref (xattr); - loc_wipe (&rootloc); - return ret; -} - -int -afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd, - loc_t *dirloc) -{ - fd_t *fd = NULL; - int ret = 0; - - if (crawl_data->crawl == FULL) { - fd = fd_create (dirloc->inode, crawl_data->pid); - if (!fd) { - gf_log (this->name, GF_LOG_ERROR, - "Failed to create fd for %s", dirloc->path); - ret = -1; - goto out; - } - - ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, - "opendir failed on %s", dirloc->path); - ret = -1; - goto out; - } - } else { - fd = fd_anonymous (dirloc->inode); - } - ret = 0; -out: - if (!ret) - *dirfd = fd; - return ret; -} - -xlator_t* -afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data) -{ - afr_private_t *priv = this->private; - - if (crawl_data->crawl == FULL) { - return this; - } else { - return priv->children[crawl_data->child]; - } - return NULL; -} - -int -afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, - gf_dirent_t *entry, afr_crawl_data_t *crawl_data) -{ - int ret = -1; - afr_private_t *priv = NULL; - - priv = this->private; - if (crawl_data->crawl == FULL) { - ret = afr_build_child_loc (this, child, parent, entry->d_name); - } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { - ret = _build_index_loc (this, child, entry->d_name, parent); - if (ret) - goto out; - child->inode = inode_new (priv->root_inode->table); - if (!child->inode) { - ret = -1; - goto out; - } - child->path = NULL; - } else { - child->inode = inode_new (priv->root_inode->table); - if (!child->inode) - goto out; - uuid_parse (entry->d_name, child->gfid); - ret = _loc_assign_gfid_path (child); - } -out: - return ret; -} - -static int -_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, 
- off_t *offset, afr_crawl_data_t *crawl_data) -{ - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - int ret = 0; - loc_t entry_loc = {0}; - fd_t *fd = NULL; - struct iatt iattr = {0}; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - if (!_crawl_proceed (this, crawl_data->child, - crawl_data->crawl_flags, NULL)) { - ret = -1; - goto out; - } - *offset = entry->d_off; - if (IS_ENTRY_CWD (entry->d_name) || - IS_ENTRY_PARENT (entry->d_name)) - continue; - if ((crawl_data->crawl == FULL) && - uuid_is_null (entry->d_stat.ia_gfid)) { - gf_log (this->name, GF_LOG_WARNING, "%s/%s: No " - "gfid present skipping", - parentloc->path, entry->d_name); - continue; - } - - loc_wipe (&entry_loc); - ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc, - entry, crawl_data); - if (ret) - goto out; - - ret = crawl_data->process_entry (this, crawl_data, entry, - &entry_loc, parentloc, &iattr); - - if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { - goto out; - } else if (ret) { - continue; - } - - if ((crawl_data->crawl == INDEX) || - (crawl_data->crawl == INDEX_TO_BE_HEALED)) - continue; - - if (!IA_ISDIR (iattr.ia_type)) - continue; - fd = NULL; - ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc); - if (ret) - continue; - ret = _crawl_directory (fd, &entry_loc, crawl_data); - if (fd) - fd_unref (fd); - } - ret = 0; -out: - if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) { - gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " - "count"); - } - loc_wipe (&entry_loc); - return ret; -} - -static int -_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) -{ - xlator_t *this = NULL; - off_t offset = 0; - gf_dirent_t entries; - int ret = 0; - gf_boolean_t free_entries = _gf_false; - xlator_t *readdir_xl = crawl_data->readdir_xl; - - INIT_LIST_HEAD (&entries.list); - this = THIS; - - GF_ASSERT (loc->inode); - - if (crawl_data->crawl == FULL) - gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path); - else 
- gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s", - uuid_utoa (loc->gfid)); - - while (1) { - if (crawl_data->crawl == FULL) - ret = syncop_readdirp (readdir_xl, fd, 131072, offset, - NULL, &entries); - else - ret = syncop_readdir (readdir_xl, fd, 131072, offset, - &entries); - if (ret < 0) { - ret = -1; - break; - } else if (ret == 0) { - break; - } - - ret = 0; - free_entries = _gf_true; - - if (!_crawl_proceed (this, crawl_data->child, - crawl_data->crawl_flags, NULL)) { - ret = -1; - goto out; - } - if (list_empty (&entries.list)) - goto out; - - ret = _process_entries (this, loc, &entries, &offset, - crawl_data); - if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) { - goto out; - } - gf_dirent_free (&entries); - free_entries = _gf_false; - } - ret = 0; -out: - if (free_entries) - gf_dirent_free (&entries); - return ret; -} - -static char* -position_str_get (afr_child_pos_t pos) -{ - switch (pos) { - case AFR_POS_UNKNOWN: - return "unknown"; - case AFR_POS_LOCAL: - return "local"; - case AFR_POS_REMOTE: - return "remote"; - } - return NULL; -} - -int -afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - dict_t *xattr_rsp = NULL; - loc_t loc = {0}; - int ret = 0; - char *node_uuid = NULL; - - priv = this->private; - shd = &priv->shd; - - afr_build_root_loc (this, &loc); - - ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp, - GF_XATTR_NODE_UUID_KEY); - if (ret < 0) { - gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - " - "(%s)", priv->children[child]->name, strerror (-ret)); - ret = -1; - goto out; - } - - ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on " - "child %s", priv->children[child]->name); - goto out; - } - - if (!strcmp (node_uuid, shd->node_uuid)) - *pos = AFR_POS_LOCAL; - else - *pos = AFR_POS_REMOTE; - - gf_log (this->name, 
GF_LOG_DEBUG, "child %s is %s", - priv->children[child]->name, position_str_get (*pos)); -out: - if (ret) - *pos = AFR_POS_UNKNOWN; - loc_wipe (&loc); - return ret; -} - -int -afr_syncop_find_child_position (void *data) -{ - shd_pos_t *pos_data = data; - int ret = 0; - - ret = afr_find_child_position (pos_data->this, pos_data->child, - &pos_data->pos); - return ret; -} - -static int -afr_dir_crawl (void *data) -{ - xlator_t *this = NULL; - int ret = -1; - xlator_t *readdir_xl = NULL; - fd_t *fd = NULL; - loc_t dirloc = {0}; - afr_crawl_data_t *crawl_data = data; - - this = THIS; - - if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags, - NULL)) - goto out; - - readdir_xl = afr_crawl_readdir_xl_get (this, crawl_data); - if (!readdir_xl) - goto out; - crawl_data->readdir_xl = readdir_xl; - - ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc); - if (ret) - goto out; - - ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); - if (ret) { - if (crawl_data->crawl == INDEX_TO_BE_HEALED) { - gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" - "indices_holder"); - } - goto out; - } - - ret = _crawl_directory (fd, &dirloc, crawl_data); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s", - readdir_xl->name); - else - gf_log (this->name, GF_LOG_DEBUG, "Crawl completed " - "on %s", readdir_xl->name); - if (crawl_data->crawl == INDEX) - dirloc.path = NULL; -out: - if (fd) - fd_unref (fd); - if ((crawl_data->crawl == INDEX) || - (crawl_data->crawl == INDEX_TO_BE_HEALED )) - dirloc.path = NULL; - loc_wipe (&dirloc); - return ret; -} - -char * -get_crawl_type_in_string (afr_crawl_type_t crawl) -{ - char *index = "INDEX"; - char *full = "FULL"; - char *crawl_type = NULL; - - if (crawl == INDEX){ - crawl_type = index; - } else if (crawl == FULL) { - crawl_type = full; - } - - return crawl_type; -} - -static int -afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) -{ - afr_private_t *priv = NULL; - 
afr_self_heald_t *shd = NULL; - int ret = 0; - shd_crawl_event_t *crawl_event = NULL; - time_t get_time = 0; - - priv = this->private; - shd = &priv->shd; - - crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, - gf_afr_mt_shd_crawl_event_t); - if (!crawl_event) { - ret = -1; - goto out; - } - - get_time = time(NULL); - if (get_time == ((time_t)-1)) { - ret = -1; - goto out; - } - - crawl_event->start_time_str = gf_strdup (ctime(&get_time)); - - crawl_event->crawl_type = get_crawl_type_in_string (crawl); - if (!crawl_event->crawl_type) { - ret = -1; - goto out; - } - LOCK (&priv->lock); - { - shd->crawl_events[child] = crawl_event; - } - UNLOCK (&priv->lock); - ret = 0; -out: - return ret; - -} - -static int -afr_put_crawl_event_in_eh (xlator_t *this, int child) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - int ret = 0; - time_t get_time = 0; - shd_crawl_event_t **crawl_event = NULL; - - priv = this->private; - shd = &priv->shd; - - get_time = time(NULL); - if (get_time == ((time_t)-1)) { - ret = -1; - goto out; - } - crawl_event = (shd_crawl_event_t**)shd->crawl_events; - LOCK (&priv->lock); - { - crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); - ret = eh_save_history (shd->statistics[child], - crawl_event[child]); - crawl_event[child] = NULL; - } - UNLOCK (&priv->lock); -out: - return ret; -} - -static int -afr_dir_exclusive_crawl (void *data) -{ - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - gf_boolean_t crawl = _gf_false; - int ret = 0; - int child = -1; - xlator_t *this = NULL; - afr_crawl_data_t *crawl_data = data; - - this = THIS; - priv = this->private; - shd = &priv->shd; - child = crawl_data->child; - - LOCK (&priv->lock); - { - if (shd->inprogress[child]) { - if (shd->pending[child] != FULL) - shd->pending[child] = crawl_data->crawl; - } else { - shd->inprogress[child] = _gf_true; - crawl = _gf_true; - } - } - UNLOCK (&priv->lock); - - if (!crawl) { - gf_log (this->name, GF_LOG_INFO, "Another 
crawl is in progress " - "for %s while attempting %s heal on %s", - priv->children[child]->name, - get_crawl_type_in_string (crawl_data->crawl), - priv->children[child]->name); - goto out; - } - - do { - ret = afr_allocate_crawl_event (this, child, crawl_data->crawl); - if (ret) - goto out; - afr_dir_crawl (data); - - ret = afr_put_crawl_event_in_eh (this, child); - if (ret < 0) - goto out; - - LOCK (&priv->lock); - { - if (shd->pending[child] != NONE) { - crawl_data->crawl = shd->pending[child]; - shd->pending[child] = NONE; - } else { - shd->inprogress[child] = _gf_false; - crawl = _gf_false; - } - } - UNLOCK (&priv->lock); - } while (crawl); -out: - return ret; -} - -void -afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, - process_entry_cbk_t process_entry, void *op_data, - gf_boolean_t exclusive, int crawl_flags, - afr_crawl_done_cbk_t crawl_done) -{ - afr_private_t *priv = NULL; - call_frame_t *frame = NULL; - afr_crawl_data_t *crawl_data = NULL; - int ret = 0; - int (*crawler) (void*) = NULL; - - priv = this->private; - - frame = create_frame (this, this->ctx->pool); - if (!frame) - goto out; - - afr_set_lk_owner (frame, this, frame->root); - afr_set_low_priority (frame); - crawl_data = GF_CALLOC (1, sizeof (*crawl_data), - gf_afr_mt_crawl_data_t); - if (!crawl_data) - goto out; - crawl_data->process_entry = process_entry; - crawl_data->child = idx; - crawl_data->pid = frame->root->pid; - crawl_data->crawl = crawl; - crawl_data->op_data = op_data; - crawl_data->crawl_flags = crawl_flags; - gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s", - crawl_data->crawl, priv->children[idx]->name); - - if (exclusive) - crawler = afr_dir_exclusive_crawl; - else - crawler = afr_dir_crawl; - ret = synctask_new (this->ctx->env, crawler, - crawl_done, frame, crawl_data); - if (ret) - gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child" - " %d with ret %d", idx, ret); -out: - return; -} - -void -afr_build_root_loc (xlator_t *this, loc_t 
*loc) -{ - afr_private_t *priv = NULL; - - priv = this->private; - loc->path = gf_strdup ("/"); - loc->name = ""; - loc->inode = inode_ref (priv->root_inode); - uuid_copy (loc->gfid, loc->inode->gfid); -} - -int -afr_set_root_gfid (dict_t *dict) -{ - uuid_t gfid; - int ret = 0; - - memset (gfid, 0, 16); - gfid[15] = 1; - - ret = afr_set_dict_gfid (dict, gfid); - - return ret; + return op_ret; } diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index e0c083754..10e229ee7 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2008-2012 Red Hat, Inc. + Copyright (c) 2013 Red Hat, Inc. This file is part of GlusterFS. This file is licensed to you under your choice of the GNU Lesser @@ -8,58 +8,65 @@ cases as published by the Free Software Foundation. */ -#ifndef __AFR_SELF_HEALD_H__ -#define __AFR_SELF_HEALD_H__ -#include "xlator.h" - -#define IS_ROOT_PATH(path) (!strcmp (path, "/")) -#define IS_ENTRY_CWD(entry) (!strcmp (entry, ".")) -#define IS_ENTRY_PARENT(entry) (!strcmp (entry, "..")) -#define AFR_ALL_CHILDREN -1 - -typedef struct afr_crawl_data_ { - int child; - pid_t pid; - afr_crawl_type_t crawl; - xlator_t *readdir_xl; - void *op_data; - int crawl_flags; - int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data, - gf_dirent_t *entry, loc_t *child, loc_t *parent, - struct iatt *iattr); -} afr_crawl_data_t; - -typedef struct crawl_event_stats_ { - uint64_t healed_count; + +#ifndef _AFR_SELF_HEALD_H +#define _AFR_SELF_HEALD_H + +#include + + +typedef struct { + int child; + char *path; +} shd_event_t; + +typedef struct { + int child; + uint64_t healed_count; uint64_t split_brain_count; uint64_t heal_failed_count; - char *start_time_str; - char *end_time_str; + + /* If start_time is 0, it means crawler is not in progress + and stats are not valid */ + time_t start_time; + /* If start_time is NOT 0 and end_time is 0, it 
means + cralwer is in progress */ + time_t end_time; char *crawl_type; - gf_boolean_t crawl_inprogress; -} shd_crawl_event_t; +} crawl_event_t; -void _destroy_crawl_event_data (void *data); -void _destroy_shd_event_data (void *data); +struct subvol_healer { + xlator_t *this; + int subvol; + gf_boolean_t local; + gf_boolean_t running; + gf_boolean_t rerun; + crawl_event_t crawl_event; + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; +}; -typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, loc_t *child, loc_t *parent, - struct iatt *iattr); +typedef struct { + gf_boolean_t iamshd; + gf_boolean_t enabled; + struct subvol_healer *index_healers; + struct subvol_healer *full_healers; -void afr_build_root_loc (xlator_t *this, loc_t *loc); + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; +} afr_self_heald_t; -int afr_set_root_gfid (dict_t *dict); -void -afr_proactive_self_heal (void *data); +int +afr_selfheal_childup (xlator_t *this, int subvol); int -afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); +afr_selfheal_daemon_init (xlator_t *this); -/* - * In addition to its self-heal use, this is used to find a local default - * read_child. 
- */ int -afr_local_pathinfo (char *pathinfo, gf_boolean_t *local); -#endif /* __AFR_SELF_HEALD_H__ */ +afr_xl_op (xlator_t *this, dict_t *input, dict_t *output); + +#endif /* !_AFR_SELF_HEALD_H */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 20306e469..f974fdb59 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -18,188 +18,130 @@ #include +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this); + +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume); -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path - of RENAME */ -#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ -afr_fd_ctx_t * -__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +int +__afr_txn_write_fop (call_frame_t *frame, xlator_t *this) { - uint64_t ctx = 0; - int ret = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int i = 0; + afr_local_t *local = NULL; afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + local = frame->local; priv = this->private; - ret = __fd_ctx_get (fd, this, &ctx); - - if (ret < 0 && fd_is_anonymous (fd)) { - ret = __afr_fd_ctx_set (this, fd); - if (ret < 0) - goto out; - - ret = __fd_ctx_get (fd, this, &ctx); - if (ret < 0) - goto out; + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - for (i = 0; i < priv->child_count; i++) - fd_ctx->opened_on[i] = AFR_FD_OPENED; + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; -out: - return fd_ctx; -} - + local->call_count = call_count; -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ - afr_fd_ctx_t *fd_ctx = NULL; + for (i = 0; i < priv->child_count; 
i++) { + if (local->transaction.pre_op[i]) { + local->transaction.wind (frame, this, i); - LOCK(&fd->lock); - { - fd_ctx = __afr_fd_ctx_get (fd, this); + if (!--call_count) + break; + } } - UNLOCK(&fd->lock); - return fd_ctx; + return 0; } -static void -afr_save_lk_owner (call_frame_t *frame) +int +__afr_txn_write_done (call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; + afr_local_t *local = NULL; local = frame->local; - local->saved_lk_owner = frame->root->lk_owner; + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; } -static void -afr_restore_lk_owner (call_frame_t *frame) +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) { - afr_local_t * local = NULL; + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; local = frame->local; - frame->root->lk_owner = local->saved_lk_owner; -} - -static void -__mark_all_pending (int32_t *pending[], int child_count, - afr_transaction_type type) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (1); + LOCK (&frame->lock); + { + fop_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; } + UNLOCK (&frame->lock); + + return fop_frame; } static void -__mark_child_dead (int32_t *pending[], int child_count, int child, - afr_transaction_type type) +afr_save_lk_owner (call_frame_t *frame) { - int j = 0; + afr_local_t * local = NULL; - j = afr_index_for_transaction_type (type); + local = frame->local; - pending[child][j] = 0; + local->saved_lk_owner = frame->root->lk_owner; } static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +afr_restore_lk_owner (call_frame_t *frame) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; + afr_local_t * local = NULL; local = frame->local; - if (!local->fd) - return; - - fd_ctx = afr_fd_ctx_get (local->fd, this); - - if (!fd_ctx) - goto out; - - LOCK 
(&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]++; - } - UNLOCK (&local->fd->lock); -out: - return; -} - -static void -__mark_non_participant_children (int32_t *pending[], int child_count, - unsigned char *participants, - afr_transaction_type type) -{ - int i = 0; - int j = 0; - - j = afr_index_for_transaction_type (type); - for (i = 0; i < child_count; i++) { - if (!participants[i]) - pending[i][j] = 0; - } + frame->root->lk_owner = local->saved_lk_owner; } - void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type) +__mark_all_success (call_frame_t *frame, xlator_t *this) { - int i; - int j; - - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (-1); - } -} + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i; -void -_set_all_child_errno (int *child_errno, unsigned int child_count) -{ - int i = 0; + local = frame->local; + priv = this->private; - for (i = 0; i < child_count; i++) - if (child_errno[i] == 0) - child_errno[i] = ENOTCONN; + for (i = 0; i < priv->child_count; i++) { + local->transaction.failed_subvols[i] = 0; + } } -void + +int afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; - afr_private_t *priv = NULL; fd_t *fd = NULL; local = frame->local; - priv = this->private; fd = local->fd; - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); - - _set_all_child_errno (local->child_errno, priv->child_count); - /* Perform fops with the lk-owner from top xlator. * Eg: lk-owner of posix-lk and flush should be same, * flush cant clear the posix-lks without that lk-owner. 
@@ -208,6 +150,10 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) frame->root->lk_owner = local->transaction.main_frame->root->lk_owner; + if (local->pre_op_compat) + /* old mode, pre-op was done as afr_changelog_do() + just now, before OP */ + afr_changelog_pre_op_update (frame, this); /* The wake up needs to happen independent of what type of fop arrives here. If it was @@ -220,6 +166,8 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) if (fd) afr_delayed_changelog_wake_up (this, fd); local->transaction.fop (frame, this); + + return 0; } @@ -285,39 +233,28 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this) return op_ret; } + int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - int child, afr_xattrop_type_t op) +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) { int i = 0; int ret = 0; + int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, }; - if (op == LOCAL_FIRST) { - ret = dict_set_static_bin (xattr, priv->pending_key[child], - pending[child], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret) - goto out; - } for (i = 0; i < priv->child_count; i++) { - if (i == child) - continue; + if (!memcmp (pending_zero, pending[i], sizeof (pending_zero))) + /* don't set xattrs for non-pending servers */ + continue; + ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + pending[i], + AFR_NUM_CHANGE_LOGS * sizeof (int)); /* 3 = data+metadata+entry */ - if (ret < 0) - goto out; - } - if (op == LOCAL_LAST) { - ret = dict_set_static_bin (xattr, priv->pending_key[child], - pending[child], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); if (ret) - goto out; + break; } -out: + return ret; } @@ -346,102 +283,34 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) /* {{{ pending */ -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, 
dict_t *xattr, - dict_t *xdata) + +int +afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int call_count = -1; - priv = this->private; - local = frame->local; + local = frame->local; + priv = this->private; int_lock = &local->internal_lock; - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - if (local->transaction.resume_stub) { - call_resume (local->transaction.resume_stub); - local->transaction.resume_stub = NULL; - } + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } - } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + int_lock->lock_cbk = local->transaction.done; + afr_unlock (frame, this); + } - return 0; + return 0; } -void -afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this, - inode_t *inode, afr_transaction_type type) -{ - int i = -1; - int count = 0; - int read_child = -1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int **pending = NULL; - int idx = 0; - int32_t *stale_children = NULL; - int32_t *fresh_children = NULL; - gf_boolean_t rm_stale_children = _gf_false; - - idx = afr_index_for_transaction_type (type); - - priv = this->private; - local = frame->local; - pending = local->pending; - - if (local->op_ret < 0) - goto out; - fresh_children = local->fresh_children; - read_child = afr_inode_get_read_ctx (this, inode, fresh_children); - if (read_child < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain " - "for %s", 
uuid_utoa (inode->gfid)); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (!afr_is_child_present (fresh_children, - priv->child_count, i)) - continue; - if (pending[i][idx]) - continue; - /* child is down or op failed on it */ - if (!stale_children) - stale_children = afr_children_create (priv->child_count); - if (!stale_children) - goto out; - - rm_stale_children = _gf_true; - stale_children[count++] = i; - gf_log (this->name, GF_LOG_DEBUG, "Removing stale child " - "%d for %s", i, uuid_utoa (inode->gfid)); - } - - if (!rm_stale_children) - goto out; - - afr_inode_rm_stale_children (this, inode, stale_children); -out: - GF_FREE (stale_children); - return; -} - afr_inodelk_t* afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) { @@ -478,423 +347,468 @@ afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) return locked_nodes; } + int -afr_changelog_pre_op_call_count (afr_transaction_type type, - afr_internal_lock_t *int_lock, - unsigned int child_count) +afr_changelog_call_count (afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned int child_count) { - int call_count = 0; - unsigned char *locked_nodes = NULL; + int call_count = 0; - locked_nodes = afr_locked_nodes_get (type, int_lock); - GF_ASSERT (locked_nodes); + call_count = AFR_COUNT(pre_op_subvols, child_count); - call_count = afr_locked_children_count (locked_nodes, child_count); if (type == AFR_ENTRY_RENAME_TRANSACTION) call_count *= 2; return call_count; } -int -afr_changelog_post_op_call_count (afr_transaction_type type, - unsigned char *pre_op, - unsigned int child_count) -{ - int call_count = 0; - call_count = afr_pre_op_done_children_count (pre_op, child_count); - if (type == AFR_ENTRY_RENAME_TRANSACTION) - call_count *= 2; +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; - return call_count; -} + local = frame->local; + priv = 
this->private; -void -afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) -{ - int i = 0; - int index = 0; - int32_t postop = 0; - int32_t preop = 1; - int32_t **txn_changelog = NULL; - - txn_changelog = local->transaction.txn_changelog; - index = afr_index_for_transaction_type (local->transaction.type); for (i = 0; i < priv->child_count; i++) { - postop = ntoh32 (local->pending[i][index]); - txn_changelog[i][index] = hton32 (postop + preop); + if (local->transaction.failed_subvols[i]) + return _gf_false; } -} -afr_xattrop_type_t -afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, - afr_transaction_type type) -{ - int index = 0; - afr_xattrop_type_t op = LOCAL_LAST; - - index = afr_index_for_transaction_type (type); - if (optimized && !pending[child][index]) - op = LOCAL_FIRST; - return op; + return _gf_true; } + void -afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, - int optimized, int child) +afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this) { - int32_t **txn_changelog = NULL; - int32_t **changelog = NULL; - afr_private_t *priv = NULL; - int ret = 0; - afr_xattrop_type_t op = LOCAL_LAST; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; - priv = this->private; - txn_changelog = local->transaction.txn_changelog; - op = afr_get_postop_xattrop_type (local->pending, optimized, child, - local->transaction.type); - if (optimized) - changelog = txn_changelog; - else - changelog = local->pending; - ret = afr_set_pending_dict (priv, xattr, changelog, child, op); - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + priv = this->private; + local = frame->local; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least on subvol, + so it is not a 
failed-everywhere situation. + */ + matching_errors = _gf_false; + break; + } + i_errno = local->replies[i].op_errno; + + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. + */ + matching_errors = _gf_false; + break; + } + + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; + } + } + + if (matching_errors) + __mark_all_success (frame, this); } -gf_boolean_t -afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +int +afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int index = -1; - int i = 0; + afr_private_t * priv = this->private; + int i = 0; + int ret = 0; + int idx = 0; + afr_local_t * local = NULL; + dict_t *xattr = NULL; + int nothing_failed = 1; + gf_boolean_t need_undirty = _gf_false; local = frame->local; - priv = this->private; + idx = afr_index_for_transaction_type (local->transaction.type); - index = afr_index_for_transaction_type (local->transaction.type); + nothing_failed = afr_txn_nothing_failed (frame, this); - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) - return _gf_false; - } + if (afr_changelog_pre_op_uninherit (frame, this)) + need_undirty = _gf_false; + else + need_undirty = _gf_true; - return _gf_true; -} + if (nothing_failed && !need_undirty) { + afr_changelog_post_op_done (frame, this); + goto out; + } -static void -afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) -{ - xlator_t *this = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + xattr = dict_new (); + if (!xattr) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } - this = frame->this; - local = frame->local; - priv = this->private; + if (need_undirty) { + local->dirty[idx] = hton32(-1); - if 
((local->transaction.type != AFR_ENTRY_TRANSACTION) && - (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) - return; + ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } - if (local->op_ret >= 0) - goto out; + } + + if (!nothing_failed) { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xattr, local->pending); + if (ret < 0) { + local->op_ret = -1; + local->op_errno = ENOMEM; + afr_changelog_post_op_done (frame, this); + goto out; + } + + } - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done); out: - return; + if (xattr) + dict_unref (xattr); + + return 0; } -static void -afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) { - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - gf_boolean_t all_quota_failures = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; - local = frame->local; - priv = this->private; - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; - /* - * Idea is to not leave the file in FOOL-FOOL scenario in case on - * all the bricks data transaction failed with EDQUOT to avoid - * increasing un-necessary load of self-heals in the system. 
- */ - all_quota_failures = _gf_true; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i] && - (local->child_errno[i] != EDQUOT)) { - all_quota_failures = _gf_false; - break; - } - } - if (all_quota_failures) - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + local = frame->local; + priv = this->private; + fd = local->fd; + + type = afr_index_for_transaction_type (local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + + if (!fd) + return !local->transaction.dirtied; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; + + if (local->transaction.no_uninherit) + return _gf_false; + + /* This function must be idempotent. So check if we + were called before and return the same answer again. + + It is important to keep this function idempotent for + the call in afr_changelog_post_op_safe() to not have + side effects on the call from afr_changelog_post_op_now() + */ + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + + LOCK(&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } + } + + if (fd_ctx->inherited[type]) { + ret = _gf_true; + fd_ctx->inherited[type]--; + } else if (fd_ctx->on_disk[type]) { + ret = _gf_false; + fd_ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; + } + + if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = 0; + } + } +unlock: + UNLOCK(&fd->lock); + + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; + + return ret; } -int -afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) + +gf_boolean_t +afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = this->private; - 
afr_internal_lock_t *int_lock = NULL; - int i = 0; - int call_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; - afr_local_t * local = NULL; - afr_fd_ctx_t *fdctx = NULL; - dict_t **xattr = NULL; - int piggyback = 0; - int nothing_failed = 1; + local = frame->local; + priv = this->private; + fd = local->fd; - local = frame->local; - int_lock = &local->internal_lock; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; - __mark_non_participant_children (local->pending, priv->child_count, - local->transaction.pre_op, - local->transaction.type); + type = afr_index_for_transaction_type (local->transaction.type); - afr_data_handle_quota_errors (frame, this); - afr_dir_fop_handle_all_fop_failures (frame); + if (!fd) + return _gf_false; - if (local->fd) - afr_transaction_rm_stale_children (frame, this, - local->fd->inode, - local->transaction.type); + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - for (i = 0; i < priv->child_count; i++) { - xattr[i] = dict_new (); - } + LOCK(&fd->lock); + { + if (!fd_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; + } - call_count = afr_changelog_post_op_call_count (local->transaction.type, - local->transaction.pre_op, - priv->child_count); - local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != + fd_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } + } - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); + fd_ctx->inherited[type]++; - if (call_count == 0) { - /* no child is up */ - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - goto out; - } + ret = 
_gf_true; - nothing_failed = afr_txn_nothing_failed (frame, this); + local->transaction.inherited = _gf_true; + } +unlock: + UNLOCK(&fd->lock); - afr_compute_txn_changelog (local , priv); + return ret; +} - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - if (local->transaction.type != AFR_DATA_TRANSACTION) - afr_set_postop_dict (local, this, xattr[i], - local->optimistic_change_log, i); - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - afr_set_postop_dict (local, this, xattr[i], - 0, i); - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - break; - } +gf_boolean_t +afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + fd_t *fd = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; - /* local->transaction.postop_piggybacked[] was - precomputed in is_piggyback_postop() when called from - afr_changelog_post_op_safe() - */ + local = frame->local; + priv = this->private; + fd = local->fd; - piggyback = 0; - if (local->transaction.postop_piggybacked[i]) - piggyback = 1; + if (!fd) + return _gf_false; - afr_set_postop_dict (local, this, xattr[i], - piggyback, i); + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) + return _gf_false; - if (nothing_failed && piggyback) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], NULL); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - } - break; - case AFR_METADATA_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } + if 
(local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ + return _gf_false; - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; + if (!local->transaction.dirtied) + return _gf_false; - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - call_count--; - } + if (!afr_txn_nothing_failed (frame, this)) + return _gf_false; - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + type = afr_index_for_transaction_type (local->transaction.type); - afr_set_postop_dict (local, this, xattr[i], - local->optimistic_change_log, i); + ret = _gf_false; - /* fall through */ + LOCK(&fd->lock); + { + if (!fd_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + fd_ctx->pre_op_done[type][i] = + local->transaction.pre_op[i]; + } else { + for (i = 0; i < priv->child_count; i++) + if (fd_ctx->pre_op_done[type][i] != + local->transaction.pre_op[i]) { + local->transaction.no_uninherit = 1; + goto unlock; + } + } + fd_ctx->on_disk[type]++; + + ret = _gf_true; + } +unlock: + UNLOCK(&fd->lock); - case AFR_ENTRY_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - 
NULL); - break; - } + return ret; +} - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - } - if (!--call_count) - break; - } +int +afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; -out: - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + local = frame->local; + + if (op_ret == -1) + afr_transaction_fop_failed (frame, this, (long) cookie); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + local->transaction.changelog_resume (frame, this); return 0; } -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +int +afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume) { - afr_local_t * local = NULL; - afr_private_t * priv = this->private; - int call_count = -1; - int child_index = (long) cookie; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; - local = frame->local; + local = frame->local; + priv = this->private; - LOCK (&frame->lock); - { - switch (op_ret) { - case 0: - __mark_pre_op_done_on_fd (frame, this, child_index); - //fallthrough we need to mark the pre_op - case 1: - local->transaction.pre_op[child_index] = 1; - /* special op_ret for piggyback */ - break; - case -1: - if (op_errno == ENOTSUP) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop not supported by %s", - priv->children[child_index]->name); - local->op_ret = -1; - - } else if (!child_went_down (op_ret, 
op_errno)) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop failed on child %s: %s", - priv->children[child_index]->name, - strerror (op_errno)); + call_count = afr_changelog_call_count (local->transaction.type, + local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + changelog_resume (frame, this); + return 0; + } + + local->call_count = call_count; + + local->transaction.changelog_resume = changelog_resume; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + if (!local->fd) { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + } else { + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); } - local->op_errno = op_errno; - break; - } + break; + case AFR_ENTRY_RENAME_TRANSACTION: - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + call_count--; - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOTSUP)) { - local->transaction.resume (frame, this); - } else { - afr_transaction_perform_fop (frame, this); - } + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + if (local->fd) + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + else + STACK_WIND_COOKIE (frame, afr_changelog_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + 
&local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, + NULL); + break; + } + + if (!--call_count) + break; } - return 0; + return 0; } + int afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) { @@ -902,206 +816,122 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) int i = 0; int ret = 0; int call_count = 0; - dict_t **xattr = NULL; - afr_fd_ctx_t *fdctx = NULL; + int op_errno = 0; afr_local_t *local = NULL; - int piggyback = 0; afr_internal_lock_t *int_lock = NULL; unsigned char *locked_nodes = NULL; + unsigned char *pending_subvols = NULL; + int idx = -1; + gf_boolean_t pre_nop = _gf_true; + dict_t *xdata_req = NULL; local = frame->local; int_lock = &local->internal_lock; - - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - - for (i = 0; i < priv->child_count; i++) { - xattr[i] = dict_new (); - } - - call_count = afr_changelog_pre_op_call_count (local->transaction.type, - int_lock, - priv->child_count); - if (call_count == 0) { - local->internal_lock.lock_cbk = - local->transaction.done; - afr_unlock (frame, this); - goto out; - } - - local->call_count = call_count; - - __mark_all_pending (local->pending, priv->child_count, - local->transaction.type); - - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); + idx = afr_index_for_transaction_type (local->transaction.type); locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); - for (i = 0; i < priv->child_count; i++) { - if (!locked_nodes[i]) - continue; - ret = afr_set_pending_dict (priv, xattr[i], local->pending, - i, LOCAL_FIRST); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + pending_subvols = alloca0 (priv->child_count); - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - 
&(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - break; - } + for (i = 0; i < priv->child_count; i++) { + if (locked_nodes[i]) { + local->transaction.pre_op[i] = 1; + call_count++; + } else { + pending_subvols[i] = 1; + } + } - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_done[i]) { - fdctx->pre_op_piggyback[i]++; - piggyback = 1; - fdctx->hit++; - } else { - fdctx->miss++; - } - } - UNLOCK (&local->fd->lock); + /* TBD: quorum check w/ call_count */ - afr_set_delayed_post_op (frame, this); + if (call_count == 0) { + op_errno = ENOTCONN; + goto err; + } - if (piggyback) - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - case AFR_METADATA_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } + xdata_req = dict_new(); + if (!xdata_req) { + op_errno = ENOMEM; + goto err; + } - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; + pre_nop = _gf_true; + + if (afr_changelog_pre_op_inherit (frame, this)) + goto next; + + if (call_count < priv->child_count) { + /* For subvols we are not performing operation on, + mark them as pending up-front along with the FOP + so that we can safely defer unmarking dirty until + later. 
+ */ + for (i = 0; i < priv->child_count; i++) { + if (pending_subvols[i]) + local->pending[i][idx] = hton32(1); + } + ret = afr_set_pending_dict (priv, xdata_req, + local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } + pre_nop = _gf_false; + } - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } + if (call_count > 1 && + (local->transaction.type == AFR_DATA_TRANSACTION || + !local->optimistic_change_log)) { + + /* If we are performing change on only one subvol, no + need to mark dirty, because we are setting the pending + counts already anyways + */ + local->dirty[idx] = hton32(1); + + ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + op_errno = ENOMEM; + goto err; + } + + pre_nop = _gf_false; + local->transaction.dirtied = 1; + } - call_count--; - } + if (pre_nop) + goto next; + if (!local->pre_op_compat) { + dict_copy (xdata_req, local->xdata_req); + goto next; + } - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop); - ret = afr_set_pending_dict (priv, xattr[i], local->pending, - i, LOCAL_FIRST); + if (xdata_req) + dict_unref (xdata_req); - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + return 0; +next: + afr_transaction_perform_fop (frame, this); - /* fall through */ + if (xdata_req) + dict_unref (xdata_req); - case AFR_ENTRY_TRANSACTION: - { - if (local->optimistic_change_log) { - 
afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } + return 0; +err: + local->internal_lock.lock_cbk = local->transaction.done; + local->op_ret = -1; + local->op_errno = op_errno; - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - } + afr_unlock (frame, this); - if (!--call_count) - break; - } -out: - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + if (xdata_req) + dict_unref (xdata_req); - return 0; + return 0; } @@ -1365,15 +1195,15 @@ afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) } gf_boolean_t -afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) { - afr_inode_ctx_t *ictx = NULL; + afr_fd_ctx_t *fd_ctx = NULL; - if (!inode) { + if (!fd) { /* If false is returned, it may keep on taking eager-lock * which may lead to starvation, so return true to avoid that. 
*/ - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd"); return _gf_true; } /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock @@ -1383,32 +1213,22 @@ afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) * if open-fd-count is > 1 */ - ictx = afr_inode_ctx_get (inode, this); - if (!ictx) + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) return _gf_true; - if (ictx->open_fd_count > 1) + if (fd_ctx->open_fd_count > 1) return _gf_true; return _gf_false; } -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) -{ - if (local->success_count != priv->child_count) - return _gf_true; - return _gf_false; -} gf_boolean_t is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; gf_boolean_t res = _gf_false; - afr_private_t *priv = NULL; - - priv = this->private; local = frame->local; if (!local) @@ -1418,10 +1238,10 @@ is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) goto out; //Mark pending changelog ASAP - if (afr_any_fops_failed (local, priv)) + if (!afr_txn_nothing_failed (frame, this)) goto out; - if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) + if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) goto out; res = _gf_true; @@ -1445,58 +1265,6 @@ afr_delayed_changelog_wake_up_cbk (void *data) } -/* - Check if the frame is destined to get optimized away - with changelog piggybacking -*/ -static gf_boolean_t -is_piggyback_post_op (call_frame_t *frame, fd_t *fd) -{ - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - gf_boolean_t piggyback = _gf_true; - afr_private_t *priv = NULL; - int i = 0; - - priv = frame->this->private; - local = frame->local; - fdctx = afr_fd_ctx_get (fd, frame->this); - - LOCK(&fd->lock); - { - piggyback = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - 
if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - local->transaction.postop_piggybacked[i] = 1; - } else { - /* For at least _one_ subvolume we cannot - piggyback on the changelog, and have to - perform a hard POST-OP and therefore fsync - if necesssary - */ - piggyback = _gf_false; - GF_ASSERT (fdctx->pre_op_done[i]); - fdctx->pre_op_done[i]--; - } - } - } - UNLOCK(&fd->lock); - - if (!afr_txn_nothing_failed (frame, frame->this)) { - /* something failed in this transaction, - we will be performing a hard post-op - */ - return _gf_false; - } - - return piggyback; -} - - /* SET operation */ int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) @@ -1521,7 +1289,7 @@ afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) afr_fd_ctx_t *fdctx = NULL; gf_boolean_t witness = _gf_false; - fdctx = afr_fd_ctx_get (fd, this); + fdctx = afr_fd_ctx_get (fd, this); if (!fdctx) return _gf_true; @@ -1551,10 +1319,10 @@ afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv = this->private; local = frame->local; - if (afr_fop_failed (op_ret, op_errno)) { + if (op_ret != 0) { /* Failure of fsync() is as good as failure of previous write(). So treat it like one. - */ + */ gf_log (this->name, GF_LOG_WARNING, "fsync(%s) failed on subvolume %s. 
Transaction was %s", uuid_utoa (local->fd->inode->gfid), @@ -1562,14 +1330,14 @@ afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_fop_list[local->op]); afr_transaction_fop_failed (frame, this, child_index); - } + } - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); - if (call_count == 0) - afr_changelog_post_op_now (frame, this); + if (call_count == 0) + afr_changelog_post_op_now (frame, this); - return 0; + return 0; } @@ -1580,14 +1348,13 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this) int i = 0; int call_count = 0; afr_private_t *priv = NULL; - dict_t *xdata = NULL; - GF_UNUSED int ret = -1; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; local = frame->local; priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); if (!call_count) { /* will go straight to unlock */ @@ -1597,30 +1364,30 @@ afr_changelog_fsync (call_frame_t *frame, xlator_t *this) local->call_count = call_count; - xdata = dict_new(); - if (xdata) - ret = dict_set_int32 (xdata, "batch-fsync", 1); + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); for (i = 0; i < priv->child_count; i++) { if (!local->transaction.pre_op[i]) continue; STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, local->fd, - 1, xdata); + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); if (!--call_count) break; } - if (xdata) - dict_unref (xdata); + if (xdata) + dict_unref (xdata); return 0; } - int +int afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; @@ -1634,7 +1401,8 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) return 0; } - if (is_piggyback_post_op (frame, local->fd)) { + if 
(afr_changelog_pre_op_uninherit (frame, this) && + afr_txn_nothing_failed (frame, this)) { /* just detected that this post-op is about to be optimized away as a new write() has already piggybacked on this frame's changelog. @@ -1733,7 +1501,7 @@ out: if (prev_frame) { local = prev_frame->local; local->transaction.resume_stub = stub; - afr_changelog_post_op_safe (prev_frame, this); + afr_changelog_post_op_now (prev_frame, this); } else if (stub) { call_resume (stub); } @@ -1779,13 +1547,9 @@ afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; if (local->transaction.eager_lock_on) { /* We don't need to retain "local" in the @@ -1800,15 +1564,17 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) afr_restore_lk_owner (frame); + afr_handle_symmetric_errors (frame, this); + + if (!local->pre_op_compat) + /* new mode, pre-op was done along + with OP */ + afr_changelog_pre_op_update (frame, this); + if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } + afr_changelog_post_op_done (frame, this); } return 0; @@ -1824,13 +1590,10 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; local = frame->local; - priv = this->private; - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + local->transaction.failed_subvols[child_index] = 1; } @@ -1878,7 +1641,7 @@ afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) if (!fdctx) return; - if 
(afr_are_multiple_fds_opened (local->fd->inode, this)) + if (afr_are_multiple_fds_opened (local->fd, this)) return; /* * Once full file lock is acquired in eager-lock phase, overlapping diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index fa626fd0d..77cc8eed0 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -11,10 +11,7 @@ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ -typedef enum { - LOCAL_FIRST = 1, - LOCAL_LAST = 2 -} afr_xattrop_type_t; +#include "afr.h" void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, @@ -29,11 +26,9 @@ afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this); int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - int child, afr_xattrop_type_t op); +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); + void afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); @@ -41,11 +36,18 @@ void afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type); -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv); +__mark_all_success (call_frame_t *frame, xlator_t *this); gf_boolean_t afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); + +int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type); + +int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); + +int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this); +int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); +call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); + #endif /* __TRANSACTION_H__ */ diff --git 
a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index c26453807..5e12910b7 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -21,11 +21,6 @@ #endif #include "afr-common.c" -#define SHD_INODE_LRU_LIMIT 2048 -#define AFR_EH_HEALED_LIMIT 1024 -#define AFR_EH_HEAL_FAIL_LIMIT 1024 -#define AFR_EH_SPLIT_BRAIN_LIMIT 1024 - struct volume_options options[]; int32_t @@ -114,6 +109,14 @@ reconfigure (xlator_t *this, dict_t *options) priv = this->private; + GF_OPTION_RECONF ("afr-dirty-xattr", + priv->afr_dirty, options, str, + out); + + GF_OPTION_RECONF ("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, options, bool, + out); + GF_OPTION_RECONF ("background-self-heal-count", priv->background_self_heal_count, options, uint32, out); @@ -127,9 +130,6 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options, bool, out); - GF_OPTION_RECONF ("strict-readdir", priv->strict_readdir, options, bool, - out); - GF_OPTION_RECONF ("data-self-heal-window-size", priv->data_self_heal_window_size, options, uint32, out); @@ -146,8 +146,6 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF ("data-self-heal-algorithm", priv->data_self_heal_algorithm, options, str, out); - GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, bool, out); - GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out); GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode, @@ -175,13 +173,13 @@ reconfigure (xlator_t *this, dict_t *options) priv->read_child = index; } + GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out); + GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out); GF_OPTION_RECONF ("quorum-type", qtype, options, str, out); GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options, uint32, out); fix_quorum_options(this,priv,qtype); - GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options, - 
int32, out); GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options, uint32, out); @@ -189,10 +187,15 @@ reconfigure (xlator_t *this, dict_t *options) GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options, size, out); /* Reset this so we re-discover in case the topology changed. */ - GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, - bool, out); GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, bool, out); + + GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, + bool, out); + + GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options, + bool, out); + priv->did_discovery = _gf_false; ret = 0; @@ -244,10 +247,6 @@ init (xlator_t *this) priv = this->private; LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - //lock recovery is not done in afr - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); child_count = xlator_subvolume_count (this); @@ -255,6 +254,11 @@ init (xlator_t *this) priv->read_child = -1; + GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out); + + GF_OPTION_INIT ("metadata-splitbrain-forced-heal", + priv->metadata_splitbrain_forced_heal, bool, out); + GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out); if (read_subvol) { priv->read_child = xlator_subvolume_index (this, read_subvol); @@ -308,10 +312,6 @@ init (xlator_t *this) GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out); - GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); - - GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); - GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, @@ -326,7 +326,7 @@ init (xlator_t *this) GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); - GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out); + GF_OPTION_INIT ("pre-op-compat", 
priv->pre_op_compat, bool, out); GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); GF_OPTION_INIT ("quorum-type", qtype, str, out); @@ -336,10 +336,13 @@ init (xlator_t *this) fix_quorum_options(this,priv,qtype); GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); - GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, out); + GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out); + + GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out); + priv->wait_count = 1; priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, @@ -402,6 +405,12 @@ init (xlator_t *this) goto out; } + ret = afr_selfheal_daemon_init (this); + if (ret) { + ret = -ENOMEM; + goto out; + } + /* keep more local here as we may need them for self-heal etc */ this->local_pool = mem_pool_new (afr_local_t, 512); if (!this->local_pool) { @@ -411,58 +420,8 @@ init (xlator_t *this) goto out; } - priv->first_lookup = 1; priv->root_inode = NULL; - if (!priv->shd.iamshd) { - ret = 0; - goto out; - } - - ret = -ENOMEM; - priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count, - gf_afr_mt_brick_pos_t); - if (!priv->shd.pos) - goto out; - - priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count, - gf_afr_mt_int32_t); - if (!priv->shd.pending) - goto out; - - priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress), - child_count, gf_afr_mt_shd_bool_t); - if (!priv->shd.inprogress) - goto out; - priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count, - gf_afr_mt_shd_timer_t); - if (!priv->shd.timer) - goto out; - - priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, - _destroy_shd_event_data); - if (!priv->shd.healed) - goto out; - - priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, - _destroy_shd_event_data); - if (!priv->shd.heal_failed) - goto out; - - priv->shd.split_brain = eh_new 
(AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, - _destroy_shd_event_data); - if (!priv->shd.split_brain) - goto out; - - this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this); - if (!this->itable) - goto out; - priv->root_inode = inode_ref (this->itable->root); - GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); - GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); - ret = afr_initialise_statistics (this); - if (ret) - goto out; ret = 0; out: return ret; @@ -572,11 +531,11 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_INT, .min = 0, .max = 2, - .default_value = "0", + .default_value = "1", .description = "inode-read fops happen only on one of the bricks in " "replicate. AFR will prefer the one computed using " "the method specified using this option" - "0 = first responder, " + "0 = first up server, " "1 = hash by GFID of file (all clients use " "same subvolume), " "2 = hash by GFID of file and client PID", @@ -585,7 +544,7 @@ struct volume_options options[] = { .type = GF_OPTION_TYPE_BOOL, .default_value = "true", .description = "Choose a local subvolume (i.e. Brick) to read from" - " if read-subvolume is not explicitly set.", + " if read-subvolume is not explicitly set.", }, { .key = {"favorite-child"}, .type = GF_OPTION_TYPE_XLATOR, @@ -675,10 +634,6 @@ struct volume_options options[] = { "pre fop changelog operations in afr transaction " "if this option is enabled." 
}, - { .key = {"strict-readdir"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, { .key = {"inodelk-trace"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "off", @@ -689,6 +644,12 @@ struct volume_options options[] = { .default_value = "off", .description = "Enabling this option logs entry lock/unlocks" }, + { .key = {"pre-op-compat"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "Use separate pre-op xattrop() FOP rather than " + "overloading xdata of the OP" + }, { .key = {"eager-lock"}, .type = GF_OPTION_TYPE_BOOL, .default_value = "on", @@ -753,14 +714,6 @@ struct volume_options options[] = { "self-heal-daemon so that it can crawl only on " "local index directories.", }, - { .key = {"heal-timeout"}, - .type = GF_OPTION_TYPE_INT, - .min = 60, - .max = INT_MAX, - .default_value = "600", - .description = "time interval for checking the need to self-heal " - "in self-heal-daemon" - }, { .key = {"post-op-delay-secs"}, .type = GF_OPTION_TYPE_INT, .min = 0, @@ -777,11 +730,6 @@ struct volume_options options[] = { .max = 131072, .default_value = "1KB", }, - { .key = {"readdir-failover"}, - .type = GF_OPTION_TYPE_BOOL, - .description = "readdir(p) will not failover if this option is off", - .default_value = "on", - }, { .key = {"ensure-durability"}, .type = GF_OPTION_TYPE_BOOL, .description = "Afr performs fsyncs for transactions if this " @@ -789,5 +737,13 @@ struct volume_options options[] = { "written to the disk", .default_value = "on", }, + { .key = {"afr-dirty-xattr"}, + .type = GF_OPTION_TYPE_STR, + .default_value = AFR_DIRTY_DEFAULT, + }, + { .key = {"metadata-splitbrain-forced-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 9196a1f27..2e1b78d1c 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -20,112 +20,42 @@ #include "call-stub.h" #include "compat-errno.h" 
#include "afr-mem-types.h" -#include "afr-self-heal-algorithm.h" #include "libxlator.h" #include "timer.h" +#include "syncop.h" + +#include "afr-self-heald.h" #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" #define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" +#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty" +#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty) #define AFR_LOCKEE_COUNT_MAX 3 #define AFR_DOM_COUNT_MAX 3 - -#define afr_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE) - -struct _pump_private; - -typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int child, int32_t op_error, - int32_t op_errno); - -typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int32_t op_error, int32_t op_errno); -typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this); +#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this); -typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno); -typedef enum { - AFR_POS_UNKNOWN, - AFR_POS_LOCAL, - AFR_POS_REMOTE -} afr_child_pos_t; +typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol); -typedef enum { - SPLIT_BRAIN = 1, - ALL_FOOLS = 2 -} afr_subvol_status_t; +typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err); -typedef enum { - AFR_INODE_SET_READ_CTX = 1, - AFR_INODE_RM_STALE_CHILDREN, - AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_GET_READ_CTX, - AFR_INODE_GET_OPENDIR_DONE, -} afr_inode_op_t; - -typedef struct afr_inode_params_ { - afr_inode_op_t op; - union { - gf_boolean_t value; - struct { - int32_t read_child; - int32_t *children; - } read_ctx; - } u; -} afr_inode_params_t; - -typedef enum afr_spb_state { - DONT_KNOW, - SPB, - NO_SPB -} afr_spb_state_t; - -typedef 
struct afr_inode_ctx_ { - uint64_t masks; - int32_t *fresh_children;//increasing order of latency - afr_spb_state_t mdata_spb; - afr_spb_state_t data_spb; - uint32_t open_fd_count; -} afr_inode_ctx_t; +typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); -typedef enum { - NONE, - INDEX, - INDEX_TO_BE_HEALED, - FULL, -} afr_crawl_type_t; - -typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t iamshd; - afr_crawl_type_t *pending; - gf_boolean_t *inprogress; - afr_child_pos_t *pos; - gf_timer_t **timer; - eh_t *healed; - eh_t *heal_failed; - eh_t *split_brain; - eh_t **statistics; - void **crawl_events; - char *node_uuid; - int timeout; -} afr_self_heald_t; +#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;}) +#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) +#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ - unsigned int read_child_rr; /* round-robin index of the read_child */ - gf_lock_t read_child_lock; /* lock to protect above */ - xlator_t **children; - int first_lookup; inode_t *root_inode; unsigned char *child_up; @@ -146,6 +76,7 @@ typedef struct _afr_private { gf_boolean_t metadata_change_log; /* on/off */ gf_boolean_t entry_change_log; /* on/off */ + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ int read_child; /* read-subvolume */ unsigned int hash_mode; /* for when read_child is not set */ int favorite_child; /* subvolume to be preferred in resolving @@ -154,178 +85,45 @@ typedef struct _afr_private { gf_boolean_t inodelk_trace; gf_boolean_t entrylk_trace; - gf_boolean_t strict_readdir; - unsigned int wait_count; /* # of servers to wait for success */ uint64_t up_count; /* 
number of CHILD_UPs we have seen */ uint64_t down_count; /* number of CHILD_DOWNs we have seen */ - struct _pump_private *pump_private; /* Set if we are loaded as pump */ - int use_afr_in_pump; - - pthread_mutex_t mutex; - struct list_head saved_fds; /* list of fds on which locks have succeeded */ gf_boolean_t optimistic_change_log; gf_boolean_t eager_lock; + gf_boolean_t pre_op_compat; /* on/off */ uint32_t post_op_delay_secs; unsigned int quorum_count; char vol_uuid[UUID_SIZE + 1]; int32_t *last_event; - afr_self_heald_t shd; + + /* @event_generation: Keeps count of number of events received which can + potentially impact consistency decisions. The events are CHILD_UP + and CHILD_DOWN, when we have to recalculate the freshness/staleness + of copies to detect if changes had happened while the other server + was down. CHILD_DOWN and CHILD_UP can also be received on network + disconnect/reconnects and not necessarily server going down/up. + Recalculating freshness/staleness on network events is equally + important as we might have had a network split brain. 
+ */ + uint32_t event_generation; + gf_boolean_t choose_local; gf_boolean_t did_discovery; - gf_boolean_t readdir_failover; uint64_t sh_readdir_size; gf_boolean_t ensure_durability; char *sh_domain; -} afr_private_t; - -typedef enum { - AFR_SELF_HEAL_NOT_ATTEMPTED, - AFR_SELF_HEAL_STARTED, - AFR_SELF_HEAL_FAILED, - AFR_SELF_HEAL_SYNC_BEGIN, -} afr_self_heal_status; - -typedef struct { - afr_self_heal_status gfid_or_missing_entry_self_heal; - afr_self_heal_status metadata_self_heal; - afr_self_heal_status data_self_heal; - afr_self_heal_status entry_self_heal; -} afr_sh_status_for_all_type; - -typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; - -typedef enum { - AFR_CHECK_ALL, - AFR_CHECK_SPECIFIC, -} afr_sh_fail_check_type; - -struct afr_self_heal_ { - /* External interface: These are variables (some optional) that - are set by whoever has triggered self-heal */ - - gf_boolean_t do_data_self_heal; - gf_boolean_t do_metadata_self_heal; - gf_boolean_t do_entry_self_heal; - gf_boolean_t do_gfid_self_heal; - gf_boolean_t do_missing_entry_self_heal; - gf_boolean_t force_confirm_spb; /* Check for split-brains even when - self-heal is turned off */ - - gf_boolean_t forced_merge; /* Is this a self-heal triggered to - forcibly merge the directories? */ - - gf_boolean_t background; /* do self-heal in background - if possible */ - ia_type_t type; /* st_mode of the entry we're doing - self-heal on */ - inode_t *inode; /* inode on which the self-heal is - performed on */ - uuid_t sh_gfid_req; /* gfid self-heal needs to be done - with this gfid if it is not null */ - - /* Function to call to unwind. If self-heal is being done in the - background, this function will be called as soon as possible. 
*/ - - int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, - int32_t op_errno, int32_t sh_failed); - - /* End of external interface members */ - - - /* array of stat's, one for each child */ - struct iatt *buf; - struct iatt *parentbufs; - struct iatt parentbuf; - struct iatt entrybuf; - - afr_expunge_done_cbk_t expunge_done; - afr_impunge_done_cbk_t impunge_done; - - /* array of xattr's, one for each child */ - dict_t **xattr; - - /* array containing if the lookups succeeded in the order of response - */ - int32_t *success_children; - int success_count; - /* array containing the fresh children found in the self-heal process */ - int32_t *fresh_children; - /* array containing the fresh children found in the parent lookup */ - int32_t *fresh_parent_dirs; - /* array of errno's, one for each child */ - int *child_errno; - /*loc used for lookup*/ - loc_t lookup_loc; - int32_t lookup_flags; - afr_lookup_done_cbk_t lookup_done; - - int32_t **pending_matrix; - int32_t **delta_matrix; + char *afr_dirty; - int32_t op_ret; - int32_t op_errno; + afr_self_heald_t shd; - int *sources; - int source; - int active_source; - int active_sinks; - unsigned char *success; - unsigned char *locked_nodes; - int lock_count; - - const char *linkname; - gf_boolean_t entries_skipped; - - gf_boolean_t actual_sh_started; - gf_boolean_t sync_done; - gf_boolean_t data_lock_held; - gf_boolean_t sh_dom_lock_held; - gf_boolean_t eof_reached; - fd_t *healing_fd; - int file_has_holes; - blksize_t block_size; - off_t file_size; - off_t offset; - unsigned char *write_needed; - uint8_t *checksum; - afr_post_remove_call_t post_remove_call; - - char *data_sh_info; - char *metadata_sh_info; - - loc_t parent_loc; - call_frame_t *orig_frame; - call_frame_t *old_loop_frame; - gf_boolean_t unwound; - - afr_sh_algo_private_t *private; - afr_sh_status_for_all_type afr_all_sh_status; - afr_self_heal_type sh_type_in_action; - - struct afr_sh_algorithm *algo; - afr_lock_cbk_t 
data_lock_success_handler; - afr_lock_cbk_t data_lock_failure_handler; - gf_boolean_t data_lock_block; - int (*completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this); - int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); - int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - - call_frame_t *sh_frame; -}; + /* pump dependencies */ + void *pump_private; + gf_boolean_t use_afr_in_pump; +} afr_private_t; -typedef struct afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... */ @@ -438,32 +236,72 @@ typedef struct { char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; -typedef struct _afr_locked_fd { - fd_t *fd; - struct list_head list; -} afr_locked_fd_t; - struct afr_reply { int valid; int32_t op_ret; int32_t op_errno; + dict_t *xdata; + struct iatt poststat; + struct iatt postparent; + struct iatt prestat; + struct iatt preparent; + struct iatt preparent2; + struct iatt postparent2; + uint8_t checksum[MD5_DIGEST_LENGTH]; }; +typedef enum { + AFR_FD_NOT_OPENED, + AFR_FD_OPENED, + AFR_FD_OPENING +} afr_fd_open_status_t; + +typedef struct { + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ + + unsigned int *lock_piggyback; + unsigned int *lock_acquired; + + int flags; + + /* used for delayed-post-op optimization */ + pthread_mutex_t delay_lock; + gf_timer_t *delay_timer; + call_frame_t *delay_frame; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. 
Currently, used to decide if eager-locking must be + temporarily disabled. + */ + uint32_t open_fd_count; + + + /* list of frames currently in progress */ + struct list_head eager_locked; +} afr_fd_ctx_t; + + typedef struct _afr_local { - int uid; - int gid; + glusterfs_fop_t op; unsigned int call_count; - unsigned int success_count; - unsigned int enoent_count; - uint32_t open_fd_count; - gf_boolean_t update_open_fd_count; + /* @event_generation: copy of priv->event_generation taken at the + time of starting the transaction. The copy is made so that we + have a stable value through the various phases of the transaction. + */ + unsigned int event_generation; - unsigned int unhealable; - - unsigned int read_child_index; - unsigned char read_child_returned; - unsigned int first_up_child; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; gf_lkowner_t saved_lk_owner; @@ -472,78 +310,117 @@ typedef struct _afr_local { int32_t **pending; + int dirty[AFR_NUM_CHANGE_LOGS]; + loc_t loc; loc_t newloc; fd_t *fd; + afr_fd_ctx_t *fd_ctx; - glusterfs_fop_t fop; - + /* @child_up: copy of priv->child_up taken at the time of transaction + start. The copy is taken so that we have a stable child_up array + through the phases of the transaction as priv->child_up[i] can keep + changing through time. + */ unsigned char *child_up; - int32_t *fresh_children; //in the order of response - int32_t *child_errno; + /* @read_attempted: + array of flags representing subvolumes where read operations of + the read transaction have already been attempted. The array is + first pre-filled with down subvolumes, and as reads are performed + on other subvolumes, those are set as well. This way if the read + operation fails we do not retry on that subvolume again. + */ + unsigned char *read_attempted; + + /* @readfn: - dict_t *xattr_req; + pointer to function which will perform the read operation on a given + subvolume. Used in read transactions. 
+ */ - int32_t inodelk_count; - int32_t entrylk_count; + afr_read_txn_wind_t readfn; - afr_internal_lock_t internal_lock; + /* @refreshed: - afr_locked_fd_t *locked_fd; - int32_t source_child; - int32_t lock_recovery_child; + the inode was "refreshed" (i.e, pending xattrs from all subvols + freshly inspected and inode ctx updated accordingly) as part of + this transaction already. + */ + gf_boolean_t refreshed; + + /* @inode: + + the inode on which the read txn is performed on. ref'ed and copied + from either fd->inode or loc.inode + */ + + inode_t *inode; + + /* @parent[2]: + + parent inode[s] on which directory transactions are performed. + */ + + inode_t *parent; + inode_t *parent2; + + /* @readable: + + array of flags representing servers from which a read can be + performed. This is the output of afr_inode_refresh() + */ + unsigned char *readable; + + afr_inode_refresh_cbk_t refreshfn; + + /* @refreshinode: + + Inode currently getting refreshed. + */ + inode_t *refreshinode; + + /* + @pre_op_compat: + + compatibility mode of pre-op. send a separate pre-op and + op operations as part of transaction, rather than combining + */ + + gf_boolean_t pre_op_compat; + + dict_t *xattr_req; + + afr_internal_lock_t internal_lock; dict_t *dict; + int optimistic_change_log; gf_boolean_t delayed_post_op; - /* Is the current writev() going to perform a stable write? i.e, is fd->flags or @flags writev param have O_SYNC or O_DSYNC? */ - gf_boolean_t stable_write; - - /* This write appended to the file. Nnot necessarily O_APPEND, - just means the offset of write was at the end of file. - */ - gf_boolean_t append_write; - - int attempt_self_heal; - int foreground_self_heal; + gf_boolean_t stable_write; + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. 
+ */ + gf_boolean_t append_write; - /* This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops */ - int op; struct { struct { unsigned char buf_set; struct statvfs buf; } statfs; - struct { - uint32_t parent_entrylk; - uuid_t gfid_req; - inode_t *inode; - struct iatt buf; - struct iatt postparent; - dict_t **xattrs; - dict_t *xattr; - struct iatt *postparents; - struct iatt *bufs; - int32_t read_child; - int32_t *sources; - int32_t *success_children; - int32_t **pending_matrix; - gf_boolean_t fresh_lookup; - gf_boolean_t possible_spb; - } lookup; - struct { int32_t flags; } open; @@ -737,22 +614,67 @@ typedef struct _afr_local { afr_transaction_type type; - /* pre-compute the post piggyback status before - entering POST-OP phase - */ - int *postop_piggybacked; - /* stub to resume on destruction of the transaction frame */ call_stub_t *resume_stub; struct list_head eager_locked; - int32_t **txn_changelog;//changelog after pre+post ops unsigned char *pre_op; + /* @fop_subvols: subvolumes on which FOP will be attempted */ + unsigned char *fop_subvols; + + /* @failed_subvols: subvolumes on which FOP failed. Always + a subset of @fop_subvols */ + unsigned char *failed_subvols; + + /* @dirtied: flag which indicates whether we set dirty flag + in the OP. Typically true when we are performing operation + on more than one subvol and optimistic changelog is disabled + + A 'true' value set in @dirtied flag means an 'undirtying' + has to be done in POST-OP phase. + */ + gf_boolean_t dirtied; + + /* @inherited: flag which indicates that the dirty flags + of the previous transaction were inherited + */ + gf_boolean_t inherited; + + /* + @no_uninherit: flag which indicates that a pre_op_uninherit() + must _not_ be attempted (and returned as failure) always. This + flag is set when a hard pre-op is performed, but not accounted + for it in fd_ctx->on_disk[]. 
Such transactions are "isolated" + from the pre-op piggybacking entirely and therefore uninherit + must not be attempted. + */ + gf_boolean_t no_uninherit; + + /* @uninherit_done: + @uninherit_value: + + The above pair variables make pre_op_uninherit() idempotent. + Both are FALSE initially. The first call to pre_op_uninherit + sets @uninherit_done to TRUE and the return value to + @uninherit_value. Further calls will check for @uninherit_done + to be TRUE and if so will simply return @uninherit_value. + */ + gf_boolean_t uninherit_done; + gf_boolean_t uninherit_value; + + /* @changelog_resume: function to be called after changlogging + (either pre-op or post-op) is done + */ + + afr_changelog_resume_t changelog_resume; + call_frame_t *main_frame; + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + int (*fop) (call_frame_t *frame, xlator_t *this); int (*done) (call_frame_t *frame, xlator_t *this); @@ -764,7 +686,7 @@ typedef struct _afr_local { /* post-op hook */ } transaction; - afr_self_heal_t self_heal; + syncbarrier_t barrier; struct marker_str marker; @@ -778,75 +700,58 @@ typedef struct _afr_local { struct afr_reply *replies; } afr_local_t; -typedef enum { - AFR_FD_NOT_OPENED, - AFR_FD_OPENED, - AFR_FD_OPENING -} afr_fd_open_status_t; - -typedef struct { - unsigned int *pre_op_done; - afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ - unsigned int *pre_op_piggyback; - - unsigned int *lock_piggyback; - unsigned int *lock_acquired; - - int flags; - uint64_t up_count; /* number of CHILD_UPs this fd has seen */ - uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */ - - int32_t last_tried; - - int hit, miss; - gf_boolean_t failed_over; - struct list_head entries; /* needed for readdir failover */ - - unsigned char *locked_on; /* which subvolumes locks have been successful */ - - /* used for delayed-post-op optimization */ - pthread_mutex_t delay_lock; - gf_timer_t *delay_timer; - call_frame_t *delay_frame; - int 
call_child; - - /* set if any write on this fd was a non stable write - (i.e, without O_SYNC or O_DSYNC) - */ - gf_boolean_t witnessed_unstable_write; - - /* list of frames currently in progress */ - struct list_head eager_locked; -} afr_fd_ctx_t; - - -/* try alloc and if it fails, goto label */ -#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \ - var = mem_get0 (THIS->local_pool); \ - if (!var) { \ - gf_log (this->name, GF_LOG_ERROR, \ - "out of memory :("); \ - op_errno = ENOMEM; \ - goto label; \ - } \ - } while (0); - /* did a call fail due to a child failing? */ #define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ ((op_errno == ENOTCONN) || \ (op_errno == EBADFD))) -#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1) +int +afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); +int +__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int *event_generation); -/* have we tried all children? 
*/ -#define all_tried(i, count) ((i) == (count) - 1) +int +__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvol, + int event_generation); +int +afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, + unsigned char *data_subvols, + unsigned char *metadata_subvols, + int event_generation); -int32_t -afr_set_dict_gfid (dict_t *dict, uuid_t gfid); +int +afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this); int -pump_command_reply (call_frame_t *frame, xlator_t *this); +afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, + unsigned char *readable); + +int +afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, + int type); +int +afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, + int *event_p, afr_transaction_type type); + +#define afr_data_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION) + +#define afr_metadata_subvol_get(i, t, s, e) \ + afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION) + +int +afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_inode_refresh_cbk_t cbk); int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2); @@ -861,9 +766,6 @@ afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); -int -afr_save_locked_fd (xlator_t *this, fd_t *fd); - int afr_mark_locked_nodes (xlator_t *this, fd_t *fd, unsigned char *locked_nodes); @@ -874,10 +776,6 @@ afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner); int afr_set_lock_number (call_frame_t *frame, xlator_t *this); - -loc_t * -lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2); - int32_t afr_unlock (call_frame_t *frame, xlator_t *this); @@ -897,42 +795,26 @@ int afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count); 
-int pump_start (call_frame_t *frame, xlator_t *this); - int __afr_fd_ctx_set (xlator_t *this, fd_t *fd); int afr_fd_ctx_set (xlator_t *this, fd_t *fd); -int32_t -afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children); - -void -afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, - int32_t *fresh_children); +afr_fd_ctx_t * +afr_fd_ctx_get (fd_t *fd, xlator_t *this); int afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno); -unsigned int -afr_up_children_count (unsigned char *child_up, unsigned int child_count); - -unsigned int -afr_locked_children_count (unsigned char *children, unsigned int child_count); - -unsigned int -afr_pre_op_done_children_count (unsigned char *pre_op, - unsigned int child_count); +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); -gf_boolean_t -afr_is_fresh_lookup (loc_t *loc, xlator_t *this); +int +afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode); void -afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent); - -int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); +afr_replies_wipe (afr_local_t *local, afr_private_t *priv); void afr_local_cleanup (afr_local_t *local, xlator_t *this); @@ -940,32 +822,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this); int afr_frame_return (call_frame_t *frame); -gf_boolean_t -afr_is_split_brain (xlator_t *this, inode_t *inode); - -void -afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, - afr_spb_state_t data_spb); - int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata); -void -afr_set_opendir_done (xlator_t *this, inode_t *inode); - -gf_boolean_t -afr_is_opendir_done (xlator_t *this, inode_t *inode); - void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this); int afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); -int -afr_launch_openfd_self_heal 
(call_frame_t *frame, xlator_t *this, fd_t *fd); - #define AFR_STACK_UNWIND(fop, frame, params ...) \ do { \ afr_local_t *__local = NULL; \ @@ -996,7 +862,16 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); } \ } while (0); -#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ +#define AFR_FRAME_INIT(frame, op_errno) \ + ({frame->local = mem_get0 (THIS->local_pool); \ + if (afr_local_init (frame->local, THIS->private, &op_errno)) { \ + afr_local_cleanup (frame->local, THIS); \ + mem_put (frame->local); \ + frame->local = NULL; }; \ + frame->local;}) + +#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0) + /* allocate and return a string that is the basename of argument */ static inline char * AFR_BASENAME (const char *str) @@ -1009,6 +884,9 @@ AFR_BASENAME (const char *str) return __basename_str; } +call_frame_t * +afr_copy_frame (call_frame_t *base); + int afr_transaction_local_init (afr_local_t *local, xlator_t *this); @@ -1016,9 +894,6 @@ int32_t afr_marker_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); -int32_t * -afr_children_create (int32_t child_count); - int afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); @@ -1027,101 +902,20 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, transaction_lk_type_t lk_type); int -afr_first_up_child (unsigned char *child_up, size_t child_count); +afr_higher_errno (int32_t old_errno, int32_t new_errno); int -afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, int32_t *sources, - unsigned int hmode, uuid_t gfid); +afr_final_errno (afr_local_t *local, afr_private_t *priv); -void -afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, - int32_t *fresh_children, int32_t prev_read_child, - int32_t config_read_child, uuid_t 
gfid); - -int32_t -afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, - int32_t *fresh_children, - int32_t *call_child, int32_t *last_index); - -int32_t -afr_next_call_child (int32_t *fresh_children, unsigned char *child_up, - size_t child_count, int32_t *last_index, - int32_t read_child); -void -afr_get_fresh_children (int32_t *success_children, int32_t *sources, - int32_t *children, unsigned int child_count); -void -afr_children_add_child (int32_t *children, int32_t child, - int32_t child_count); -void -afr_children_rm_child (int32_t *children, int32_t child, - int32_t child_count); -void -afr_reset_children (int32_t *children, int32_t child_count); -int32_t -afr_most_important_error(int32_t old_errno, int32_t new_errno, - gf_boolean_t eio); int -afr_errno_count (int32_t *children, int *child_errno, - unsigned int child_count, int32_t op_errno); -int -afr_get_children_count (int32_t *children, unsigned int child_count); -gf_boolean_t -afr_is_child_present (int32_t *success_children, int32_t child_count, - int32_t child); -void -afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs, - int32_t *success_children, - unsigned int child_count); -void -afr_reset_xattr (dict_t **xattr, unsigned int child_count); -gf_boolean_t -afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, - unsigned int child_count, const char *path, - const char *xlator_name); -unsigned int -afr_gfid_missing_count (const char *xlator_name, int32_t *children, - struct iatt *bufs, unsigned int child_count, - const char *path); -void -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path); -void -afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count); -afr_transaction_type -afr_transaction_type_get (ia_type_t ia_type); -int32_t -afr_resultant_errno_get (int32_t *children, - int *child_errno, unsigned int child_count); -void -afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, - int32_t 
*stale_children); -void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t background, ia_type_t ia_type, char *reason, - void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, - xlator_t *this), - int (*unwind) (call_frame_t *frame, xlator_t *this, - int32_t op_ret, int32_t op_errno, - int32_t sh_failed)); -void -afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); +afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req); void -afr_open_fd_fix (fd_t *fd, xlator_t *this); -int -afr_set_elem_count_get (unsigned char *elems, int child_count); +afr_fix_open (fd_t *fd, xlator_t *this); afr_fd_ctx_t * afr_fd_ctx_get (fd_t *fd, xlator_t *this); -gf_boolean_t -afr_open_only_data_self_heal (char *data_self_heal); - -gf_boolean_t -afr_data_self_heal_enabled (char *data_self_heal); - void afr_set_low_priority (call_frame_t *frame); int @@ -1137,22 +931,9 @@ afr_matrix_cleanup (int32_t **pending, unsigned int m); int32_t** afr_matrix_create (unsigned int m, unsigned int n); -gf_boolean_t -afr_is_errno_set (int *child_errno, int child); - -gf_boolean_t -afr_is_errno_unset (int *child_errno, int child); - -gf_boolean_t -afr_is_fd_fixable (fd_t *fd); - void -afr_prepare_new_entry_pending_matrix (int32_t **pending, - gf_boolean_t (*is_pending) (int *, int), - int *ctx, struct iatt *buf, - unsigned int child_count); -void -afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); +afr_filter_xattrs (dict_t *xattr); + /* * Special value indicating we should use the "auto" quorum method instead of * a fixed value (including zero to turn off quorum enforcement). @@ -1172,28 +953,6 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); } \ } while (0); - -#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." 
- -#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ - if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ - op_errno = EIO; \ - gf_log (this->name, GF_LOG_WARNING, \ - AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ - goto label; \ - } \ -} while (0) - -#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ - if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ - op_errno = EIO; \ - loc_path (loc, NULL); \ - gf_log (this->name, GF_LOG_WARNING, \ - AFR_SBRAIN_MSG , loc->path); \ - goto label; \ - } \ -} while (0) - int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); @@ -1209,7 +968,7 @@ afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); void afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); -afr_inode_ctx_t* -afr_inode_ctx_get (inode_t *inode, xlator_t *this); +int +afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 987696e55..eed509956 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -21,6 +21,120 @@ #include "afr-common.c" #include "defaults.c" #include "glusterfs.h" +#include "pump.h" + + +static int +afr_set_dict_gfid (dict_t *dict, uuid_t gfid) +{ + int ret = 0; + uuid_t *pgfid = NULL; + + GF_ASSERT (gfid); + + pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char); + if (!pgfid) { + ret = -1; + goto out; + } + + uuid_copy (*pgfid, gfid); + + ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t)); + if (ret) + gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed"); + +out: + if (ret && pgfid) + GF_FREE (pgfid); + return ret; +} + +static int +afr_set_root_gfid (dict_t *dict) +{ + uuid_t gfid; + int ret = 0; + + memset (gfid, 0, 16); + gfid[15] = 1; + + ret = afr_set_dict_gfid (dict, gfid); + + return ret; +} + +static int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + uuid_t pargfid = {0}; + + if (!child) 
+ goto out; + + if (!uuid_is_null (parent->inode->gfid)) + uuid_copy (pargfid, parent->inode->gfid); + else if (!uuid_is_null (parent->gfid)) + uuid_copy (pargfid, parent->gfid); + + if (uuid_is_null (pargfid)) + goto out; + + if (strcmp (parent->path, "/") == 0) + ret = gf_asprintf ((char **)&child->path, "/%s", name); + else + ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, + name); + + if (-1 == ret) { + gf_log (this->name, GF_LOG_ERROR, + "asprintf failed while setting child path"); + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + uuid_copy (child->pargfid, pargfid); + + if (!child->inode) { + ret = -1; + goto out; + } + + ret = 0; +out: + if ((ret == -1) && child) + loc_wipe (child); + + return ret; +} + +static void +afr_build_root_loc (xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = NULL; + + priv = this->private; + loc->path = gf_strdup ("/"); + loc->name = ""; + loc->inode = inode_ref (priv->root_inode); + uuid_copy (loc->gfid, loc->inode->gfid); +} + +static void +afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) +{ + GF_ASSERT (loc); + GF_ASSERT (buf); + + uuid_copy (loc->gfid, buf->ia_gfid); + if (postparent) + uuid_copy (loc->pargfid, postparent->ia_gfid); +} static uint64_t pump_pid = 0; static inline void @@ -387,54 +501,68 @@ gf_pump_traverse_directory (loc_t *loc) if (ret) goto out; - if (!IS_ENTRY_CWD (entry->d_name) && - !IS_ENTRY_PARENT (entry->d_name)) { - - is_directory_empty = _gf_false; - gf_log (this->name, GF_LOG_DEBUG, - "lookup %s => %"PRId64, - entry_loc.path, - iatt.ia_ino); - - ret = syncop_lookup (this, &entry_loc, NULL, - &iatt, &xattr_rsp, &parent); - - if (ret) { - gf_log (this->name, GF_LOG_ERROR, - "%s: lookup failed", - entry_loc.path); - continue; - } - pump_fill_loc_info (&entry_loc, &iatt, - &parent); - - pump_update_resume_state (this, 
entry_loc.path); - - pump_save_path (this, entry_loc.path); - pump_save_file_stats (this, entry_loc.path); - - ret = pump_check_and_update_status (this); - if (ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Pump beginning to exit out"); - goto out; - } - - if (IA_ISDIR (iatt.ia_type)) { - if (is_pump_traversal_allowed (this, entry_loc.path)) { - gf_log (this->name, GF_LOG_TRACE, - "entering dir=%s", - entry->d_name); - gf_pump_traverse_directory (&entry_loc); - } - } + if ((strcmp (entry->d_name, ".") == 0) || + (strcmp (entry->d_name, "..") == 0)) + continue; + + is_directory_empty = _gf_false; + gf_log (this->name, GF_LOG_DEBUG, + "lookup %s => %"PRId64, + entry_loc.path, + iatt.ia_ino); + + ret = syncop_lookup (this, &entry_loc, NULL, &iatt, + &xattr_rsp, &parent); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: lookup failed", entry_loc.path); + continue; + } + + ret = afr_selfheal_name (this, loc->gfid, entry->d_name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: name self-heal failed (%s/%s)", + entry_loc.path, uuid_utoa (loc->gfid), + entry->d_name); + continue; + } + + ret = afr_selfheal (this, iatt.ia_gfid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "%s: self-heal failed (%s)", + entry_loc.path, uuid_utoa (iatt.ia_gfid)); + continue; + } + + pump_fill_loc_info (&entry_loc, &iatt, &parent); + + pump_update_resume_state (this, entry_loc.path); + + pump_save_path (this, entry_loc.path); + pump_save_file_stats (this, entry_loc.path); + + ret = pump_check_and_update_status (this); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "Pump beginning to exit out"); + goto out; + } + + if (IA_ISDIR (iatt.ia_type)) { + if (is_pump_traversal_allowed (this, entry_loc.path)) { + gf_log (this->name, GF_LOG_TRACE, + "entering dir=%s", entry->d_name); + gf_pump_traverse_directory (&entry_loc); + } } } gf_dirent_free (&entries); free_entries = _gf_false; - gf_log (this->name, GF_LOG_TRACE, - "offset incremented to %d", + gf_log 
(this->name, GF_LOG_TRACE, "offset incremented to %d", (int32_t ) offset); } @@ -443,7 +571,7 @@ gf_pump_traverse_directory (loc_t *loc) if (ret < 0) gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed"); - if (is_directory_empty && IS_ROOT_PATH (loc->path)) { + if (is_directory_empty && (strcmp (loc->path, "/") == 0)) { pump_change_state (this, PUMP_STATE_RUNNING); gf_log (this->name, GF_LOG_INFO, "Empty source brick. " "Nothing to be done."); @@ -1277,128 +1405,16 @@ out: } -struct _xattr_key { - char *key; - struct list_head list; -}; - -static int -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, - void *data) -{ - struct list_head * list = data; - struct _xattr_key * xkey = NULL; - - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - - xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); - if (!xkey) - return -1; - - xkey->key = key; - INIT_LIST_HEAD (&xkey->list); - - list_add_tail (&xkey->list, list); - } - return 0; -} - -static void -__filter_xattrs (dict_t *dict) -{ - struct list_head keys; - - struct _xattr_key *key; - struct _xattr_key *tmp; - - INIT_LIST_HEAD (&keys); - - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); - - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); - - list_del_init (&key->list); - - GF_FREE (key); - } -} - -int32_t -pump_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int32_t *last_index = NULL; - int32_t next_call_child = -1; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - - - priv = this->private; - children = priv->children; - - local = frame->local; - - read_child = (long) cookie; - - if (op_ret == -1) { - last_index = &local->cont.getxattr.last_index; - fresh_children = local->fresh_children; - next_call_child = afr_next_call_child 
(fresh_children, - local->child_up, - priv->child_count, - last_index, read_child); - if (next_call_child < 0) - goto out; - - unwind = 0; - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, NULL); - } - -out: - if (unwind) { - if (op_ret >= 0 && dict) - __filter_xattrs (dict); - - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL); - } - - return 0; -} - -int32_t -pump_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +int +pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - afr_private_t * priv = NULL; - xlator_t ** children = NULL; - int call_child = 0; - afr_local_t *local = NULL; - int32_t ret = -1; - int32_t op_errno = 0; - uint64_t read_child = 0; - - - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); + afr_private_t *priv = NULL; + int op_errno = 0; + int ret = 0; - priv = this->private; - VALIDATE_OR_GOTO (priv->children, out); + priv = this->private; - children = priv->children; if (!priv->use_afr_in_pump) { STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD (this), @@ -1407,14 +1423,6 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, return 0; } - - AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); - local = frame->local; - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) - goto out; - if (name) { if (!strncmp (name, AFR_XATTR_PREFIX, strlen (AFR_XATTR_PREFIX))) { @@ -1432,32 +1440,7 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, } } - local->fresh_children = GF_CALLOC (priv->child_count, - sizeof (*local->fresh_children), - gf_afr_mt_int32_t); - if (!local->fresh_children) { - ret = -1; - op_errno = ENOMEM; - goto out; - } - - read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); - ret = 
afr_get_call_child (this, local->child_up, read_child, - local->fresh_children, - &call_child, - &local->cont.getxattr.last_index); - if (ret < 0) { - op_errno = -ret; - goto out; - } - loc_copy (&local->loc, loc); - if (name) - local->cont.getxattr.name = gf_strdup (name); - - STACK_WIND_COOKIE (frame, pump_getxattr_cbk, - (void *) (long) call_child, - children[call_child], children[call_child]->fops->getxattr, - loc, name, xdata); + afr_getxattr (frame, this, loc, name, xdata); ret = 0; out: @@ -1466,134 +1449,6 @@ out: return 0; } -static int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; - - LOCK (&frame->lock); - { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; - local->transaction.main_frame = NULL; - } - UNLOCK (&frame->lock); - - if (main_frame) { - AFR_STACK_UNWIND (setxattr, main_frame, - local->op_ret, local->op_errno, NULL); - } - return 0; -} - -static int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - int call_count = -1; - int need_unwind = 0; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } - } - - local->op_errno = op_errno; - } - UNLOCK (&frame->lock); - - if (need_unwind) - local->transaction.unwind (frame, this); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->transaction.resume (frame, this); - } - - return 0; -} - -static int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - int call_count = -1; - int i = 0; - - local = frame->local; - priv = 
this->private; - - call_count = afr_up_children_count (local->child_up, priv->child_count); - - if (call_count == 0) { - local->transaction.resume (frame, this); - return 0; - } - - local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, - local->cont.setxattr.dict, - local->cont.setxattr.flags, NULL); - - if (!--call_count) - break; - } - } - - return 0; -} - - -static int -afr_setxattr_done (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = frame->local; - - local->transaction.unwind (frame, this); - - AFR_STACK_DESTROY (frame); - - return 0; -} - -int32_t -pump_setxattr_cbk (call_frame_t *frame, - void *cookie, - xlator_t *this, - int32_t op_ret, - int32_t op_errno, dict_t *xdata) -{ - AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); - return 0; -} - int pump_command_reply (call_frame_t *frame, xlator_t *this) { @@ -1617,51 +1472,56 @@ pump_command_reply (call_frame_t *frame, xlator_t *this) } int -pump_parse_command (call_frame_t *frame, xlator_t *this, - afr_local_t *local, dict_t *dict) +pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict, + int *op_errno_p) { - + afr_local_t *local = NULL; int ret = -1; + int op_errno = 0; if (pump_command_start (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_start (frame, this); } else if (pump_command_pause (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_pause (frame, this); } else if (pump_command_abort (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_abort 
(frame, this); } else if (pump_command_commit (this, dict)) { - frame->local = local; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; local->dict = dict_ref (dict); ret = pump_execute_commit (frame, this); } +out: + if (op_errno_p) + *op_errno_p = op_errno; return ret; } int -pump_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) +pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; + afr_private_t *priv = NULL; int ret = -1; int op_errno = 0; - VALIDATE_OR_GOTO (frame, out); - VALIDATE_OR_GOTO (this, out); - VALIDATE_OR_GOTO (this->private, out); - - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, - op_errno, out); + GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out); priv = this->private; if (!priv->use_afr_in_pump) { @@ -1672,57 +1532,15 @@ pump_setxattr (call_frame_t *frame, xlator_t *this, return 0; } - - AFR_LOCAL_ALLOC_OR_GOTO (local, out); - - ret = afr_local_init (local, priv, &op_errno); - if (ret < 0) { - afr_local_cleanup (local, this); - mem_put (local); - goto out; - } - - ret = pump_parse_command (frame, this, - local, dict); - if (ret >= 0) { - ret = 0; + ret = pump_parse_command (frame, this, dict, &op_errno); + if (ret >= 0) goto out; - } - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - gf_log (this->name, GF_LOG_ERROR, - "Out of memory."); - op_errno = ENOMEM; - ret = -1; - afr_local_cleanup (local, this); - goto out; - } - - transaction_frame->local = local; - - local->op_ret = -1; - - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; - - local->transaction.fop = afr_setxattr_wind; - local->transaction.done = afr_setxattr_done; - local->transaction.unwind = afr_setxattr_unwind; - - loc_copy (&local->loc, loc); - - 
local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; - afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + afr_setxattr (frame, this, loc, dict, flags, xdata); ret = 0; out: if (ret < 0) { - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); } @@ -2416,10 +2234,6 @@ init (xlator_t *this) goto out; LOCK_INIT (&priv->lock); - LOCK_INIT (&priv->read_child_lock); - //lock recovery is not done in afr - pthread_mutex_init (&priv->mutex, NULL); - INIT_LIST_HEAD (&priv->saved_fds); child_count = xlator_subvolume_count (this); if (child_count != 2) { @@ -2453,8 +2267,6 @@ init (xlator_t *this) and the sink. */ - priv->strict_readdir = _gf_false; - priv->wait_count = 1; priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count, gf_afr_mt_char); if (!priv->child_up) { @@ -2508,7 +2320,6 @@ init (xlator_t *this) goto out; } - priv->first_lookup = 1; priv->root_inode = NULL; priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), @@ -2579,7 +2390,6 @@ out: GF_FREE (priv->pending_key); GF_FREE (priv->last_event); LOCK_DESTROY (&priv->lock); - LOCK_DESTROY (&priv->read_child_lock); GF_FREE (priv); } diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h index bc4c31a78..9d0b6db6a 100644 --- a/xlators/cluster/afr/src/pump.h +++ b/xlators/cluster/afr/src/pump.h @@ -75,4 +75,7 @@ pump_command_status (xlator_t *this, dict_t *dict); int pump_execute_status (call_frame_t *frame, xlator_t *this); +int +pump_command_reply (call_frame_t *frame, xlator_t *this); + #endif /* __PUMP_H__ */ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 3055f4615..3868fc38f 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -3120,7 +3120,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, /* 
making sure we set the inode ctx right with layout, currently possible only for non-directories, so for directories don't set entry inodes */ - if (!IA_ISDIR(entry->d_stat.ia_type)) { + if (!IA_ISDIR(entry->d_stat.ia_type) && orig_entry->inode) { ret = dht_layout_preset (this, prev->this, orig_entry->inode); if (ret) diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index 32d53e8e6..79e80b513 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -4886,7 +4886,7 @@ unlock: if (!local_entry) break; - if (!IA_ISREG (local_entry->d_stat.ia_type)) { + if (!IA_ISREG (local_entry->d_stat.ia_type) || !local_entry->inode) { LOCK (&frame->lock); { local->wind_count--; diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c index 5edfeda8f..5c1c65fbd 100644 --- a/xlators/features/index/src/index.c +++ b/xlators/features/index/src/index.c @@ -15,11 +15,9 @@ #include "index.h" #include "options.h" #include "glusterfs3-xdr.h" -#include "syncop.h" #include "syscall.h" #define XATTROP_SUBDIR "xattrop" -#define BASE_INDICES_HOLDER_SUBDIR "base_indices_holder" call_stub_t * __index_dequeue (struct list_head *callstubs) @@ -245,40 +243,20 @@ check_delete_stale_index_file (xlator_t *this, char *filename) { int ret = 0; struct stat st = {0}; - struct stat base_index_st = {0}; char filepath[PATH_MAX] = {0}; - char filepath_under_base_indices_holder[PATH_MAX] = {0}; index_priv_t *priv = NULL; priv = this->private; - if (priv->to_be_healed_states != synced_state) - return; - make_file_path (priv->index_basepath, XATTROP_SUBDIR, filename, filepath, sizeof (filepath)); - - make_file_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, - filename, filepath_under_base_indices_holder, - sizeof (filepath_under_base_indices_holder)); - - - ret = stat (filepath_under_base_indices_holder, &base_index_st); - if (ret) { - gf_log (THIS->name, GF_LOG_ERROR, "Base index is not created" - " under 
index/base_indices_holder"); - return; - } - ret = stat (filepath, &st); - if (!ret && st.st_nlink == 2) { + if (!ret && st.st_nlink == 1) unlink (filepath); - unlink (filepath_under_base_indices_holder); - } } static int index_fill_readdir (fd_t *fd, DIR *dir, off_t off, - size_t size, gf_dirent_t *entries, readdir_directory type) + size_t size, gf_dirent_t *entries) { off_t in_case = -1; size_t filled = 0; @@ -321,8 +299,7 @@ index_fill_readdir (fd_t *fd, DIR *dir, off_t off, } if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-")) && - (type == INDEX_XATTROP)) { + strlen (XATTROP_SUBDIR"-"))) { check_delete_stale_index_file (this, entry->d_name); continue; } @@ -360,193 +337,17 @@ out: return count; } -int -sync_base_indices (void *index_priv) -{ - index_priv_t *priv = NULL; - DIR *dir_base_holder = NULL; - DIR *xattrop_dir = NULL; - struct dirent *entry = NULL; - char base_indices_holder[PATH_MAX] = {0}; - char xattrop_directory[PATH_MAX] = {0}; - char base_index_path[PATH_MAX] = {0}; - char xattrop_index_path[PATH_MAX] = {0}; - int32_t op_errno = 0; - int ret = 0; - - priv = index_priv; - - snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR); - snprintf (xattrop_directory, PATH_MAX, "%s/%s", priv->index_basepath, - XATTROP_SUBDIR); - - if ((dir_base_holder = opendir(base_indices_holder)) == NULL) { - op_errno = errno; - ret = -1; - goto out; - } - if ((xattrop_dir = opendir (xattrop_directory)) == NULL) { - op_errno = errno; - ret = -1; - (void) closedir (dir_base_holder); - goto out; - } - - priv->to_be_healed_states = sync_started; - while ((entry = readdir(xattrop_dir)) != NULL) { - if (!strcmp (entry->d_name, ".") || - !strcmp (entry->d_name, "..")) { - continue; - } - if (strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-"))) { - continue; - } - if (!strncmp (entry->d_name, XATTROP_SUBDIR"-", - strlen (XATTROP_SUBDIR"-"))) { - - snprintf (xattrop_index_path, 
PATH_MAX, "%s/%s", - xattrop_directory, entry->d_name); - - snprintf (base_index_path, PATH_MAX, "%s/%s", - base_indices_holder, entry->d_name); - - ret = sys_link (xattrop_index_path, base_index_path); - - if (ret && errno != EEXIST) { - op_errno = errno; - (void) closedir (dir_base_holder); - (void) closedir (xattrop_dir); - goto out; - } - - } - } - ret = closedir (xattrop_dir); - if (ret) { - op_errno = errno; - (void) closedir (dir_base_holder); - goto out; - } - ret = closedir (dir_base_holder); - if (ret) { - op_errno = errno; - goto out; - } - - ret = 0; -out: - errno = op_errno; - return ret; - -} - -int -base_indices_syncing_done (int ret, call_frame_t *frame, void *data) -{ - index_priv_t *priv = NULL; - priv = data; - - if (!priv) - goto out; - - if (ret) { - priv->to_be_healed_states = sync_not_started; - } else { - priv->to_be_healed_states = synced_state; - } - - STACK_DESTROY (frame->root); - -out: - return 0; -} - -int -sync_base_indices_from_xattrop (xlator_t *this) -{ - - index_priv_t *priv = NULL; - char base_indices_holder[PATH_MAX] = {0}; - int ret = 0; - struct stat st = {0}; - DIR *dir = NULL; - struct dirent *entry = NULL; - call_frame_t *frame = NULL; - - priv = this->private; - - if (priv->to_be_healed_states != sync_not_started) { - ret = -1; - goto out; - } - - snprintf (base_indices_holder, PATH_MAX, "%s/%s", priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR); - - ret = stat (base_indices_holder, &st); - - if (ret && (errno != ENOENT)) { - goto out; - } else if (errno == ENOENT) { - ret = index_dir_create (this, BASE_INDICES_HOLDER_SUBDIR); - if (ret) - goto out; - } else { - if ((dir = opendir (base_indices_holder)) == NULL) { - ret = -1; - goto out; - } - while ((entry = readdir (dir)) != NULL) { - if (!strcmp (entry->d_name, ".") || - !strcmp (entry->d_name,"..")) { - continue; - } - ret = unlink (entry->d_name); - if (ret) { - closedir (dir); - goto out; - } - } - closedir (dir); - } - - /*At this point of time we have 
index/base_indicies_holder directory - *is with no entries*/ - - frame = create_frame (this, this->ctx->pool); - if (!frame) { - ret = -1; - goto out; - } - set_lk_owner_from_ptr (&frame->root->lk_owner, frame->root); - - frame->root->pid = LOW_PRIO_PROC_PID; - - ret = synctask_new (this->ctx->env, sync_base_indices, - base_indices_syncing_done,frame, priv); - - - -out: - return ret; - -} - int index_add (xlator_t *this, uuid_t gfid, const char *subdir) { int32_t op_errno = 0; char gfid_path[PATH_MAX] = {0}; char index_path[PATH_MAX] = {0}; - char base_path[PATH_MAX] = {0}; int ret = 0; uuid_t index = {0}; index_priv_t *priv = NULL; struct stat st = {0}; int fd = 0; - int index_created = 0; priv = this->private; GF_ASSERT_AND_GOTO_WITH_ERROR (this->name, !uuid_is_null (gfid), @@ -561,15 +362,12 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir) index_get_index (priv, index); make_index_path (priv->index_basepath, subdir, index, index_path, sizeof (index_path)); - ret = sys_link (index_path, gfid_path); if (!ret || (errno == EEXIST)) { ret = 0; - index_created = 1; goto out; } - op_errno = errno; if (op_errno == ENOENT) { ret = index_dir_create (this, subdir); @@ -601,36 +399,10 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir) "add to index (%s)", uuid_utoa (gfid), strerror (errno)); goto out; - } else { - index_created = 1; - } - - if (priv->to_be_healed_states != sync_not_started) { - make_index_path (priv->index_basepath, - GF_BASE_INDICES_HOLDER_GFID, - index, base_path, sizeof (base_path)); - ret = sys_link (index_path, base_path); - if (ret) - goto out; } ret = 0; out: - /*If base_indices_holder is not created: create and sync - *If directory is present: delete contents and start syncing - *If syncing is in progress :No need to do any thing - *If syncing is done: No need to do anything*/ - if (!ret) { - switch (priv->to_be_healed_states) { - case sync_not_started: - ret = sync_base_indices_from_xattrop (this); - break; - case 
sync_started: - case synced_state: - /*No need to do anything*/ - break; - } - } return ret; } @@ -966,6 +738,41 @@ out: return 0; } +uint64_t +index_entry_count (xlator_t *this, char *subdir) +{ + index_priv_t *priv = NULL; + char index_dir[PATH_MAX]; + DIR *dirp = NULL; + uint64_t count = 0; + struct dirent buf; + struct dirent *entry = NULL; + + priv = this->private; + + make_index_dir_path (priv->index_basepath, subdir, + index_dir, sizeof (index_dir)); + + dirp = opendir (index_dir); + if (!dirp) + return 0; + + while (readdir_r (dirp, &buf, &entry) == 0) { + if (!entry) + break; + if (!strcmp (entry->d_name, ".") || + !strcmp (entry->d_name, "..")) + continue; + if (!strncmp (entry->d_name, subdir, strlen (subdir))) + continue; + count++; + } + closedir (dirp); + + return count; +} + + int32_t index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) @@ -973,6 +780,7 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, index_priv_t *priv = NULL; dict_t *xattr = NULL; int ret = 0; + uint64_t count = 0; priv = this->private; @@ -982,24 +790,26 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this, goto done; } - if (!strcmp (name, GF_XATTROP_INDEX_GFID)) { - - ret = dict_set_static_bin (xattr, (char*)name, - priv->xattrop_vgfid, - sizeof (priv->xattrop_vgfid)); - - } else if (!strcmp (name, GF_BASE_INDICES_HOLDER_GFID)) { - - ret = dict_set_static_bin (xattr, (char*)name, - priv->base_indices_holder_vgfid, - sizeof (priv->base_indices_holder_vgfid)); - } - if (ret) { - ret = -ENOMEM; - gf_log (THIS->name, GF_LOG_ERROR, "xattrop index " - "gfid set failed"); - goto done; - } + if (strcmp (name, GF_XATTROP_INDEX_GFID) == 0) { + ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid, + sizeof (priv->xattrop_vgfid)); + if (ret) { + ret = -ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "xattrop index " + "gfid set failed"); + goto done; + } + } else if (strcmp (name, GF_XATTROP_INDEX_COUNT) 
== 0) { + count = index_entry_count (this, XATTROP_SUBDIR); + + ret = dict_set_uint64 (xattr, (char *)name, count); + if (ret) { + ret = -ENOMEM; + gf_log (this->name, GF_LOG_ERROR, "xattrop index " + "count set failed"); + goto done; + } + } done: if (ret) STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, xattr, xdata); @@ -1037,15 +847,6 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this, } else if (!uuid_compare (loc->pargfid, priv->xattrop_vgfid)) { make_file_path (priv->index_basepath, XATTROP_SUBDIR, loc->name, path, sizeof (path)); - } else if (!uuid_compare (loc->gfid,priv->base_indices_holder_vgfid)){ - make_index_dir_path (priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR, path, - sizeof (path)); - is_dir = _gf_true; - } else if (!uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) { - make_file_path (priv->index_basepath, - BASE_INDICES_HOLDER_SUBDIR,loc->name, path, - sizeof (path)); } ret = lstat (path, &lstatbuf); @@ -1067,14 +868,10 @@ index_lookup_wrapper (call_frame_t *frame, xlator_t *this, } iatt_from_stat (&stbuf, &lstatbuf); - if (is_dir && !uuid_compare (loc->gfid, priv->xattrop_vgfid)) { + if (is_dir) uuid_copy (stbuf.ia_gfid, priv->xattrop_vgfid); - } else if (is_dir && - !uuid_compare (loc->gfid, priv->base_indices_holder_vgfid)) { - uuid_copy (stbuf.ia_gfid, priv->base_indices_holder_vgfid); - } else { + else uuid_generate (stbuf.ia_gfid); - } stbuf.ia_ino = -1; op_ret = 0; done: @@ -1085,44 +882,6 @@ done: return 0; } -int32_t -base_indices_readdir_wrapper (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t off, dict_t *xdata) -{ - index_priv_t *priv = NULL; - char base_indices_holder[PATH_MAX] = {0}; - DIR *dir = NULL; - int32_t op_ret = -1; - int32_t op_errno = 0; - int count = 0; - gf_dirent_t entries; - - priv = this->private; - - make_index_dir_path (priv->index_basepath, BASE_INDICES_HOLDER_SUBDIR, - base_indices_holder, sizeof (base_indices_holder)); - - dir = opendir (base_indices_holder); - if 
(!dir) { - op_errno = EINVAL; - goto done; - } - - - INIT_LIST_HEAD (&entries.list); - - count = index_fill_readdir (fd, dir, off, size, &entries, - BASE_INDICES_HOLDER); - /* pick ENOENT to indicate EOF */ - op_errno = errno; - op_ret = count; - closedir (dir); -done: - STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, xdata); - gf_dirent_free (&entries); - return 0; -} - int32_t index_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, dict_t *xdata) @@ -1154,8 +913,7 @@ index_readdir_wrapper (call_frame_t *frame, xlator_t *this, goto done; } - count = index_fill_readdir (fd, dir, off, size, &entries, - INDEX_XATTROP); + count = index_fill_readdir (fd, dir, off, size, &entries); /* pick ENOENT to indicate EOF */ op_errno = errno; @@ -1221,11 +979,12 @@ index_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, dict_t *xdata) { call_stub_t *stub = NULL; + index_priv_t *priv = NULL; - if (!name) - goto out; - if (strcmp (GF_XATTROP_INDEX_GFID, name) && - strcmp (GF_BASE_INDICES_HOLDER_GFID, name)) + priv = this->private; + + if (!name || (strcmp (GF_XATTROP_INDEX_GFID, name) && + strcmp (GF_XATTROP_INDEX_COUNT, name))) goto out; stub = fop_getxattr_stub (frame, index_getxattr_wrapper, loc, name, @@ -1252,9 +1011,7 @@ index_lookup (call_frame_t *frame, xlator_t *this, priv = this->private; if (uuid_compare (loc->gfid, priv->xattrop_vgfid) && - uuid_compare (loc->pargfid, priv->xattrop_vgfid) && - uuid_compare (loc->gfid, priv->base_indices_holder_vgfid) && - uuid_compare (loc->pargfid, priv->base_indices_holder_vgfid)) + uuid_compare (loc->pargfid, priv->xattrop_vgfid)) goto normal; stub = fop_lookup_stub (frame, index_lookup_wrapper, loc, xattr_req); @@ -1280,19 +1037,10 @@ index_readdir (call_frame_t *frame, xlator_t *this, index_priv_t *priv = NULL; priv = this->private; - if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid) && - uuid_compare (fd->inode->gfid, 
priv->base_indices_holder_vgfid)) + if (uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) goto out; - - if (!uuid_compare (fd->inode->gfid, priv->xattrop_vgfid)) { - stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, - off, xdata); - } else if (!uuid_compare (fd->inode->gfid, - priv->base_indices_holder_vgfid)) { - stub = fop_readdir_stub (frame, base_indices_readdir_wrapper, - fd, size, off, xdata); - } - + stub = fop_readdir_stub (frame, index_readdir_wrapper, fd, size, off, + xdata); if (!stub) { STACK_UNWIND_STRICT (readdir, frame, -1, ENOMEM, NULL, NULL); return 0; @@ -1396,9 +1144,6 @@ init (xlator_t *this) GF_OPTION_INIT ("index-base", priv->index_basepath, path, out); uuid_generate (priv->index); uuid_generate (priv->xattrop_vgfid); - /*base_indices_holder is a directory which contains hard links to - * all base indices inside indices/xattrop directory*/ - uuid_generate (priv->base_indices_holder_vgfid); INIT_LIST_HEAD (&priv->callstubs); this->private = priv; @@ -1415,7 +1160,6 @@ init (xlator_t *this) } ret = 0; - out: if (ret) { if (cond_inited) diff --git a/xlators/features/index/src/index.h b/xlators/features/index/src/index.h index d6dcb1c23..661dcdbc4 100644 --- a/xlators/features/index/src/index.h +++ b/xlators/features/index/src/index.h @@ -36,28 +36,14 @@ typedef struct index_fd_ctx { DIR *dir; } index_fd_ctx_t; -typedef enum { - sync_not_started, - sync_started, - synced_state, -} to_be_healed_states_t; - -typedef enum { - INDEX_XATTROP, - BASE_INDICES_HOLDER, -} readdir_directory; - typedef struct index_priv { char *index_basepath; uuid_t index; gf_lock_t lock; uuid_t xattrop_vgfid;//virtual gfid of the xattrop index dir - uuid_t base_indices_holder_vgfid; //virtual gfid of the - //to_be_healed_xattrop directory struct list_head callstubs; pthread_mutex_t mutex; pthread_cond_t cond; - to_be_healed_states_t to_be_healed_states; } index_priv_t; #define INDEX_STACK_UNWIND(fop, frame, params ...) 
\ -- cgit From 16151032862af8ee70f14eff57162d829d8d75f9 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Mon, 10 Mar 2014 18:28:02 +0530 Subject: debug/io-stats:fix compile warning Compiler Warning: --------------------------------------------------------- io-stats.c: In function 'io_stats_dump': io-stats.c:950:24: warning: comparison between 'gf1_cli_stats_op' and 'enum gf1_cli_info_op' [-Wenum-compare] if (op == GF_CLI_INFO_ALL || io-stats.c:951:24: warning: comparison between 'gf1_cli_stats_op' and 'enum gf1_cli_info_op' [-Wenum-compare] op == GF_CLI_INFO_CUMULATIVE) ^ --------------------------------------------------------- Fix: Use the appropriate enum in function definition of io_stats_dump(). Note: Using the same BZ ID as the commit that introduced this argument. Change-Id: I24e1aaf9ab86b4f337e3daa729d561ec208f2a95 BUG: 1030580 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/7217 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/debug/io-stats/src/io-stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'xlators') diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c index fa0dd395c..9e48a7c6e 100644 --- a/xlators/debug/io-stats/src/io-stats.c +++ b/xlators/debug/io-stats/src/io-stats.c @@ -929,7 +929,7 @@ ios_global_stats_clear (struct ios_global_stats *stats, struct timeval *now) int io_stats_dump (xlator_t *this, struct ios_dump_args *args, - gf1_cli_stats_op op, gf_boolean_t is_peek) + gf1_cli_info_op op, gf_boolean_t is_peek) { struct ios_conf *conf = NULL; struct ios_global_stats cumulative = {0, }; -- cgit From 53194718bb2aed6b88084cafd9e84a4350663ac6 Mon Sep 17 00:00:00 2001 From: Susant Palai Date: Thu, 27 Feb 2014 06:50:15 +0000 Subject: Glusterd/Remove-brick: Reconfigure the nfs server volfile upon remove-brick start Problem : For remove-brick start operation all client volfiles are reconfigured except nfs server volfile.
Hence, even after layout is fixed by the rebalance process, the nfs clients don't see the change and go on creating directories and files in the decommissioned brick which leads to data loss after remove-brick commit. Solution : Reconfigure the nfs server volfile for remove-brick start credit: kaushal@redhat.com spalai@redhat.com Change-Id: Ib8cd8b45a9e1f888d5e00dff65cdf77c1613a2af BUG: 1070734 Signed-off-by: Susant Palai Reviewed-on: http://review.gluster.org/7162 Reviewed-by: Kaushal M Tested-by: Gluster Build System Reviewed-by: Raghavendra G Reviewed-by: Vijay Bellur --- xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'xlators') diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index ced916ea1..5ab23f2d9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -1931,6 +1931,16 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr) goto out; } + if (GF_OP_CMD_START == cmd && + volinfo->status == GLUSTERD_STATUS_STARTED) { + ret = glusterd_nodesvcs_handle_reconfigure (volinfo); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to reconfigure NFS-Server"); + goto out; + } + } + /* Need to reset the defrag/rebalance status accordingly */ switch (volinfo->rebal.defrag_status) { case GF_DEFRAG_STATUS_FAILED: -- cgit From dc6f7acaeda69ac0765812b3d961197a68ef9bf5 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Sat, 22 Mar 2014 19:30:45 +0530 Subject: encryption/crypt: Add mem-accounting for crypt xlator Without these changes crypt.t crashes when compiled with -DDEBUG Change-Id: I1f7372aa30a09dbe3ae81d1dd598cf36e17fe0b7 BUG: 1030058 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.org/7319 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/encryption/crypt/src/crypt-mem-types.h | 1 + xlators/encryption/crypt/src/crypt.c | 19
+++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'xlators') diff --git a/xlators/encryption/crypt/src/crypt-mem-types.h b/xlators/encryption/crypt/src/crypt-mem-types.h index 799727573..2eab921fc 100644 --- a/xlators/encryption/crypt/src/crypt-mem-types.h +++ b/xlators/encryption/crypt/src/crypt-mem-types.h @@ -24,6 +24,7 @@ enum gf_crypt_mem_types_ { gf_crypt_mt_key, gf_crypt_mt_iovec, gf_crypt_mt_char, + gf_crypt_mt_end, }; #endif /* __CRYPT_MEM_TYPES_H__ */ diff --git a/xlators/encryption/crypt/src/crypt.c b/xlators/encryption/crypt/src/crypt.c index becff3e47..1abdad31d 100644 --- a/xlators/encryption/crypt/src/crypt.c +++ b/xlators/encryption/crypt/src/crypt.c @@ -4380,6 +4380,25 @@ static void crypt_free_private(xlator_t *this) } } +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_crypt_mt_end); + + if (ret != 0) { + gf_log(this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + int32_t reconfigure (xlator_t *this, dict_t *options) { int32_t ret = -1; -- cgit From 17454dfea9f3c4d47fcf0b5370a6155f639c8aeb Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Tue, 25 Feb 2014 09:17:18 +0530 Subject: glusterd: persistent client xlator/ afr changelog names -Add a unique brick-id field to glusterd_brickinfo_t -Persist the id to the brickinfo file -Use the brick-id as the client xlator name during vol create, add-brick and replace-brick operations. -For older volumes,generate the id in-memory during glusterd restore but defer writing it to the brickinfo file until the next volume set operation. -send and receive the brick-ids during peer probe. 
Feature page: www.gluster.org/community/documentation/index.php/Features/persistent-AFR-changelog-xattributes Related patch: http://review.gluster.org/#/c/7122 Change-Id: Ib7f1570004e33f4144476410eec2b84df4e41448 BUG: 1066778 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/7155 Tested-by: Gluster Build System Reviewed-by: Pranith Kumar Karampuri Reviewed-by: Kaushal M Reviewed-by: Vijay Bellur --- xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 7 ++++ xlators/mgmt/glusterd/src/glusterd-replace-brick.c | 3 ++ xlators/mgmt/glusterd/src/glusterd-store.c | 21 +++++++++- xlators/mgmt/glusterd/src/glusterd-store.h | 1 + xlators/mgmt/glusterd/src/glusterd-utils.c | 46 ++++++++++++++++++++++ xlators/mgmt/glusterd/src/glusterd-utils.h | 7 ++++ xlators/mgmt/glusterd/src/glusterd-volgen.c | 2 +- xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 7 ++++ xlators/mgmt/glusterd/src/glusterd.h | 1 + 9 files changed, 92 insertions(+), 3 deletions(-) (limited to 'xlators') diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 5ab23f2d9..f15ec7b18 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -995,6 +995,7 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, gf_boolean_t restart_needed = 0; char msg[1024] __attribute__((unused)) = {0, }; int caps = 0; + int brickid = 0; GF_ASSERT (volinfo); @@ -1022,11 +1023,17 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, "type is set %d, need to change it", type); } + brickid = glusterd_get_next_available_brickid (volinfo); + if (brickid < 0) + goto out; while ( i <= count) { ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo); if (ret) goto out; + GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo, + brickid++); + ret = glusterd_resolve_brick (brickinfo); if (ret) goto out; diff --git 
a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c index 9685cb374..e78eff44d 100644 --- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c +++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c @@ -1516,6 +1516,9 @@ glusterd_op_perform_replace_brick (glusterd_volinfo_t *volinfo, if (ret) goto out; + strncpy (new_brickinfo->brick_id, old_brickinfo->brick_id, + sizeof (new_brickinfo->brick_id)); + list_add_tail (&new_brickinfo->brick_list, &old_brickinfo->brick_list); diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index 0ee430969..37cf98894 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -241,6 +241,11 @@ glusterd_store_brickinfo_write (int fd, glusterd_brickinfo_t *brickinfo) if (ret) goto out; + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_ID, + brickinfo->brick_id); + if (ret) + goto out; + if (!brickinfo->vg[0]) goto out; @@ -1493,7 +1498,6 @@ out: return ret; } - int32_t glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) { @@ -1511,6 +1515,7 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) gf_store_iter_t *tmpiter = NULL; char *tmpvalue = NULL; struct pmap_registry *pmap = NULL; + int brickid = 0; gf_store_op_errno_t op_errno = GD_STORE_SUCCESS; GF_ASSERT (volinfo); @@ -1606,6 +1611,9 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) strlen (GLUSTERD_STORE_KEY_BRICK_VGNAME))) { strncpy (brickinfo->vg, value, sizeof (brickinfo->vg)); + } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_ID)) { + strncpy (brickinfo->brick_id, value, + sizeof (brickinfo->brick_id)); } else { gf_log ("", GF_LOG_ERROR, "Unknown key: %s", key); @@ -1620,13 +1628,22 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) &op_errno); } - if (op_errno != GD_STORE_EOF) + if (op_errno != GD_STORE_EOF) { + gf_log ("", GF_LOG_ERROR, "Error parsing brickinfo: " + 
"op_errno=%d", op_errno); goto out; + } ret = gf_store_iter_destroy (iter); if (ret) goto out; + if (brickinfo->brick_id[0] == '\0') { + /* This is an old volume upgraded to op_version 4 */ + GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo, + brickid++); + } + list_add_tail (&brickinfo->brick_list, &volinfo->bricks); brick_count++; } diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index fadea8b2f..955abb09f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -65,6 +65,7 @@ typedef enum glusterd_store_ver_ac_{ #define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port" #define GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED "decommissioned" #define GLUSTERD_STORE_KEY_BRICK_VGNAME "vg" +#define GLUSTERD_STORE_KEY_BRICK_ID "brick-id" #define GLUSTERD_STORE_KEY_PEER_UUID "uuid" #define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname" diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index 6393c554e..0bec8c06b 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -671,6 +671,30 @@ out: return ret; } +int +glusterd_get_next_available_brickid (glusterd_volinfo_t *volinfo) +{ + glusterd_brickinfo_t *brickinfo = NULL; + char *token = NULL; + int brickid = 0; + int max_brickid = -1; + int ret = -1; + + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { + token = strrchr (brickinfo->brick_id, '-'); + ret = gf_string2int32 (++token, &brickid); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_ERROR, + "Unable to generate brick ID"); + return ret; + } + if (brickid > max_brickid) + max_brickid = brickid; + } + + return max_brickid + 1 ; +} + int32_t glusterd_resolve_brick (glusterd_brickinfo_t *brickinfo) { @@ -2131,6 +2155,13 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + memset (key, 0, sizeof (key)); + snprintf (key, sizeof 
(key), "volume%d.brick%d.brick_id", + count, i); + ret = dict_set_str (dict, key, brickinfo->brick_id); + if (ret) + goto out; + i++; } @@ -2805,6 +2836,7 @@ glusterd_import_new_brick (dict_t *vols, int32_t vol_count, int ret = -1; char *hostname = NULL; char *path = NULL; + char *brick_id = NULL; int decommissioned = 0; glusterd_brickinfo_t *new_brickinfo = NULL; char msg[2048] = {0}; @@ -2831,6 +2863,11 @@ glusterd_import_new_brick (dict_t *vols, int32_t vol_count, goto out; } + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.brick%d.brick_id", + vol_count, brick_count); + ret = dict_get_str (vols, key, &brick_id); + memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "volume%d.brick%d.decommissioned", vol_count, brick_count); @@ -2847,6 +2884,8 @@ glusterd_import_new_brick (dict_t *vols, int32_t vol_count, strcpy (new_brickinfo->path, path); strcpy (new_brickinfo->hostname, hostname); new_brickinfo->decommissioned = decommissioned; + if (brick_id) + strcpy (new_brickinfo->brick_id, brick_id); //peerinfo might not be added yet (void) glusterd_resolve_brick (new_brickinfo); ret = 0; @@ -2864,6 +2903,7 @@ glusterd_import_bricks (dict_t *vols, int32_t vol_count, { int ret = -1; int brick_count = 1; + int brickid = 0; glusterd_brickinfo_t *new_brickinfo = NULL; GF_ASSERT (vols); @@ -2875,6 +2915,12 @@ glusterd_import_bricks (dict_t *vols, int32_t vol_count, &new_brickinfo); if (ret) goto out; + if (new_brickinfo->brick_id[0] == '\0') + /*We were probed from a peer having op-version + less than GD_OP_VER_PERSISTENT_AFR_XATTRS*/ + GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (new_brickinfo, + new_volinfo, + brickid++); list_add_tail (&new_brickinfo->brick_list, &new_volinfo->bricks); brick_count++; } diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h index cd22b2960..aebf5fcef 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.h +++ b/xlators/mgmt/glusterd/src/glusterd-utils.h @@ -29,6 +29,10 
@@ #include "protocol-common.h" #define GLUSTERD_SOCK_DIR "/var/run" +#define GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brickinfo, volinfo, brickid) do {\ + sprintf (brickinfo->brick_id, "%s-client-%d",\ + volinfo->volname, brickid);\ +} while (0) struct glusterd_lock_ { uuid_t owner; @@ -124,6 +128,9 @@ int32_t glusterd_service_stop(const char *service, char *pidfile, int sig, gf_boolean_t force_kill); +int +glusterd_get_next_available_brickid (glusterd_volinfo_t *volinfo); + int32_t glusterd_resolve_brick (glusterd_brickinfo_t *brickinfo); diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 001825941..9012003c9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -2151,7 +2151,7 @@ volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, list_for_each_entry (brick, &volinfo->bricks, brick_list) { ret = -1; xl = volgen_graph_add_nolink (graph, "protocol/client", - "%s-client-%d", volname, i); + "%s", brick->brick_id); if (!xl) goto out; ret = xlator_set_option (xl, "remote-host", brick->hostname); diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 4acea7686..135faa40a 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -1488,6 +1488,7 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) char *username = NULL; char *password = NULL; int caps = 0; + int brickid = 0; char msg[1024] __attribute__((unused)) = {0, }; this = THIS; @@ -1653,11 +1654,17 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) brick = strtok_r (brick_list+1, " \n", &saveptr); caps = CAPS_BD | CAPS_THIN | CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT; + brickid = glusterd_get_next_available_brickid (volinfo); + if (brickid < 0) + goto out; while ( i <= count) { ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo); 
if (ret) goto out; + GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo, + brickid++); + ret = glusterd_resolve_brick (brickinfo); if (ret) { gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK, diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index df53327cb..0694f7386 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -175,6 +175,7 @@ typedef enum gf_brick_status { struct glusterd_brickinfo { char hostname[1024]; char path[PATH_MAX]; + char brick_id[1024];/*Client xlator name, AFR changelog name*/ struct list_head brick_list; uuid_t uuid; int port; -- cgit