From f75b76350747f5f58a4bbe704915c74979cc5ac3 Mon Sep 17 00:00:00 2001 From: Amar Tumballi Date: Tue, 4 May 2010 00:36:24 +0000 Subject: structuring of protocol - 2 * 'transports/' and 'auth/' moved to xlators/protocol/ * transport.{c,h}, authenticate.{c,h}, protocol.h moved to xlators/protocol/lib/src/ Signed-off-by: Amar Tumballi Signed-off-by: Anand V. Avati BUG: 875 (Implement a new protocol to provide proper backward/forward compatibility) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=875 --- Makefile.am | 2 +- auth/Makefile.am | 3 - auth/addr/Makefile.am | 3 - auth/addr/src/Makefile.am | 12 - auth/addr/src/addr.c | 224 -- auth/login/Makefile.am | 3 - auth/login/src/Makefile.am | 13 - auth/login/src/login.c | 114 - configure.ac | 22 +- libglusterfs/src/Makefile.am | 6 +- libglusterfs/src/authenticate.c | 250 -- libglusterfs/src/authenticate.h | 61 - libglusterfs/src/protocol.h | 1114 --------- libglusterfs/src/transport.c | 422 ---- libglusterfs/src/transport.h | 106 - transport/Makefile.am | 3 - transport/ib-verbs/Makefile.am | 1 - transport/ib-verbs/src/Makefile.am | 15 - transport/ib-verbs/src/ib-verbs-mem-types.h | 39 - transport/ib-verbs/src/ib-verbs.c | 2613 -------------------- transport/ib-verbs/src/ib-verbs.h | 220 -- transport/ib-verbs/src/name.c | 712 ------ transport/ib-verbs/src/name.h | 47 - transport/socket/Makefile.am | 1 - transport/socket/src/Makefile.am | 14 - transport/socket/src/name.c | 737 ------ transport/socket/src/name.h | 44 - transport/socket/src/socket-mem-types.h | 36 - transport/socket/src/socket.c | 1552 ------------ transport/socket/src/socket.h | 125 - xlators/nfs/lib/src/rpcsvc.h | 1 - xlators/protocol/Makefile.am | 2 +- xlators/protocol/auth/Makefile.am | 3 + xlators/protocol/auth/addr/Makefile.am | 3 + xlators/protocol/auth/addr/src/Makefile.am | 14 + xlators/protocol/auth/addr/src/addr.c | 224 ++ xlators/protocol/auth/login/Makefile.am | 3 + xlators/protocol/auth/login/src/Makefile.am | 15 + 
xlators/protocol/auth/login/src/login.c | 114 + xlators/protocol/client/src/Makefile.am | 6 +- xlators/protocol/lib/Makefile.am | 3 + xlators/protocol/lib/src/Makefile.am | 15 + xlators/protocol/lib/src/authenticate.c | 250 ++ xlators/protocol/lib/src/authenticate.h | 61 + xlators/protocol/lib/src/protocol.h | 1114 +++++++++ xlators/protocol/lib/src/transport.c | 422 ++++ xlators/protocol/lib/src/transport.h | 106 + xlators/protocol/server/src/Makefile.am | 5 +- xlators/protocol/transport/Makefile.am | 3 + xlators/protocol/transport/ib-verbs/Makefile.am | 1 + .../protocol/transport/ib-verbs/src/Makefile.am | 19 + .../transport/ib-verbs/src/ib-verbs-mem-types.h | 39 + xlators/protocol/transport/ib-verbs/src/ib-verbs.c | 2613 ++++++++++++++++++++ xlators/protocol/transport/ib-verbs/src/ib-verbs.h | 220 ++ xlators/protocol/transport/ib-verbs/src/name.c | 712 ++++++ xlators/protocol/transport/ib-verbs/src/name.h | 47 + xlators/protocol/transport/socket/Makefile.am | 1 + xlators/protocol/transport/socket/src/Makefile.am | 19 + xlators/protocol/transport/socket/src/name.c | 737 ++++++ xlators/protocol/transport/socket/src/name.h | 44 + .../transport/socket/src/socket-mem-types.h | 36 + xlators/protocol/transport/socket/src/socket.c | 1552 ++++++++++++ xlators/protocol/transport/socket/src/socket.h | 125 + 63 files changed, 8539 insertions(+), 8504 deletions(-) delete mode 100644 auth/Makefile.am delete mode 100644 auth/addr/Makefile.am delete mode 100644 auth/addr/src/Makefile.am delete mode 100644 auth/addr/src/addr.c delete mode 100644 auth/login/Makefile.am delete mode 100644 auth/login/src/Makefile.am delete mode 100644 auth/login/src/login.c delete mode 100644 libglusterfs/src/authenticate.c delete mode 100644 libglusterfs/src/authenticate.h delete mode 100644 libglusterfs/src/protocol.h delete mode 100644 libglusterfs/src/transport.c delete mode 100644 libglusterfs/src/transport.h delete mode 100644 transport/Makefile.am delete mode 100644 
transport/ib-verbs/Makefile.am delete mode 100644 transport/ib-verbs/src/Makefile.am delete mode 100644 transport/ib-verbs/src/ib-verbs-mem-types.h delete mode 100644 transport/ib-verbs/src/ib-verbs.c delete mode 100644 transport/ib-verbs/src/ib-verbs.h delete mode 100644 transport/ib-verbs/src/name.c delete mode 100644 transport/ib-verbs/src/name.h delete mode 100644 transport/socket/Makefile.am delete mode 100644 transport/socket/src/Makefile.am delete mode 100644 transport/socket/src/name.c delete mode 100644 transport/socket/src/name.h delete mode 100644 transport/socket/src/socket-mem-types.h delete mode 100644 transport/socket/src/socket.c delete mode 100644 transport/socket/src/socket.h create mode 100644 xlators/protocol/auth/Makefile.am create mode 100644 xlators/protocol/auth/addr/Makefile.am create mode 100644 xlators/protocol/auth/addr/src/Makefile.am create mode 100644 xlators/protocol/auth/addr/src/addr.c create mode 100644 xlators/protocol/auth/login/Makefile.am create mode 100644 xlators/protocol/auth/login/src/Makefile.am create mode 100644 xlators/protocol/auth/login/src/login.c create mode 100644 xlators/protocol/lib/Makefile.am create mode 100644 xlators/protocol/lib/src/Makefile.am create mode 100644 xlators/protocol/lib/src/authenticate.c create mode 100644 xlators/protocol/lib/src/authenticate.h create mode 100644 xlators/protocol/lib/src/protocol.h create mode 100644 xlators/protocol/lib/src/transport.c create mode 100644 xlators/protocol/lib/src/transport.h create mode 100644 xlators/protocol/transport/Makefile.am create mode 100644 xlators/protocol/transport/ib-verbs/Makefile.am create mode 100644 xlators/protocol/transport/ib-verbs/src/Makefile.am create mode 100644 xlators/protocol/transport/ib-verbs/src/ib-verbs-mem-types.h create mode 100644 xlators/protocol/transport/ib-verbs/src/ib-verbs.c create mode 100644 xlators/protocol/transport/ib-verbs/src/ib-verbs.h create mode 100644 xlators/protocol/transport/ib-verbs/src/name.c create 
mode 100644 xlators/protocol/transport/ib-verbs/src/name.h create mode 100644 xlators/protocol/transport/socket/Makefile.am create mode 100644 xlators/protocol/transport/socket/src/Makefile.am create mode 100644 xlators/protocol/transport/socket/src/name.c create mode 100644 xlators/protocol/transport/socket/src/name.h create mode 100644 xlators/protocol/transport/socket/src/socket-mem-types.h create mode 100644 xlators/protocol/transport/socket/src/socket.c create mode 100644 xlators/protocol/transport/socket/src/socket.h diff --git a/Makefile.am b/Makefile.am index ad69e447..0c5fc558 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,6 +1,6 @@ EXTRA_DIST = autogen.sh COPYING INSTALL README AUTHORS THANKS NEWS glusterfs.spec -SUBDIRS = argp-standalone libglusterfs xlators transport auth glusterfsd $(FUSERMOUNT_SUBDIR) doc extras +SUBDIRS = argp-standalone libglusterfs xlators glusterfsd $(FUSERMOUNT_SUBDIR) doc extras CLEANFILES = diff --git a/auth/Makefile.am b/auth/Makefile.am deleted file mode 100644 index 6bd54eee..00000000 --- a/auth/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = addr login - -CLEANFILES = diff --git a/auth/addr/Makefile.am b/auth/addr/Makefile.am deleted file mode 100644 index d471a3f9..00000000 --- a/auth/addr/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/auth/addr/src/Makefile.am b/auth/addr/src/Makefile.am deleted file mode 100644 index cca40615..00000000 --- a/auth/addr/src/Makefile.am +++ /dev/null @@ -1,12 +0,0 @@ -auth_LTLIBRARIES = addr.la -authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth - -addr_la_LDFLAGS = -module -avoidversion - -addr_la_SOURCES = addr.c -addr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = diff --git a/auth/addr/src/addr.c b/auth/addr/src/addr.c deleted file mode 100644 index 
a8803a39..00000000 --- a/auth/addr/src/addr.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - Copyright (c) 2007-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include -#include -#include -#include "authenticate.h" -#include "dict.h" - -#define ADDR_DELIMITER " ," -#define PRIVILEGED_PORT_CEILING 1024 - -#ifndef AF_INET_SDP -#define AF_INET_SDP 27 -#endif - -auth_result_t -gf_auth (dict_t *input_params, dict_t *config_params) -{ - int ret = 0; - char *name = NULL; - char *searchstr = NULL; - char peer_addr[UNIX_PATH_MAX]; - data_t *peer_info_data = NULL; - peer_info_t *peer_info = NULL; - data_t *allow_addr = NULL, *reject_addr = NULL; - char is_inet_sdp = 0; - - name = data_to_str (dict_get (input_params, "remote-subvolume")); - if (!name) { - gf_log ("authenticate/addr", - GF_LOG_ERROR, - "remote-subvolume not specified"); - return AUTH_DONT_CARE; - } - - ret = asprintf (&searchstr, "auth.addr.%s.allow", name); - if (-1 == ret) { - gf_log ("auth/addr", GF_LOG_ERROR, - "asprintf failed while setting search string"); - return AUTH_DONT_CARE; - } - allow_addr = dict_get (config_params, - searchstr); - free (searchstr); - - ret = asprintf (&searchstr, "auth.addr.%s.reject", name); - if (-1 == ret) { - gf_log ("auth/addr", GF_LOG_ERROR, - "asprintf failed while setting search string"); - return 
AUTH_DONT_CARE; - } - reject_addr = dict_get (config_params, - searchstr); - free (searchstr); - - if (!allow_addr) { - /* TODO: backword compatibility */ - ret = asprintf (&searchstr, "auth.ip.%s.allow", name); - if (-1 == ret) { - gf_log ("auth/addr", GF_LOG_ERROR, - "asprintf failed while setting search string"); - return AUTH_DONT_CARE; - } - allow_addr = dict_get (config_params, searchstr); - free (searchstr); - } - - if (!(allow_addr || reject_addr)) { - gf_log ("auth/addr", GF_LOG_DEBUG, - "none of the options auth.addr.%s.allow or " - "auth.addr.%s.reject specified, returning auth_dont_care", - name, name); - return AUTH_DONT_CARE; - } - - peer_info_data = dict_get (input_params, "peer-info"); - if (!peer_info_data) { - gf_log ("authenticate/addr", - GF_LOG_ERROR, - "peer-info not present"); - return AUTH_DONT_CARE; - } - - peer_info = data_to_ptr (peer_info_data); - - switch (((struct sockaddr *) &peer_info->sockaddr)->sa_family) - { - case AF_INET_SDP: - is_inet_sdp = 1; - ((struct sockaddr *) &peer_info->sockaddr)->sa_family = AF_INET; - - case AF_INET: - case AF_INET6: - { - char *service; - uint16_t peer_port; - strcpy (peer_addr, peer_info->identifier); - service = strrchr (peer_addr, ':'); - *service = '\0'; - service ++; - - if (is_inet_sdp) { - ((struct sockaddr *) &peer_info->sockaddr)->sa_family = AF_INET_SDP; - } - - peer_port = atoi (service); - if (peer_port >= PRIVILEGED_PORT_CEILING) { - gf_log ("auth/addr", GF_LOG_ERROR, - "client is bound to port %d which is not privileged", - peer_port); - return AUTH_DONT_CARE; - } - break; - - case AF_UNIX: - strcpy (peer_addr, peer_info->identifier); - break; - - default: - gf_log ("authenticate/addr", GF_LOG_ERROR, - "unknown address family %d", - ((struct sockaddr *) &peer_info->sockaddr)->sa_family); - return AUTH_DONT_CARE; - } - } - - if (reject_addr) { - char *addr_str = NULL; - char *tmp; - char *addr_cpy = strdup (reject_addr->data); - - addr_str = strtok_r (addr_cpy, ADDR_DELIMITER, &tmp); - - 
while (addr_str) { - char negate = 0, match =0; - gf_log (name, GF_LOG_DEBUG, - "rejected = \"%s\", received addr = \"%s\"", - addr_str, peer_addr); - if (addr_str[0] == '!') { - negate = 1; - addr_str++; - } - - match = fnmatch (addr_str, - peer_addr, - 0); - if (negate ? match : !match) { - free (addr_cpy); - return AUTH_REJECT; - } - addr_str = strtok_r (NULL, ADDR_DELIMITER, &tmp); - } - free (addr_cpy); - } - - if (allow_addr) { - char *addr_str = NULL; - char *tmp; - char *addr_cpy = strdup (allow_addr->data); - - addr_str = strtok_r (addr_cpy, ADDR_DELIMITER, &tmp); - - while (addr_str) { - char negate = 0, match = 0; - gf_log (name, GF_LOG_DEBUG, - "allowed = \"%s\", received addr = \"%s\"", - addr_str, peer_addr); - if (addr_str[0] == '!') { - negate = 1; - addr_str++; - } - - match = fnmatch (addr_str, - peer_addr, - 0); - - if (negate ? match : !match) { - free (addr_cpy); - return AUTH_ACCEPT; - } - addr_str = strtok_r (NULL, ADDR_DELIMITER, &tmp); - } - free (addr_cpy); - } - - return AUTH_DONT_CARE; -} - -struct volume_options options[] = { - { .key = {"auth.addr.*.allow"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"auth.addr.*.reject"}, - .type = GF_OPTION_TYPE_ANY - }, - /* Backword compatibility */ - { .key = {"auth.ip.*.allow"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {NULL} } -}; diff --git a/auth/login/Makefile.am b/auth/login/Makefile.am deleted file mode 100644 index d471a3f9..00000000 --- a/auth/login/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/auth/login/src/Makefile.am b/auth/login/src/Makefile.am deleted file mode 100644 index eb7b990c..00000000 --- a/auth/login/src/Makefile.am +++ /dev/null @@ -1,13 +0,0 @@ -auth_LTLIBRARIES = login.la -authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth - -login_la_LDFLAGS = -module -avoidversion - -login_la_SOURCES = login.c -login_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE 
-Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = diff --git a/auth/login/src/login.c b/auth/login/src/login.c deleted file mode 100644 index 0c85292f..00000000 --- a/auth/login/src/login.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - Copyright (c) 2007-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include -#include "authenticate.h" - -auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) -{ - int ret = 0; - char *username = NULL, *password = NULL; - data_t *allow_user = NULL, *username_data = NULL, *password_data = NULL; - int32_t result = AUTH_DONT_CARE; - char *brick_name = NULL, *searchstr = NULL; - - username_data = dict_get (input_params, "username"); - if (!username_data) - return AUTH_DONT_CARE; - - username = data_to_str (username_data); - - password_data = dict_get (input_params, "password"); - if (!password_data) - return AUTH_DONT_CARE; - - password = data_to_str (password_data); - - brick_name = data_to_str (dict_get (input_params, "remote-subvolume")); - if (!brick_name) { - gf_log ("auth/login", - GF_LOG_ERROR, - "remote-subvolume not specified"); - return AUTH_REJECT; - } - - ret = asprintf (&searchstr, "auth.login.%s.allow", brick_name); - if (-1 == ret) { - gf_log ("auth/login", GF_LOG_ERROR, - "asprintf failed 
while setting search string"); - return AUTH_DONT_CARE; - } - - allow_user = dict_get (config_params, - searchstr); - free (searchstr); - - if (allow_user) { - char *username_str = NULL; - char *tmp; - char *username_cpy = strdup (allow_user->data); - - username_str = strtok_r (username_cpy, " ,", &tmp); - - while (username_str) { - data_t *passwd_data = NULL; - if (!fnmatch (username_str, - username, - 0)) { - ret = asprintf (&searchstr, "auth.login.%s.password", username); - if (-1 == ret) { - gf_log ("auth/login", GF_LOG_ERROR, - "asprintf failed while setting search string"); - return AUTH_DONT_CARE; - } - passwd_data = dict_get (config_params, searchstr); - FREE (searchstr); - - if (!passwd_data) { - gf_log ("auth/login", - GF_LOG_DEBUG, - "wrong username/password combination"); - result = AUTH_REJECT; - } - else - result = !strcmp (data_to_str (passwd_data), password) ? AUTH_ACCEPT : AUTH_REJECT; - break; - } - username_str = strtok_r (NULL, " ,", &tmp); - } - free (username_cpy); - } - - return result; -} - -struct volume_options options[] = { - { .key = {"auth.login.*.allow"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"auth.login.*.password"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {NULL} } -}; diff --git a/configure.ac b/configure.ac index b94e4307..88b8ada4 100644 --- a/configure.ac +++ b/configure.ac @@ -65,10 +65,22 @@ AC_CONFIG_FILES([Makefile xlators/debug/io-stats/Makefile xlators/debug/io-stats/src/Makefile xlators/protocol/Makefile + xlators/protocol/lib/Makefile + xlators/protocol/lib/src/Makefile + xlators/protocol/transport/Makefile + xlators/protocol/transport/socket/Makefile + xlators/protocol/transport/socket/src/Makefile + xlators/protocol/transport/ib-verbs/Makefile + xlators/protocol/transport/ib-verbs/src/Makefile xlators/protocol/client/Makefile xlators/protocol/client/src/Makefile xlators/protocol/server/Makefile xlators/protocol/server/src/Makefile + xlators/protocol/auth/Makefile + xlators/protocol/auth/addr/Makefile + 
xlators/protocol/auth/addr/src/Makefile + xlators/protocol/auth/login/Makefile + xlators/protocol/auth/login/src/Makefile xlators/features/Makefile xlators/features/locks/Makefile xlators/features/locks/src/Makefile @@ -81,16 +93,6 @@ AC_CONFIG_FILES([Makefile xlators/encryption/Makefile xlators/encryption/rot-13/Makefile xlators/encryption/rot-13/src/Makefile - transport/Makefile - transport/socket/Makefile - transport/socket/src/Makefile - transport/ib-verbs/Makefile - transport/ib-verbs/src/Makefile - auth/Makefile - auth/addr/Makefile - auth/addr/src/Makefile - auth/login/Makefile - auth/login/src/Makefile doc/Makefile doc/examples/Makefile doc/hacker-guide/Makefile diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am index 82c634de..7cd1876e 100644 --- a/libglusterfs/src/Makefile.am +++ b/libglusterfs/src/Makefile.am @@ -1,14 +1,14 @@ libglusterfs_la_CFLAGS = -fPIC -Wall -g -shared -nostartfiles $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) -libglusterfs_la_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -DXLATORDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator\" -DSCHEDULERDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler\" -DTRANSPORTDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/transport\" -D$(GF_HOST_OS) -DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" -I$(CONTRIBDIR)/rbtree +libglusterfs_la_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -DXLATORDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator\" -D$(GF_HOST_OS) -DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" -I$(CONTRIBDIR)/rbtree -DSCHEDULERDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler\" libglusterfs_la_LIBADD = @LEXLIB@ lib_LTLIBRARIES = libglusterfs.la -libglusterfs_la_SOURCES = dict.c spec.lex.c y.tab.c xlator.c logging.c hashfn.c defaults.c scheduler.c common-utils.c transport.c timer.c inode.c call-stub.c compat.c authenticate.c fd.c compat-errno.c event.c mem-pool.c gf-dirent.c syscall.c iobuf.c 
globals.c statedump.c stack.c checksum.c md5.c $(CONTRIBDIR)/rbtree/rb.c rbthash.c latency.c +libglusterfs_la_SOURCES = dict.c spec.lex.c y.tab.c xlator.c logging.c hashfn.c defaults.c scheduler.c common-utils.c timer.c inode.c call-stub.c compat.c fd.c compat-errno.c event.c mem-pool.c gf-dirent.c syscall.c iobuf.c globals.c statedump.c stack.c checksum.c md5.c $(CONTRIBDIR)/rbtree/rb.c rbthash.c latency.c -noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h logging.h protocol.h scheduler.h xlator.h transport.h stack.h timer.h list.h inode.h call-stub.h compat.h authenticate.h fd.h revision.h compat-errno.h event.h mem-pool.h byte-order.h gf-dirent.h locking.h syscall.h iobuf.h globals.h statedump.h checksum.h md5.h $(CONTRIBDIR)/rbtree/rb.h rbthash.h iatt.h latency.h +noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h logging.h scheduler.h xlator.h stack.h timer.h list.h inode.h call-stub.h compat.h fd.h revision.h compat-errno.h event.h mem-pool.h byte-order.h gf-dirent.h locking.h syscall.h iobuf.h globals.h statedump.h checksum.h md5.h $(CONTRIBDIR)/rbtree/rb.h rbthash.h iatt.h latency.h EXTRA_DIST = spec.l spec.y diff --git a/libglusterfs/src/authenticate.c b/libglusterfs/src/authenticate.c deleted file mode 100644 index eb0e2464..00000000 --- a/libglusterfs/src/authenticate.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - Copyright (c) 2007-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#include -#include -#include -#include "authenticate.h" - -static void -init (dict_t *this, - char *key, - data_t *value, - void *data) -{ - void *handle = NULL; - char *auth_file = NULL; - auth_handle_t *auth_handle = NULL; - auth_fn_t authenticate = NULL; - int *error = NULL; - int ret = 0; - - /* It gets over written */ - error = data; - - if (!strncasecmp (key, "ip", strlen ("ip"))) { - gf_log ("authenticate", GF_LOG_ERROR, - "AUTHENTICATION MODULE \"IP\" HAS BEEN REPLACED " - "BY \"ADDR\""); - dict_set (this, key, data_from_dynptr (NULL, 0)); - /* TODO: 1.3.x backword compatibility */ - // *error = -1; - // return; - key = "addr"; - } - - ret = gf_asprintf (&auth_file, "%s/%s.so", LIBDIR, key); - if (-1 == ret) { - gf_log ("authenticate", GF_LOG_ERROR, "asprintf failed"); - dict_set (this, key, data_from_dynptr (NULL, 0)); - *error = -1; - return; - } - - handle = dlopen (auth_file, RTLD_LAZY); - if (!handle) { - gf_log ("authenticate", GF_LOG_ERROR, "dlopen(%s): %s\n", - auth_file, dlerror ()); - dict_set (this, key, data_from_dynptr (NULL, 0)); - GF_FREE (auth_file); - *error = -1; - return; - } - GF_FREE (auth_file); - - authenticate = dlsym (handle, "gf_auth"); - if (!authenticate) { - gf_log ("authenticate", GF_LOG_ERROR, - "dlsym(gf_auth) on %s\n", dlerror ()); - dict_set (this, key, data_from_dynptr (NULL, 0)); - *error = -1; - return; - } - - auth_handle = GF_CALLOC (1, sizeof (*auth_handle), - gf_common_mt_auth_handle_t); - if (!auth_handle) { - gf_log ("authenticate", GF_LOG_ERROR, "Out of memory"); - dict_set (this, key, data_from_dynptr (NULL, 0)); - *error = -1; - return; - } - auth_handle->vol_opt = GF_CALLOC (1, sizeof (volume_opt_list_t), - gf_common_mt_volume_opt_list_t); - auth_handle->vol_opt->given_opt = 
dlsym (handle, "options"); - if (auth_handle->vol_opt->given_opt == NULL) { - gf_log ("authenticate", GF_LOG_DEBUG, - "volume option validation not specified"); - } - - auth_handle->authenticate = authenticate; - auth_handle->handle = handle; - - dict_set (this, key, - data_from_dynptr (auth_handle, sizeof (*auth_handle))); -} - -static void -fini (dict_t *this, - char *key, - data_t *value, - void *data) -{ - auth_handle_t *handle = data_to_ptr (value); - if (handle) { - dlclose (handle->handle); - } -} - -int32_t -gf_auth_init (xlator_t *xl, dict_t *auth_modules) -{ - int ret = 0; - auth_handle_t *handle = NULL; - data_pair_t *pair = NULL; - dict_foreach (auth_modules, init, &ret); - if (!ret) { - pair = auth_modules->members_list; - while (pair) { - handle = data_to_ptr (pair->value); - if (handle) { - list_add_tail (&(handle->vol_opt->list), - &(xl->volume_options)); - if (-1 == - validate_xlator_volume_options (xl, - handle->vol_opt->given_opt)) { - gf_log ("authenticate", GF_LOG_ERROR, - "volume option validation " - "failed"); - ret = -1; - } - } - pair = pair->next; - } - } - if (ret) { - gf_log (xl->name, GF_LOG_ERROR, "authentication init failed"); - dict_foreach (auth_modules, fini, &ret); - ret = -1; - } - return ret; -} - -static dict_t *__input_params; -static dict_t *__config_params; - -void -map (dict_t *this, - char *key, - data_t *value, - void *data) -{ - dict_t *res = data; - auth_fn_t authenticate; - auth_handle_t *handle = NULL; - - if (value && (handle = data_to_ptr (value)) && - (authenticate = handle->authenticate)) { - dict_set (res, key, - int_to_data (authenticate (__input_params, - __config_params))); - } else { - dict_set (res, key, int_to_data (AUTH_DONT_CARE)); - } -} - -void -reduce (dict_t *this, - char *key, - data_t *value, - void *data) -{ - int64_t val = 0; - int64_t *res = data; - if (!data) - return; - - val = data_to_int64 (value); - switch (val) - { - case AUTH_ACCEPT: - if (AUTH_DONT_CARE == *res) - *res = AUTH_ACCEPT; - 
break; - - case AUTH_REJECT: - *res = AUTH_REJECT; - break; - - case AUTH_DONT_CARE: - break; - } -} - - -auth_result_t -gf_authenticate (dict_t *input_params, - dict_t *config_params, - dict_t *auth_modules) -{ - dict_t *results = NULL; - int64_t result = AUTH_DONT_CARE; - - results = get_new_dict (); - __input_params = input_params; - __config_params = config_params; - - dict_foreach (auth_modules, map, results); - - dict_foreach (results, reduce, &result); - if (AUTH_DONT_CARE == result) { - data_t *peerinfo_data = dict_get (input_params, "peer-info"); - char *name = NULL; - - if (peerinfo_data) { - peer_info_t *peerinfo = data_to_ptr (peerinfo_data); - name = peerinfo->identifier; - } - - gf_log ("auth", GF_LOG_ERROR, - "no authentication module is interested in " - "accepting remote-client %s", name); - result = AUTH_REJECT; - } - - dict_destroy (results); - return result; -} - -void -gf_auth_fini (dict_t *auth_modules) -{ - int32_t dummy; - - dict_foreach (auth_modules, fini, &dummy); -} diff --git a/libglusterfs/src/authenticate.h b/libglusterfs/src/authenticate.h deleted file mode 100644 index 8931f62e..00000000 --- a/libglusterfs/src/authenticate.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - Copyright (c) 2007-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . 
-*/ - -#ifndef _AUTHENTICATE_H -#define _AUTHENTICATE_H - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#include -#include -#include "dict.h" -#include "compat.h" -#include "list.h" -#include "transport.h" -#include "xlator.h" - -typedef enum { - AUTH_ACCEPT, - AUTH_REJECT, - AUTH_DONT_CARE -} auth_result_t; - -typedef auth_result_t (*auth_fn_t) (dict_t *input_params, - dict_t *config_params); - -typedef struct { - void *handle; - auth_fn_t authenticate; - volume_opt_list_t *vol_opt; -} auth_handle_t; - -auth_result_t gf_authenticate (dict_t *input_params, - dict_t *config_params, - dict_t *auth_modules); -int32_t gf_auth_init (xlator_t *xl, dict_t *auth_modules); -void gf_auth_fini (dict_t *auth_modules); - -#endif /* _AUTHENTICATE_H */ diff --git a/libglusterfs/src/protocol.h b/libglusterfs/src/protocol.h deleted file mode 100644 index 6fd291bb..00000000 --- a/libglusterfs/src/protocol.h +++ /dev/null @@ -1,1114 +0,0 @@ -/* - Copyright (c) 2006-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . 
-*/ - -#ifndef _PROTOCOL_H -#define _PROTOCOL_H - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include -#include -#include -#include -#include -#include -#include - -#include "byte-order.h" -#include "iatt.h" - -/* Any changes in the protocol structure or adding new '[f,m]ops' needs to - * bump the protocol version by "0.1" - */ - -#define GF_PROTOCOL_VERSION "3.0" - -struct gf_stat { - uint64_t ino; - uint64_t size; - uint64_t blocks; - uint64_t dev; - uint32_t rdev; - uint32_t mode; - uint32_t nlink; - uint32_t uid; - uint32_t gid; - uint32_t blksize; - uint32_t atime; - uint32_t atime_nsec; - uint32_t mtime ; - uint32_t mtime_nsec; - uint32_t ctime; - uint32_t ctime_nsec; -} __attribute__((packed)); - - -static inline void -gf_stat_to_stat (struct gf_stat *gf_stat, struct stat *stat) -{ - stat->st_dev = ntoh64 (gf_stat->dev); - stat->st_ino = ntoh64 (gf_stat->ino); - stat->st_mode = ntoh32 (gf_stat->mode); - stat->st_nlink = ntoh32 (gf_stat->nlink); - stat->st_uid = ntoh32 (gf_stat->uid); - stat->st_gid = ntoh32 (gf_stat->gid); - stat->st_rdev = ntoh32 (gf_stat->rdev); - stat->st_size = ntoh64 (gf_stat->size); - stat->st_blksize = ntoh32 (gf_stat->blksize); - stat->st_blocks = ntoh64 (gf_stat->blocks); - stat->st_atime = ntoh32 (gf_stat->atime); - stat->st_mtime = ntoh32 (gf_stat->mtime); - stat->st_ctime = ntoh32 (gf_stat->ctime); - ST_ATIM_NSEC_SET(stat, ntoh32 (gf_stat->atime_nsec)); - ST_MTIM_NSEC_SET(stat, ntoh32 (gf_stat->mtime_nsec)); - ST_CTIM_NSEC_SET(stat, ntoh32 (gf_stat->ctime_nsec)); -} - - -static inline void -gf_stat_from_stat (struct gf_stat *gf_stat, struct stat *stat) -{ - gf_stat->dev = hton64 (stat->st_dev); - gf_stat->ino = hton64 (stat->st_ino); - gf_stat->mode = hton32 (stat->st_mode); - gf_stat->nlink = hton32 (stat->st_nlink); - gf_stat->uid = hton32 (stat->st_uid); - gf_stat->gid = hton32 (stat->st_gid); - gf_stat->rdev = hton32 (stat->st_rdev); - gf_stat->size = hton64 (stat->st_size); - gf_stat->blksize = 
hton32 (stat->st_blksize); - gf_stat->blocks = hton64 (stat->st_blocks); - gf_stat->atime = hton32 (stat->st_atime); - gf_stat->mtime = hton32 (stat->st_mtime); - gf_stat->ctime = hton32 (stat->st_ctime); - gf_stat->atime_nsec = hton32 (ST_ATIM_NSEC(stat)); - gf_stat->mtime_nsec = hton32 (ST_MTIM_NSEC(stat)); - gf_stat->ctime_nsec = hton32 (ST_CTIM_NSEC(stat)); -} - - -static inline void -gf_stat_to_iatt (struct gf_stat *gf_stat, struct iatt *iatt) -{ - iatt->ia_ino = ntoh64 (gf_stat->ino); - iatt->ia_dev = ntoh64 (gf_stat->dev); - iatt->ia_type = ia_type_from_st_mode (ntoh32 (gf_stat->mode)); - iatt->ia_prot = ia_prot_from_st_mode (ntoh32 (gf_stat->mode)); - iatt->ia_nlink = ntoh32 (gf_stat->nlink); - iatt->ia_uid = ntoh32 (gf_stat->uid); - iatt->ia_gid = ntoh32 (gf_stat->gid); - iatt->ia_rdev = ntoh64 (gf_stat->rdev); - iatt->ia_size = ntoh64 (gf_stat->size); - iatt->ia_blksize = ntoh32 (gf_stat->blksize); - iatt->ia_blocks = ntoh64 (gf_stat->blocks); - iatt->ia_atime = ntoh32 (gf_stat->atime); - iatt->ia_atime_nsec = ntoh32 (gf_stat->atime_nsec); - iatt->ia_mtime = ntoh32 (gf_stat->mtime); - iatt->ia_mtime_nsec = ntoh32 (gf_stat->mtime_nsec); - iatt->ia_ctime = ntoh32 (gf_stat->ctime); - iatt->ia_ctime_nsec = ntoh32 (gf_stat->ctime_nsec); - - iatt->ia_gen = ntoh64 (gf_stat->dev); -} - - -static inline void -gf_stat_from_iatt (struct gf_stat *gf_stat, struct iatt *iatt) -{ - gf_stat->ino = hton64 (iatt->ia_ino); - gf_stat->dev = hton64 (iatt->ia_dev); - gf_stat->mode = hton32 (st_mode_from_ia (iatt->ia_prot, - iatt->ia_type)); - gf_stat->nlink = hton32 (iatt->ia_nlink); - gf_stat->uid = hton32 (iatt->ia_uid); - gf_stat->gid = hton32 (iatt->ia_gid); - gf_stat->rdev = hton32 (iatt->ia_rdev); - gf_stat->size = hton64 (iatt->ia_size); - gf_stat->blksize = hton32 (iatt->ia_blksize); - gf_stat->blocks = hton64 (iatt->ia_blocks); - gf_stat->atime = hton32 (iatt->ia_atime); - gf_stat->atime_nsec = hton32 (iatt->ia_atime_nsec); - gf_stat->mtime = hton32 (iatt->ia_mtime); 
- gf_stat->mtime_nsec = hton32 (iatt->ia_mtime_nsec); - gf_stat->ctime = hton32 (iatt->ia_ctime); - gf_stat->ctime_nsec = hton32 (iatt->ia_ctime_nsec); - - gf_stat->dev = hton64 (iatt->ia_gen); - -} - - -struct gf_statfs { - uint64_t bsize; - uint64_t frsize; - uint64_t blocks; - uint64_t bfree; - uint64_t bavail; - uint64_t files; - uint64_t ffree; - uint64_t favail; - uint64_t fsid; - uint64_t flag; - uint64_t namemax; -} __attribute__((packed)); - - -static inline void -gf_statfs_to_statfs (struct gf_statfs *gf_stat, struct statvfs *stat) -{ - stat->f_bsize = ntoh64 (gf_stat->bsize); - stat->f_frsize = ntoh64 (gf_stat->frsize); - stat->f_blocks = ntoh64 (gf_stat->blocks); - stat->f_bfree = ntoh64 (gf_stat->bfree); - stat->f_bavail = ntoh64 (gf_stat->bavail); - stat->f_files = ntoh64 (gf_stat->files); - stat->f_ffree = ntoh64 (gf_stat->ffree); - stat->f_favail = ntoh64 (gf_stat->favail); - stat->f_fsid = ntoh64 (gf_stat->fsid); - stat->f_flag = ntoh64 (gf_stat->flag); - stat->f_namemax = ntoh64 (gf_stat->namemax); -} - - -static inline void -gf_statfs_from_statfs (struct gf_statfs *gf_stat, struct statvfs *stat) -{ - gf_stat->bsize = hton64 (stat->f_bsize); - gf_stat->frsize = hton64 (stat->f_frsize); - gf_stat->blocks = hton64 (stat->f_blocks); - gf_stat->bfree = hton64 (stat->f_bfree); - gf_stat->bavail = hton64 (stat->f_bavail); - gf_stat->files = hton64 (stat->f_files); - gf_stat->ffree = hton64 (stat->f_ffree); - gf_stat->favail = hton64 (stat->f_favail); - gf_stat->fsid = hton64 (stat->f_fsid); - gf_stat->flag = hton64 (stat->f_flag); - gf_stat->namemax = hton64 (stat->f_namemax); -} - - -struct gf_flock { - uint16_t type; - uint16_t whence; - uint64_t start; - uint64_t len; - uint32_t pid; -} __attribute__((packed)); - - -static inline void -gf_flock_to_flock (struct gf_flock *gf_flock, struct flock *flock) -{ - flock->l_type = ntoh16 (gf_flock->type); - flock->l_whence = ntoh16 (gf_flock->whence); - flock->l_start = ntoh64 (gf_flock->start); - 
flock->l_len = ntoh64 (gf_flock->len); - flock->l_pid = ntoh32 (gf_flock->pid); -} - - -static inline void -gf_flock_from_flock (struct gf_flock *gf_flock, struct flock *flock) -{ - gf_flock->type = hton16 (flock->l_type); - gf_flock->whence = hton16 (flock->l_whence); - gf_flock->start = hton64 (flock->l_start); - gf_flock->len = hton64 (flock->l_len); - gf_flock->pid = hton32 (flock->l_pid); -} - - -struct gf_timespec { - uint32_t tv_sec; - uint32_t tv_nsec; -} __attribute__((packed)); - - -static inline void -gf_timespec_to_timespec (struct gf_timespec *gf_ts, struct timespec *ts) -{ - - ts[0].tv_sec = ntoh32 (gf_ts[0].tv_sec); - ts[0].tv_nsec = ntoh32 (gf_ts[0].tv_nsec); - ts[1].tv_sec = ntoh32 (gf_ts[1].tv_sec); - ts[1].tv_nsec = ntoh32 (gf_ts[1].tv_nsec); -} - - -static inline void -gf_timespec_from_timespec (struct gf_timespec *gf_ts, struct timespec *ts) -{ - gf_ts[0].tv_sec = hton32 (ts[0].tv_sec); - gf_ts[0].tv_nsec = hton32 (ts[0].tv_nsec); - gf_ts[1].tv_sec = hton32 (ts[1].tv_sec); - gf_ts[1].tv_nsec = hton32 (ts[1].tv_nsec); -} - - -#define GF_O_ACCMODE 003 -#define GF_O_RDONLY 00 -#define GF_O_WRONLY 01 -#define GF_O_RDWR 02 -#define GF_O_CREAT 0100 -#define GF_O_EXCL 0200 -#define GF_O_NOCTTY 0400 -#define GF_O_TRUNC 01000 -#define GF_O_APPEND 02000 -#define GF_O_NONBLOCK 04000 -#define GF_O_SYNC 010000 -#define GF_O_ASYNC 020000 - -#define GF_O_DIRECT 040000 -#define GF_O_DIRECTORY 0200000 -#define GF_O_NOFOLLOW 0400000 -#define GF_O_NOATIME 01000000 -#define GF_O_CLOEXEC 02000000 - -#define GF_O_LARGEFILE 0100000 - -#define XLATE_BIT(from, to, bit) do { \ - if (from & bit) \ - to = to | GF_##bit; \ - } while (0) - -#define UNXLATE_BIT(from, to, bit) do { \ - if (from & GF_##bit) \ - to = to | bit; \ - } while (0) - -#define XLATE_ACCESSMODE(from, to) do { \ - switch (from & O_ACCMODE) { \ - case O_RDONLY: to |= GF_O_RDONLY; \ - break; \ - case O_WRONLY: to |= GF_O_WRONLY; \ - break; \ - case O_RDWR: to |= GF_O_RDWR; \ - break; \ - } \ - } while (0) 
- -#define UNXLATE_ACCESSMODE(from, to) do { \ - switch (from & GF_O_ACCMODE) { \ - case GF_O_RDONLY: to |= O_RDONLY; \ - break; \ - case GF_O_WRONLY: to |= O_WRONLY; \ - break; \ - case GF_O_RDWR: to |= O_RDWR; \ - break; \ - } \ - } while (0) - -static inline uint32_t -gf_flags_from_flags (uint32_t flags) -{ - uint32_t gf_flags = 0; - - XLATE_ACCESSMODE (flags, gf_flags); - - XLATE_BIT (flags, gf_flags, O_CREAT); - XLATE_BIT (flags, gf_flags, O_EXCL); - XLATE_BIT (flags, gf_flags, O_NOCTTY); - XLATE_BIT (flags, gf_flags, O_TRUNC); - XLATE_BIT (flags, gf_flags, O_APPEND); - XLATE_BIT (flags, gf_flags, O_NONBLOCK); - XLATE_BIT (flags, gf_flags, O_SYNC); - XLATE_BIT (flags, gf_flags, O_ASYNC); - - XLATE_BIT (flags, gf_flags, O_DIRECT); - XLATE_BIT (flags, gf_flags, O_DIRECTORY); - XLATE_BIT (flags, gf_flags, O_NOFOLLOW); -#ifdef O_NOATIME - XLATE_BIT (flags, gf_flags, O_NOATIME); -#endif -#ifdef O_CLOEXEC - XLATE_BIT (flags, gf_flags, O_CLOEXEC); -#endif - XLATE_BIT (flags, gf_flags, O_LARGEFILE); - - return gf_flags; -} - -static inline uint32_t -gf_flags_to_flags (uint32_t gf_flags) -{ - uint32_t flags = 0; - - UNXLATE_ACCESSMODE (gf_flags, flags); - - UNXLATE_BIT (gf_flags, flags, O_CREAT); - UNXLATE_BIT (gf_flags, flags, O_EXCL); - UNXLATE_BIT (gf_flags, flags, O_NOCTTY); - UNXLATE_BIT (gf_flags, flags, O_TRUNC); - UNXLATE_BIT (gf_flags, flags, O_APPEND); - UNXLATE_BIT (gf_flags, flags, O_NONBLOCK); - UNXLATE_BIT (gf_flags, flags, O_SYNC); - UNXLATE_BIT (gf_flags, flags, O_ASYNC); - - UNXLATE_BIT (gf_flags, flags, O_DIRECT); - UNXLATE_BIT (gf_flags, flags, O_DIRECTORY); - UNXLATE_BIT (gf_flags, flags, O_NOFOLLOW); -#ifdef O_NOATIME - UNXLATE_BIT (gf_flags, flags, O_NOATIME); -#endif -#ifdef O_CLOEXEC - UNXLATE_BIT (gf_flags, flags, O_CLOEXEC); -#endif - UNXLATE_BIT (gf_flags, flags, O_LARGEFILE); - - return flags; -} - - -typedef struct { - uint64_t ino; - uint64_t gen; - char path[0]; /* NULL terminated */ -} __attribute__((packed)) gf_fop_stat_req_t;; -typedef 
struct { - struct gf_stat stat; -} __attribute__((packed)) gf_fop_stat_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t size; - char path[0]; /* NULL terminated */ -} __attribute__((packed)) gf_fop_readlink_req_t; -typedef struct { - struct gf_stat buf; - char path[0]; /* NULL terminated */ -} __attribute__((packed)) gf_fop_readlink_rsp_t; - - -typedef struct { - uint64_t par; - uint64_t gen; - uint64_t dev; - uint32_t mode; - char path[0]; /* NULL terminated */ - char bname[0]; /* NULL terminated */ -} __attribute__((packed)) gf_fop_mknod_req_t; -typedef struct { - struct gf_stat stat; - struct gf_stat preparent; - struct gf_stat postparent; -} __attribute__((packed)) gf_fop_mknod_rsp_t; - - -typedef struct { - uint64_t par; - uint64_t gen; - uint32_t mode; - char path[0]; /* NULL terminated */ - char bname[0]; /* NULL terminated */ -} __attribute__((packed)) gf_fop_mkdir_req_t; -typedef struct { - struct gf_stat stat; - struct gf_stat preparent; - struct gf_stat postparent; -} __attribute__((packed)) gf_fop_mkdir_rsp_t; - - -typedef struct { - uint64_t par; - uint64_t gen; - char path[0]; /* NULL terminated */ - char bname[0]; /* NULL terminated */ -} __attribute__((packed)) gf_fop_unlink_req_t; -typedef struct { - struct gf_stat preparent; - struct gf_stat postparent; -} __attribute__((packed)) gf_fop_unlink_rsp_t; - - -typedef struct { - uint64_t par; - uint64_t gen; - char path[0]; - char bname[0]; /* NULL terminated */ -} __attribute__((packed)) gf_fop_rmdir_req_t; -typedef struct { - struct gf_stat preparent; - struct gf_stat postparent; -} __attribute__((packed)) gf_fop_rmdir_rsp_t; - - -typedef struct { - uint64_t par; - uint64_t gen; - char path[0]; - char bname[0]; - char linkname[0]; -} __attribute__((packed)) gf_fop_symlink_req_t; -typedef struct { - struct gf_stat stat; - struct gf_stat preparent; - struct gf_stat postparent; -}__attribute__((packed)) gf_fop_symlink_rsp_t; - - -typedef struct { - uint64_t oldpar; - uint64_t 
oldgen; - uint64_t newpar; - uint64_t newgen; - char oldpath[0]; - char oldbname[0]; /* NULL terminated */ - char newpath[0]; - char newbname[0]; /* NULL terminated */ -} __attribute__((packed)) gf_fop_rename_req_t; -typedef struct { - struct gf_stat stat; - struct gf_stat preoldparent; - struct gf_stat postoldparent; - struct gf_stat prenewparent; - struct gf_stat postnewparent; -} __attribute__((packed)) gf_fop_rename_rsp_t; - - -typedef struct { - uint64_t oldino; - uint64_t oldgen; - uint64_t newpar; - uint64_t newgen; - char oldpath[0]; - char newpath[0]; - char newbname[0]; -}__attribute__((packed)) gf_fop_link_req_t; -typedef struct { - struct gf_stat stat; - struct gf_stat preparent; - struct gf_stat postparent; -} __attribute__((packed)) gf_fop_link_rsp_t; - -typedef struct { - uint64_t ino; - uint64_t gen; - uint64_t offset; - char path[0]; -} __attribute__((packed)) gf_fop_truncate_req_t; -typedef struct { - struct gf_stat prestat; - struct gf_stat poststat; -} __attribute__((packed)) gf_fop_truncate_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t flags; - uint32_t wbflags; - char path[0]; -} __attribute__((packed)) gf_fop_open_req_t; -typedef struct { - int64_t fd; -} __attribute__((packed)) gf_fop_open_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint64_t offset; - uint32_t size; -} __attribute__((packed)) gf_fop_read_req_t; -typedef struct { - struct gf_stat stat; - char buf[0]; -} __attribute__((packed)) gf_fop_read_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint64_t offset; - uint32_t size; -} __attribute__((packed)) gf_fop_write_req_t; -typedef struct { - struct gf_stat prestat; - struct gf_stat poststat; -} __attribute__((packed)) gf_fop_write_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - char path[0]; -} __attribute__((packed)) gf_fop_statfs_req_t; -typedef struct { - struct gf_statfs statfs; -} __attribute__((packed)) gf_fop_statfs_rsp_t; - 
- -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; -} __attribute__((packed)) gf_fop_flush_req_t; -typedef struct { } __attribute__((packed)) gf_fop_flush_rsp_t; - - -typedef struct fsync_req { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint32_t data; -} __attribute__((packed)) gf_fop_fsync_req_t; -typedef struct { - struct gf_stat prestat; - struct gf_stat poststat; -} __attribute__((packed)) gf_fop_fsync_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t flags; - uint32_t dict_len; - char dict[0]; - char path[0]; -} __attribute__((packed)) gf_fop_setxattr_req_t; -typedef struct { } __attribute__((packed)) gf_fop_setxattr_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint32_t flags; - uint32_t dict_len; - char dict[0]; -} __attribute__((packed)) gf_fop_fsetxattr_req_t; -typedef struct { } __attribute__((packed)) gf_fop_fsetxattr_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t flags; - uint32_t dict_len; - char dict[0]; - char path[0]; -} __attribute__((packed)) gf_fop_xattrop_req_t; - -typedef struct { - uint32_t dict_len; - char dict[0]; -} __attribute__((packed)) gf_fop_xattrop_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint32_t flags; - uint32_t dict_len; - char dict[0]; -} __attribute__((packed)) gf_fop_fxattrop_req_t; - -typedef struct { - uint32_t dict_len; - char dict[0]; -} __attribute__((packed)) gf_fop_fxattrop_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t namelen; - char path[0]; - char name[0]; -} __attribute__((packed)) gf_fop_getxattr_req_t; -typedef struct { - uint32_t dict_len; - char dict[0]; -} __attribute__((packed)) gf_fop_getxattr_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint32_t namelen; - char name[0]; -} __attribute__((packed)) gf_fop_fgetxattr_req_t; -typedef struct { - uint32_t dict_len; - char dict[0]; -} __attribute__((packed)) 
gf_fop_fgetxattr_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - char path[0]; - char name[0]; -} __attribute__((packed)) gf_fop_removexattr_req_t; -typedef struct { } __attribute__((packed)) gf_fop_removexattr_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - char path[0]; -} __attribute__((packed)) gf_fop_opendir_req_t; -typedef struct { - int64_t fd; -} __attribute__((packed)) gf_fop_opendir_rsp_t; - - -typedef struct fsyncdir_req { - uint64_t ino; - uint64_t gen; - int64_t fd; - int32_t data; -} __attribute__((packed)) gf_fop_fsyncdir_req_t; -typedef struct { -} __attribute__((packed)) gf_fop_fsyncdir_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint64_t offset; - uint32_t size; -} __attribute__((packed)) gf_fop_readdir_req_t; -typedef struct { - uint32_t size; - char buf[0]; -} __attribute__((packed)) gf_fop_readdir_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint64_t offset; - uint32_t size; -} __attribute__((packed)) gf_fop_readdirp_req_t; -typedef struct { - uint32_t size; - char buf[0]; -} __attribute__((packed)) gf_fop_readdirp_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t mask; - char path[0]; -} __attribute__((packed)) gf_fop_access_req_t; -typedef struct { -} __attribute__((packed)) gf_fop_access_rsp_t; - - -typedef struct { - uint64_t par; - uint64_t gen; - uint32_t flags; - uint32_t mode; - char path[0]; - char bname[0]; -} __attribute__((packed)) gf_fop_create_req_t; -typedef struct { - struct gf_stat stat; - uint64_t fd; - struct gf_stat preparent; - struct gf_stat postparent; -} __attribute__((packed)) gf_fop_create_rsp_t; - - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint64_t offset; -} __attribute__((packed)) gf_fop_ftruncate_req_t; -typedef struct { - struct gf_stat prestat; - struct gf_stat poststat; -} __attribute__((packed)) gf_fop_ftruncate_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t 
gen; - int64_t fd; -} __attribute__((packed)) gf_fop_fstat_req_t; -typedef struct { - struct gf_stat stat; -} __attribute__((packed)) gf_fop_fstat_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint32_t cmd; - uint32_t type; - struct gf_flock flock; -} __attribute__((packed)) gf_fop_lk_req_t; -typedef struct { - struct gf_flock flock; -} __attribute__((packed)) gf_fop_lk_rsp_t; - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t cmd; - uint32_t type; - struct gf_flock flock; - char path[0]; - char volume[0]; -} __attribute__((packed)) gf_fop_inodelk_req_t; -typedef struct { -} __attribute__((packed)) gf_fop_inodelk_rsp_t; - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint32_t cmd; - uint32_t type; - struct gf_flock flock; - char volume[0]; -} __attribute__((packed)) gf_fop_finodelk_req_t; -typedef struct { -} __attribute__((packed)) gf_fop_finodelk_rsp_t; - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t cmd; - uint32_t type; - uint64_t namelen; - char path[0]; - char name[0]; - char volume[0]; -} __attribute__((packed)) gf_fop_entrylk_req_t; -typedef struct { -} __attribute__((packed)) gf_fop_entrylk_rsp_t; - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; - uint32_t cmd; - uint32_t type; - uint64_t namelen; - char name[0]; - char volume[0]; -} __attribute__((packed)) gf_fop_fentrylk_req_t; -typedef struct { -} __attribute__((packed)) gf_fop_fentrylk_rsp_t; - -typedef struct { - uint64_t ino; /* NOTE: used only in case of 'root' lookup */ - uint64_t par; - uint64_t gen; - uint32_t flags; - uint32_t dictlen; - char path[0]; - char bname[0]; - char dict[0]; -} __attribute__((packed)) gf_fop_lookup_req_t; -typedef struct { - struct gf_stat stat; - struct gf_stat postparent; - uint32_t dict_len; - char dict[0]; -} __attribute__((packed)) gf_fop_lookup_rsp_t; - -typedef struct { - uint64_t ino; - uint64_t gen; - uint32_t flag; - char path[0]; -} __attribute__((packed)) 
gf_fop_checksum_req_t; -typedef struct { - unsigned char fchecksum[0]; - unsigned char dchecksum[0]; -} __attribute__((packed)) gf_fop_checksum_rsp_t; - -typedef struct { - uint64_t ino; - uint64_t gen; - struct gf_stat stbuf; - int32_t valid; - char path[0]; -} __attribute__((packed)) gf_fop_setattr_req_t; -typedef struct { - struct gf_stat statpre; - struct gf_stat statpost; -} __attribute__((packed)) gf_fop_setattr_rsp_t; - -typedef struct { - int64_t fd; - struct gf_stat stbuf; - int32_t valid; -} __attribute__((packed)) gf_fop_fsetattr_req_t; -typedef struct { - struct gf_stat statpre; - struct gf_stat statpost; -} __attribute__((packed)) gf_fop_fsetattr_rsp_t; - -typedef struct { - int64_t fd; - uint64_t offset; - uint32_t len; -} __attribute__((packed)) gf_fop_rchecksum_req_t; -typedef struct { - uint32_t weak_checksum; - unsigned char strong_checksum[0]; -} __attribute__((packed)) gf_fop_rchecksum_rsp_t; - -typedef struct { - uint32_t flags; - uint32_t keylen; - char key[0]; -} __attribute__((packed)) gf_mop_getspec_req_t; -typedef struct { - char spec[0]; -} __attribute__((packed)) gf_mop_getspec_rsp_t; - - -typedef struct { - uint32_t msglen; - char msg[0]; -} __attribute__((packed)) gf_mop_log_req_t; -typedef struct { -} __attribute__((packed)) gf_mop_log_rsp_t; - - -typedef struct { - uint32_t dict_len; - char buf[0]; -} __attribute__((packed)) gf_mop_setvolume_req_t; -typedef struct { - uint32_t dict_len; - char buf[0]; -} __attribute__((packed)) gf_mop_setvolume_rsp_t; - - -typedef struct { -} __attribute__((packed)) gf_mop_ping_req_t; -typedef struct { -} __attribute__((packed)) gf_mop_ping_rsp_t; - -typedef struct { - uint32_t flags; - char buf[0]; -} __attribute__((packed)) gf_mop_notify_req_t; -typedef struct { - uint32_t flags; - char buf[0]; -} __attribute__((packed)) gf_mop_notify_rsp_t; - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; -} __attribute__((packed)) gf_cbk_releasedir_req_t; -typedef struct { -} 
__attribute__((packed)) gf_cbk_releasedir_rsp_t; - - -typedef struct { - uint64_t ino; - uint64_t gen; - int64_t fd; -} __attribute__((packed)) gf_cbk_release_req_t; -typedef struct { -} __attribute__((packed)) gf_cbk_release_rsp_t; - - -typedef struct { - uint32_t count; - uint64_t ino_array[0]; -} __attribute__((packed)) gf_cbk_forget_req_t; -typedef struct { } __attribute__((packed)) gf_cbk_forget_rsp_t; - - -typedef struct { - uint32_t pid; - uint32_t uid; - uint32_t gid; - - /* Number of groups being sent through the array above. */ - uint32_t ngrps; - - /* Array of groups to which the uid belongs apart from the primary group - * in gid. - */ - uint32_t groups[GF_REQUEST_MAXGROUPS]; - - uint64_t lk_owner; -} __attribute__ ((packed)) gf_hdr_req_t; - - -typedef struct { - uint32_t op_ret; - uint32_t op_errno; -} __attribute__ ((packed)) gf_hdr_rsp_t; - - -typedef struct { - uint64_t callid; - uint32_t type; - uint32_t op; - uint32_t size; - union { - gf_hdr_req_t req; - gf_hdr_rsp_t rsp; - } __attribute__ ((packed)); -} __attribute__ ((packed)) gf_hdr_common_t; - - -static inline gf_hdr_common_t * -__gf_hdr_new (int size) -{ - gf_hdr_common_t *hdr = NULL; - - /* TODO: use mem-pool */ - hdr = GF_CALLOC (sizeof (gf_hdr_common_t) + size, 1, - gf_common_mt_gf_hdr_common_t); - - if (!hdr) { - return NULL; - } - - hdr->size = hton32 (size); - - return hdr; -} - - -#define gf_hdr_len(type, x) (sizeof (gf_hdr_common_t) + sizeof (*type) + x) -#define gf_hdr_new(type, x) __gf_hdr_new (sizeof (*type) + x) - - -static inline void * -gf_param (gf_hdr_common_t *hdr) -{ - return ((void *)hdr) + sizeof (*hdr); -} - - -struct gf_dirent_nb { - uint64_t d_ino; - uint64_t d_off; - uint32_t d_len; - uint32_t d_type; - struct gf_stat d_stat; - char d_name[0]; -} __attribute__((packed)); - - -static inline int -gf_dirent_nb_size (gf_dirent_t *entries) -{ - return (sizeof (struct gf_dirent_nb) + strlen (entries->d_name) + 1); -} - -static inline int -gf_dirent_serialize (gf_dirent_t 
*entries, char *buf, size_t buf_size) -{ - struct gf_dirent_nb *entry_nb = NULL; - gf_dirent_t *entry = NULL; - int size = 0; - int entry_size = 0; - - - list_for_each_entry (entry, &entries->list, list) { - entry_size = gf_dirent_nb_size (entry); - - if (buf && (size + entry_size <= buf_size)) { - entry_nb = (void *) (buf + size); - - entry_nb->d_ino = hton64 (entry->d_ino); - entry_nb->d_off = hton64 (entry->d_off); - entry_nb->d_len = hton32 (entry->d_len); - entry_nb->d_type = hton32 (entry->d_type); - - gf_stat_from_iatt (&entry_nb->d_stat, &entry->d_stat); - - strcpy (entry_nb->d_name, entry->d_name); - } - size += entry_size; - } - - return size; -} - - -static inline int -gf_dirent_unserialize (gf_dirent_t *entries, const char *buf, size_t buf_size) -{ - struct gf_dirent_nb *entry_nb = NULL; - int remaining_size = 0; - int least_dirent_size = 0; - int count = 0; - gf_dirent_t *entry = NULL; - int entry_strlen = 0; - int entry_len = 0; - - - remaining_size = buf_size; - least_dirent_size = (sizeof (struct gf_dirent_nb) + 2); - - while (remaining_size >= least_dirent_size) { - entry_nb = (void *)(buf + (buf_size - remaining_size)); - - entry_strlen = strnlen (entry_nb->d_name, remaining_size); - if (entry_strlen == remaining_size) { - break; - } - - entry_len = sizeof (gf_dirent_t) + entry_strlen + 1; - entry = GF_CALLOC (1, entry_len, gf_common_mt_gf_dirent_t); - if (!entry) { - break; - } - - entry->d_ino = ntoh64 (entry_nb->d_ino); - entry->d_off = ntoh64 (entry_nb->d_off); - entry->d_len = ntoh32 (entry_nb->d_len); - entry->d_type = ntoh32 (entry_nb->d_type); - - gf_stat_to_iatt (&entry_nb->d_stat, &entry->d_stat); - - strcpy (entry->d_name, entry_nb->d_name); - - list_add_tail (&entry->list, &entries->list); - - remaining_size -= (sizeof (*entry_nb) + entry_strlen + 1); - count++; - } - - return count; -} - -#endif diff --git a/libglusterfs/src/transport.c b/libglusterfs/src/transport.c deleted file mode 100644 index d460d020..00000000 --- 
a/libglusterfs/src/transport.c +++ /dev/null @@ -1,422 +0,0 @@ -/* - Copyright (c) 2006-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - -#include -#include -#include -#include -#include -#include - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "logging.h" -#include "transport.h" -#include "glusterfs.h" -#include "xlator.h" -#include "list.h" - - -transport_t * -transport_load (dict_t *options, - xlator_t *xl) -{ - struct transport *trans = NULL, *return_trans = NULL; - char *name = NULL; - void *handle = NULL; - char *type = NULL; - char str[] = "ERROR"; - int32_t ret = -1; - int8_t is_tcp = 0, is_unix = 0, is_ibsdp = 0; - volume_opt_list_t *vol_opt = NULL; - - GF_VALIDATE_OR_GOTO("transport", options, fail); - GF_VALIDATE_OR_GOTO("transport", xl, fail); - - trans = GF_CALLOC (1, sizeof (struct transport), - gf_common_mt_transport); - GF_VALIDATE_OR_GOTO("transport", trans, fail); - - trans->xl = xl; - type = str; - - /* Backward compatibility */ - ret = dict_get_str (options, "transport-type", &type); - if (ret < 0) { - ret = dict_set_str (options, "transport-type", "socket"); - if (ret < 0) - gf_log ("dict", GF_LOG_DEBUG, - "setting transport-type failed"); - gf_log ("transport", GF_LOG_WARNING, - "missing 'option transport-type'. 
defaulting to " - "\"socket\""); - } else { - { - /* Backword compatibility to handle * /client, - * * /server. - */ - char *tmp = strchr (type, '/'); - if (tmp) - *tmp = '\0'; - } - - is_tcp = strcmp (type, "tcp"); - is_unix = strcmp (type, "unix"); - is_ibsdp = strcmp (type, "ib-sdp"); - if ((is_tcp == 0) || - (is_unix == 0) || - (is_ibsdp == 0)) { - if (is_unix == 0) - ret = dict_set_str (options, - "transport.address-family", - "unix"); - if (is_ibsdp == 0) - ret = dict_set_str (options, - "transport.address-family", - "inet-sdp"); - - if (ret < 0) - gf_log ("dict", GF_LOG_DEBUG, - "setting address-family failed"); - - ret = dict_set_str (options, - "transport-type", "socket"); - if (ret < 0) - gf_log ("dict", GF_LOG_DEBUG, - "setting transport-type failed"); - } - } - - ret = dict_get_str (options, "transport-type", &type); - if (ret < 0) { - GF_FREE (trans); - gf_log ("transport", GF_LOG_ERROR, - "'option transport-type ' missing in volume '%s'", - xl->name); - goto fail; - } - - ret = gf_asprintf (&name, "%s/%s.so", TRANSPORTDIR, type); - if (-1 == ret) { - gf_log ("transport", GF_LOG_ERROR, "asprintf failed"); - goto fail; - } - gf_log ("transport", GF_LOG_DEBUG, - "attempt to load file %s", name); - - handle = dlopen (name, RTLD_NOW|RTLD_GLOBAL); - if (handle == NULL) { - gf_log ("transport", GF_LOG_ERROR, "%s", dlerror ()); - gf_log ("transport", GF_LOG_ERROR, - "volume '%s': transport-type '%s' is not valid or " - "not found on this machine", - xl->name, type); - GF_FREE (name); - GF_FREE (trans); - goto fail; - } - GF_FREE (name); - - trans->ops = dlsym (handle, "tops"); - if (trans->ops == NULL) { - gf_log ("transport", GF_LOG_ERROR, - "dlsym (transport_ops) on %s", dlerror ()); - GF_FREE (trans); - goto fail; - } - - trans->init = dlsym (handle, "init"); - if (trans->init == NULL) { - gf_log ("transport", GF_LOG_ERROR, - "dlsym (gf_transport_init) on %s", dlerror ()); - GF_FREE (trans); - goto fail; - } - - trans->fini = dlsym (handle, "fini"); - if 
(trans->fini == NULL) { - gf_log ("transport", GF_LOG_ERROR, - "dlsym (gf_transport_fini) on %s", dlerror ()); - GF_FREE (trans); - goto fail; - } - - vol_opt = GF_CALLOC (1, sizeof (volume_opt_list_t), - gf_common_mt_volume_opt_list_t); - vol_opt->given_opt = dlsym (handle, "options"); - if (vol_opt->given_opt == NULL) { - gf_log ("transport", GF_LOG_DEBUG, - "volume option validation not specified"); - } else { - list_add_tail (&vol_opt->list, &xl->volume_options); - if (-1 == - validate_xlator_volume_options (xl, - vol_opt->given_opt)) { - gf_log ("transport", GF_LOG_ERROR, - "volume option validation failed"); - GF_FREE (trans); - goto fail; - } - } - - ret = trans->init (trans); - if (ret != 0) { - gf_log ("transport", GF_LOG_ERROR, - "'%s' initialization failed", type); - GF_FREE (trans); - goto fail; - } - - pthread_mutex_init (&trans->lock, NULL); - return_trans = trans; -fail: - return return_trans; -} - - -int32_t -transport_submit (transport_t *this, char *buf, int32_t len, - struct iovec *vector, int count, - struct iobref *iobref) -{ - int32_t ret = -1; - transport_t *peer_trans = NULL; - struct iobuf *iobuf = NULL; - struct transport_msg *msg = NULL; - - if (this->peer_trans) { - peer_trans = this->peer_trans; - - msg = GF_CALLOC (1, sizeof (*msg), - gf_common_mt_transport_msg); - if (!msg) { - return -ENOMEM; - } - - msg->hdr = buf; - msg->hdrlen = len; - - if (vector) { - iobuf = iobuf_get (this->xl->ctx->iobuf_pool); - if (!iobuf) { - GF_FREE (msg->hdr); - GF_FREE (msg); - return -ENOMEM; - } - - iov_unload (iobuf->ptr, vector, count); - msg->iobuf = iobuf; - } - - pthread_mutex_lock (&peer_trans->handover.mutex); - { - list_add_tail (&msg->list, &peer_trans->handover.msgs); - pthread_cond_broadcast (&peer_trans->handover.cond); - } - pthread_mutex_unlock (&peer_trans->handover.mutex); - - return 0; - } - - GF_VALIDATE_OR_GOTO("transport", this, fail); - GF_VALIDATE_OR_GOTO("transport", this->ops, fail); - - ret = this->ops->submit (this, buf, len, 
vector, count, iobref); -fail: - return ret; -} - - -int32_t -transport_connect (transport_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO("transport", this, fail); - - ret = this->ops->connect (this); -fail: - return ret; -} - - -int32_t -transport_listen (transport_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO("transport", this, fail); - - ret = this->ops->listen (this); -fail: - return ret; -} - - -int32_t -transport_disconnect (transport_t *this) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO("transport", this, fail); - - ret = this->ops->disconnect (this); -fail: - return ret; -} - - -int32_t -transport_destroy (transport_t *this) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO("transport", this, fail); - - if (this->fini) - this->fini (this); - - pthread_mutex_destroy (&this->lock); - GF_FREE (this); -fail: - return ret; -} - - -transport_t * -transport_ref (transport_t *this) -{ - transport_t *return_this = NULL; - - GF_VALIDATE_OR_GOTO("transport", this, fail); - - pthread_mutex_lock (&this->lock); - { - this->refcount ++; - } - pthread_mutex_unlock (&this->lock); - - return_this = this; -fail: - return return_this; -} - - -int32_t -transport_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, - struct iobuf **iobuf_p) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO("transport", this, fail); - - if (this->peer_trans) { - *hdr_p = this->handover.msg->hdr; - *hdrlen_p = this->handover.msg->hdrlen; - *iobuf_p = this->handover.msg->iobuf; - - return 0; - } - - ret = this->ops->receive (this, hdr_p, hdrlen_p, iobuf_p); -fail: - return ret; -} - - -int32_t -transport_unref (transport_t *this) -{ - int32_t refcount = 0; - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO("transport", this, fail); - - pthread_mutex_lock (&this->lock); - { - refcount = --this->refcount; - } - pthread_mutex_unlock (&this->lock); - - if (refcount == 0) { - xlator_notify (this->xl, GF_EVENT_TRANSPORT_CLEANUP, this); - transport_destroy (this); - } - - ret = 0; -fail: - return 
ret; -} - - -void * -transport_peerproc (void *trans_data) -{ - transport_t *trans = NULL; - struct transport_msg *msg = NULL; - - trans = trans_data; - - while (1) { - pthread_mutex_lock (&trans->handover.mutex); - { - while (list_empty (&trans->handover.msgs)) - pthread_cond_wait (&trans->handover.cond, - &trans->handover.mutex); - - msg = list_entry (trans->handover.msgs.next, - struct transport_msg, list); - - list_del_init (&msg->list); - } - pthread_mutex_unlock (&trans->handover.mutex); - - trans->handover.msg = msg; - - xlator_notify (trans->xl, GF_EVENT_POLLIN, trans); - - GF_FREE (msg); - } -} - - -int -transport_setpeer (transport_t *trans, transport_t *peer_trans) -{ - trans->peer_trans = transport_ref (peer_trans); - - INIT_LIST_HEAD (&trans->handover.msgs); - pthread_cond_init (&trans->handover.cond, NULL); - pthread_mutex_init (&trans->handover.mutex, NULL); - pthread_create (&trans->handover.thread, NULL, - transport_peerproc, trans); - - peer_trans->peer_trans = transport_ref (trans); - - INIT_LIST_HEAD (&peer_trans->handover.msgs); - pthread_cond_init (&peer_trans->handover.cond, NULL); - pthread_mutex_init (&peer_trans->handover.mutex, NULL); - pthread_create (&peer_trans->handover.thread, NULL, - transport_peerproc, peer_trans); - - return 0; -} diff --git a/libglusterfs/src/transport.h b/libglusterfs/src/transport.h deleted file mode 100644 index f0623d5b..00000000 --- a/libglusterfs/src/transport.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - Copyright (c) 2006-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - -#ifndef __TRANSPORT_H__ -#define __TRANSPORT_H__ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include - -struct transport_ops; -typedef struct transport transport_t; - -#include "xlator.h" -#include "dict.h" -#include "compat.h" - -typedef struct peer_info { - struct sockaddr_storage sockaddr; - socklen_t sockaddr_len; - char identifier[UNIX_PATH_MAX]; -}peer_info_t; - -struct transport_msg { - struct list_head list; - char *hdr; - int hdrlen; - struct iobuf *iobuf; -}; - -struct transport { - struct transport_ops *ops; - void *private; - void *xl_private; - pthread_mutex_t lock; - int32_t refcount; - - xlator_t *xl; - void *dnscache; - data_t *buf; - int32_t (*init) (transport_t *this); - void (*fini) (transport_t *this); - /* int (*notify) (transport_t *this, int event, void *data); */ - peer_info_t peerinfo; - peer_info_t myinfo; - - transport_t *peer_trans; - struct { - pthread_mutex_t mutex; - pthread_cond_t cond; - pthread_t thread; - struct list_head msgs; - struct transport_msg *msg; - } handover; - -}; - -struct transport_ops { - int32_t (*receive) (transport_t *this, char **hdr_p, size_t *hdrlen_p, - struct iobuf **iobuf_p); - int32_t (*submit) (transport_t *this, char *buf, int len, - struct iovec *vector, int count, - struct iobref *iobref); - int32_t (*connect) (transport_t *this); - int32_t (*listen) (transport_t *this); - int32_t (*disconnect) (transport_t *this); -}; - - -int32_t transport_listen (transport_t *this); -int32_t transport_connect (transport_t *this); -int32_t transport_disconnect (transport_t *this); -int32_t transport_notify (transport_t *this, int event); -int32_t transport_submit (transport_t *this, char *buf, int len, - struct iovec *vector, int count, - struct iobref *iobref); -int32_t transport_receive (transport_t *this, char 
**hdr_p, size_t *hdrlen_p, - struct iobuf **iobuf_p); -int32_t transport_destroy (transport_t *this); - -transport_t *transport_load (dict_t *options, xlator_t *xl); -transport_t *transport_ref (transport_t *trans); -int32_t transport_unref (transport_t *trans); - -int transport_setpeer (transport_t *trans, transport_t *trans_peer); - -#endif /* __TRANSPORT_H__ */ diff --git a/transport/Makefile.am b/transport/Makefile.am deleted file mode 100644 index e2f97437..00000000 --- a/transport/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = socket $(IBVERBS_SUBDIR) - -CLEANFILES = diff --git a/transport/ib-verbs/Makefile.am b/transport/ib-verbs/Makefile.am deleted file mode 100644 index f963effe..00000000 --- a/transport/ib-verbs/Makefile.am +++ /dev/null @@ -1 +0,0 @@ -SUBDIRS = src \ No newline at end of file diff --git a/transport/ib-verbs/src/Makefile.am b/transport/ib-verbs/src/Makefile.am deleted file mode 100644 index 1baf080f..00000000 --- a/transport/ib-verbs/src/Makefile.am +++ /dev/null @@ -1,15 +0,0 @@ -noinst_HEADERS = ib-verbs.h name.h - -transport_LTLIBRARIES = ib-verbs.la -transportdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/transport - -ib_verbs_la_LDFLAGS = -module -avoidversion - -ib_verbs_la_SOURCES = ib-verbs.c name.c -ib_verbs_la_LIBADD = -libverbs $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/transport/ib-verbs \ - -shared -nostartfiles $(GF_CFLAGS) - -CLEANFILES = *~ diff --git a/transport/ib-verbs/src/ib-verbs-mem-types.h b/transport/ib-verbs/src/ib-verbs-mem-types.h deleted file mode 100644 index bac55964..00000000 --- a/transport/ib-verbs/src/ib-verbs-mem-types.h +++ /dev/null @@ -1,39 +0,0 @@ - -/* - Copyright (c) 2008-2009 Gluster, Inc. - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - - -#ifndef __IB_VERBS_MEM_TYPES_H__ -#define __IB_VERBS_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_ib_verbs_mem_types_ { - gf_ibv_mt_ib_verbs_private_t = gf_common_mt_end + 1, - gf_ibv_mt_ib_verbs_ioq_t, - gf_ibv_mt_transport_t, - gf_ibv_mt_ib_verbs_local_t, - gf_ibv_mt_ib_verbs_post_t, - gf_ibv_mt_char, - gf_ibv_mt_qpent, - gf_ibv_mt_ib_verbs_device_t, - gf_ibv_mt_end -}; -#endif - diff --git a/transport/ib-verbs/src/ib-verbs.c b/transport/ib-verbs/src/ib-verbs.c deleted file mode 100644 index a252a13d..00000000 --- a/transport/ib-verbs/src/ib-verbs.c +++ /dev/null @@ -1,2613 +0,0 @@ -/* - Copyright (c) 2006-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . 
-*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "dict.h" -#include "glusterfs.h" -#include "transport.h" -#include "protocol.h" -#include "logging.h" -#include "xlator.h" -#include "name.h" -#include "ib-verbs.h" -#include - -int32_t -gf_resolve_ip6 (const char *hostname, - uint16_t port, - int family, - void **dnscache, - struct addrinfo **addr_info); - -static uint16_t -ib_verbs_get_local_lid (struct ibv_context *context, - int32_t port) -{ - struct ibv_port_attr attr; - - if (ibv_query_port (context, port, &attr)) - return 0; - - return attr.lid; -} - -static const char * -get_port_state_str(enum ibv_port_state pstate) -{ - switch (pstate) { - case IBV_PORT_DOWN: return "PORT_DOWN"; - case IBV_PORT_INIT: return "PORT_INIT"; - case IBV_PORT_ARMED: return "PORT_ARMED"; - case IBV_PORT_ACTIVE: return "PORT_ACTIVE"; - case IBV_PORT_ACTIVE_DEFER: return "PORT_ACTIVE_DEFER"; - default: return "invalid state"; - } -} - -static int32_t -ib_check_active_port (struct ibv_context *ctx, uint8_t port) -{ - struct ibv_port_attr port_attr; - - int32_t ret = 0; - const char *state_str = NULL; - - if (!ctx) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "Error in supplied context"); - return -1; - } - - ret = ibv_query_port (ctx, port, &port_attr); - - if (ret) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "Failed to query port %u properties", port); - return -1; - } - - state_str = get_port_state_str (port_attr.state); - gf_log ("transport/ib-verbs", GF_LOG_TRACE, - "Infiniband PORT: (%u) STATE: (%s)", - port, state_str); - - if (port_attr.state == IBV_PORT_ACTIVE) - return 0; - - return -1; -} - -static int32_t -ib_get_active_port (struct ibv_context *ib_ctx) -{ - struct ibv_device_attr ib_device_attr; - - int32_t ret = -1; - uint8_t ib_port = 0; - - if (!ib_ctx) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "Error in supplied context"); - return -1; - } - if (ibv_query_device (ib_ctx, &ib_device_attr)) { - gf_log 
("transport/ib-verbs", GF_LOG_ERROR, - "Failed to query device properties"); - return -1; - } - - for (ib_port = 1; ib_port <= ib_device_attr.phys_port_cnt; ++ib_port) { - ret = ib_check_active_port (ib_ctx, ib_port); - if (ret == 0) - return ib_port; - - gf_log ("transport/ib-verbs", GF_LOG_TRACE, - "Port:(%u) not active", ib_port); - continue; - } - return ret; -} - - - -static void -ib_verbs_put_post (ib_verbs_queue_t *queue, - ib_verbs_post_t *post) -{ - pthread_mutex_lock (&queue->lock); - if (post->prev) { - queue->active_count--; - post->prev->next = post->next; - } - if (post->next) - post->next->prev = post->prev; - post->prev = &queue->passive_posts; - post->next = post->prev->next; - post->prev->next = post; - post->next->prev = post; - queue->passive_count++; - pthread_mutex_unlock (&queue->lock); -} - - -static ib_verbs_post_t * -ib_verbs_new_post (ib_verbs_device_t *device, int32_t len) -{ - ib_verbs_post_t *post; - - post = (ib_verbs_post_t *) GF_CALLOC (1, sizeof (*post), - gf_ibv_mt_ib_verbs_post_t); - if (!post) - return NULL; - - post->buf_size = len; - - post->buf = valloc (len); - if (!post->buf) { - GF_FREE (post); - return NULL; - } - - post->mr = ibv_reg_mr (device->pd, - post->buf, - post->buf_size, - IBV_ACCESS_LOCAL_WRITE); - if (!post->mr) { - free (post->buf); - GF_FREE (post); - return NULL; - } - - return post; -} - - -static ib_verbs_post_t * -ib_verbs_get_post (ib_verbs_queue_t *queue) -{ - ib_verbs_post_t *post; - - pthread_mutex_lock (&queue->lock); - { - post = queue->passive_posts.next; - if (post == &queue->passive_posts) - post = NULL; - - if (post) { - if (post->prev) - post->prev->next = post->next; - if (post->next) - post->next->prev = post->prev; - post->prev = &queue->active_posts; - post->next = post->prev->next; - post->prev->next = post; - post->next->prev = post; - post->reused++; - queue->active_count++; - } - } - pthread_mutex_unlock (&queue->lock); - - return post; -} - -void -ib_verbs_destroy_post 
(ib_verbs_post_t *post) -{ - ibv_dereg_mr (post->mr); - free (post->buf); - GF_FREE (post); -} - - -static int32_t -__ib_verbs_quota_get (ib_verbs_peer_t *peer) -{ - int32_t ret = -1; - ib_verbs_private_t *priv = peer->trans->private; - - if (priv->connected && peer->quota > 0) { - ret = peer->quota--; - } - - return ret; -} - -/* - static int32_t - ib_verbs_quota_get (ib_verbs_peer_t *peer) - { - int32_t ret = -1; - ib_verbs_private_t *priv = peer->trans->private; - - pthread_mutex_lock (&priv->write_mutex); - { - ret = __ib_verbs_quota_get (peer); - } - pthread_mutex_unlock (&priv->write_mutex); - - return ret; - } -*/ - -static void -__ib_verbs_ioq_entry_free (ib_verbs_ioq_t *entry) -{ - list_del_init (&entry->list); - if (entry->iobref) - iobref_unref (entry->iobref); - - /* TODO: use mem-pool */ - GF_FREE (entry->buf); - - /* TODO: use mem-pool */ - GF_FREE (entry); -} - - -static void -__ib_verbs_ioq_flush (ib_verbs_peer_t *peer) -{ - ib_verbs_ioq_t *entry = NULL, *dummy = NULL; - - list_for_each_entry_safe (entry, dummy, &peer->ioq, list) { - __ib_verbs_ioq_entry_free (entry); - } -} - - -static int32_t -__ib_verbs_disconnect (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - int32_t ret = 0; - - if (priv->connected || priv->tcp_connected) { - fcntl (priv->sock, F_SETFL, O_NONBLOCK); - if (shutdown (priv->sock, SHUT_RDWR) != 0) { - gf_log ("transport/ib-verbs", - GF_LOG_DEBUG, - "shutdown () - error: %s", - strerror (errno)); - ret = -errno; - priv->tcp_connected = 0; - } - } - - return ret; -} - - -static int32_t -ib_verbs_post_send (struct ibv_qp *qp, - ib_verbs_post_t *post, - int32_t len) -{ - struct ibv_sge list = { - .addr = (unsigned long) post->buf, - .length = len, - .lkey = post->mr->lkey - }; - - struct ibv_send_wr wr = { - .wr_id = (unsigned long) post, - .sg_list = &list, - .num_sge = 1, - .opcode = IBV_WR_SEND, - .send_flags = IBV_SEND_SIGNALED, - }, *bad_wr; - - if (!qp) - return -1; - - return ibv_post_send (qp, &wr, 
&bad_wr); -} - - -static int32_t -__ib_verbs_ioq_churn_entry (ib_verbs_peer_t *peer, ib_verbs_ioq_t *entry) -{ - int32_t ret = 0, quota = 0; - ib_verbs_private_t *priv = peer->trans->private; - ib_verbs_device_t *device = priv->device; - ib_verbs_options_t *options = &priv->options; - ib_verbs_post_t *post = NULL; - int32_t len = 0; - - quota = __ib_verbs_quota_get (peer); - if (quota > 0) { - post = ib_verbs_get_post (&device->sendq); - if (!post) - post = ib_verbs_new_post (device, - (options->send_size + 2048)); - - len = iov_length ((const struct iovec *)&entry->vector, - entry->count); - if (len >= (options->send_size + 2048)) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "increase value of option 'transport.ib-verbs." - "work-request-send-size' (given=> %"PRId64") " - "to send bigger (%d) messages", - (options->send_size + 2048), len); - return -1; - } - - iov_unload (post->buf, - (const struct iovec *)&entry->vector, - entry->count); - - ret = ib_verbs_post_send (peer->qp, post, len); - if (!ret) { - __ib_verbs_ioq_entry_free (entry); - ret = len; - } else { - gf_log ("transport/ib-verbs", GF_LOG_DEBUG, - "ibv_post_send failed with ret = %d", ret); - ib_verbs_put_post (&device->sendq, post); - __ib_verbs_disconnect (peer->trans); - ret = -1; - } - } - - return ret; -} - - -static int32_t -__ib_verbs_ioq_churn (ib_verbs_peer_t *peer) -{ - ib_verbs_ioq_t *entry = NULL; - int32_t ret = 0; - - while (!list_empty (&peer->ioq)) - { - /* pick next entry */ - entry = peer->ioq_next; - - ret = __ib_verbs_ioq_churn_entry (peer, entry); - - if (ret <= 0) - break; - } - - /* - list_for_each_entry_safe (entry, dummy, &peer->ioq, list) { - ret = __ib_verbs_ioq_churn_entry (peer, entry); - if (ret <= 0) { - break; - } - } - */ - - return ret; -} - -static int32_t -__ib_verbs_quota_put (ib_verbs_peer_t *peer) -{ - int32_t ret; - - peer->quota++; - ret = peer->quota; - - if (!list_empty (&peer->ioq)) { - ret = __ib_verbs_ioq_churn (peer); - } - - return ret; -} - - 
-static int32_t -ib_verbs_quota_put (ib_verbs_peer_t *peer) -{ - int32_t ret; - ib_verbs_private_t *priv = peer->trans->private; - - pthread_mutex_lock (&priv->write_mutex); - { - ret = __ib_verbs_quota_put (peer); - } - pthread_mutex_unlock (&priv->write_mutex); - - return ret; -} - - -static int32_t -ib_verbs_post_recv (struct ibv_srq *srq, - ib_verbs_post_t *post) -{ - struct ibv_sge list = { - .addr = (unsigned long) post->buf, - .length = post->buf_size, - .lkey = post->mr->lkey - }; - - struct ibv_recv_wr wr = { - .wr_id = (unsigned long) post, - .sg_list = &list, - .num_sge = 1, - }, *bad_wr; - - return ibv_post_srq_recv (srq, &wr, &bad_wr); -} - - -static int32_t -ib_verbs_writev (transport_t *this, - ib_verbs_ioq_t *entry) -{ - int32_t ret = 0, need_append = 1; - ib_verbs_private_t *priv = this->private; - ib_verbs_peer_t *peer = NULL; - - pthread_mutex_lock (&priv->write_mutex); - { - if (!priv->connected) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "ib-verbs is not connected to post a " - "send request"); - ret = -1; - goto unlock; - } - - peer = &priv->peer; - if (list_empty (&peer->ioq)) { - ret = __ib_verbs_ioq_churn_entry (peer, entry); - if (ret != 0) { - need_append = 0; - } - } - - if (need_append) { - list_add_tail (&entry->list, &peer->ioq); - } - } -unlock: - pthread_mutex_unlock (&priv->write_mutex); - return ret; -} - - -static ib_verbs_ioq_t * -ib_verbs_ioq_new (char *buf, int len, struct iovec *vector, - int count, struct iobref *iobref) -{ - ib_verbs_ioq_t *entry = NULL; - - /* TODO: use mem-pool */ - entry = GF_CALLOC (1, sizeof (*entry), gf_ibv_mt_ib_verbs_ioq_t); - - assert (count <= (MAX_IOVEC-2)); - - entry->header.colonO[0] = ':'; - entry->header.colonO[1] = 'O'; - entry->header.colonO[2] = '\0'; - entry->header.version = 42; - entry->header.size1 = hton32 (len); - entry->header.size2 = hton32 (iov_length (vector, count)); - - entry->vector[0].iov_base = &entry->header; - entry->vector[0].iov_len = sizeof (entry->header); - 
entry->count++; - - entry->vector[1].iov_base = buf; - entry->vector[1].iov_len = len; - entry->count++; - - if (vector && count) - { - memcpy (&entry->vector[2], vector, sizeof (*vector) * count); - entry->count += count; - } - - if (iobref) - entry->iobref = iobref_ref (iobref); - - entry->buf = buf; - - INIT_LIST_HEAD (&entry->list); - - return entry; -} - - -static int32_t -ib_verbs_submit (transport_t *this, char *buf, int32_t len, - struct iovec *vector, int count, struct iobref *iobref) -{ - int32_t ret = 0; - ib_verbs_ioq_t *entry = NULL; - - entry = ib_verbs_ioq_new (buf, len, vector, count, iobref); - ret = ib_verbs_writev (this, entry); - - if (ret > 0) { - ret = 0; - } - - return ret; -} - -static int -ib_verbs_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, - struct iobuf **iobuf_p) -{ - ib_verbs_private_t *priv = this->private; - /* TODO: return error if !priv->connected, check with locks */ - /* TODO: boundry checks for data_ptr/offset */ - char *copy_from = NULL; - ib_verbs_header_t *header = NULL; - uint32_t size1, size2, data_len = 0; - char *hdr = NULL; - struct iobuf *iobuf = NULL; - int32_t ret = 0; - - pthread_mutex_lock (&priv->recv_mutex); - { -/* - while (!priv->data_ptr) - pthread_cond_wait (&priv->recv_cond, &priv->recv_mutex); -*/ - - copy_from = priv->data_ptr + priv->data_offset; - - priv->data_ptr = NULL; - data_len = priv->data_len; - pthread_cond_broadcast (&priv->recv_cond); - } - pthread_mutex_unlock (&priv->recv_mutex); - - header = (ib_verbs_header_t *)copy_from; - if (strcmp (header->colonO, ":O")) { - gf_log ("transport/ib-verbs", GF_LOG_DEBUG, - "%s: corrupt header received", this->xl->name); - ret = -1; - goto err; - } - - size1 = ntoh32 (header->size1); - size2 = ntoh32 (header->size2); - - if (data_len != (size1 + size2 + sizeof (*header))) { - gf_log ("transport/ib-verbs", GF_LOG_DEBUG, - "%s: sizeof data read from transport is not equal " - "to the size specified in the header", - this->xl->name); - ret = -1; 
- goto err; - } - - copy_from += sizeof (*header); - - if (size1) { - hdr = GF_CALLOC (1, size1, gf_ibv_mt_char); - if (!hdr) { - gf_log (this->xl->name, GF_LOG_ERROR, - "unable to allocate header for peer %s", - this->peerinfo.identifier); - ret = -ENOMEM; - goto err; - } - memcpy (hdr, copy_from, size1); - copy_from += size1; - *hdr_p = hdr; - } - *hdrlen_p = size1; - - if (size2) { - iobuf = iobuf_get (this->xl->ctx->iobuf_pool); - if (!iobuf) { - gf_log (this->xl->name, GF_LOG_ERROR, - "unable to allocate IO buffer for peer %s", - this->peerinfo.identifier); - ret = -ENOMEM; - goto err; - } - memcpy (iobuf->ptr, copy_from, size2); - *iobuf_p = iobuf; - } - -err: - return ret; -} - - -static void -ib_verbs_destroy_cq (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - ib_verbs_device_t *device = priv->device; - - if (device->recv_cq) - ibv_destroy_cq (device->recv_cq); - device->recv_cq = NULL; - - if (device->send_cq) - ibv_destroy_cq (device->send_cq); - device->send_cq = NULL; - - return; -} - - -static int32_t -ib_verbs_create_cq (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - ib_verbs_options_t *options = &priv->options; - ib_verbs_device_t *device = priv->device; - int32_t ret = 0; - - device->recv_cq = ibv_create_cq (priv->device->context, - options->recv_count * 2, - device, - device->recv_chan, - 0); - if (!device->recv_cq) { - gf_log ("transport/ib-verbs", - GF_LOG_ERROR, - "%s: creation of CQ failed", - this->xl->name); - ret = -1; - } else if (ibv_req_notify_cq (device->recv_cq, 0)) { - gf_log ("transport/ib-verbs", - GF_LOG_ERROR, - "%s: ibv_req_notify_cq on CQ failed", - this->xl->name); - ret = -1; - } - - do { - /* TODO: make send_cq size dynamically adaptive */ - device->send_cq = ibv_create_cq (priv->device->context, - options->send_count * 1024, - device, - device->send_chan, - 0); - if (!device->send_cq) { - gf_log ("transport/ib-verbs", - GF_LOG_ERROR, - "%s: creation of send_cq failed", - 
this->xl->name); - ret = -1; - break; - } - - if (ibv_req_notify_cq (device->send_cq, 0)) { - gf_log ("transport/ib-verbs", - GF_LOG_ERROR, - "%s: ibv_req_notify_cq on send_cq failed", - this->xl->name); - ret = -1; - break; - } - } while (0); - - if (ret != 0) - ib_verbs_destroy_cq (this); - - return ret; -} - - -static void -ib_verbs_register_peer (ib_verbs_device_t *device, - int32_t qp_num, - ib_verbs_peer_t *peer) -{ - struct _qpent *ent; - ib_verbs_qpreg_t *qpreg = &device->qpreg; - int32_t hash = qp_num % 42; - - pthread_mutex_lock (&qpreg->lock); - ent = qpreg->ents[hash].next; - while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) - ent = ent->next; - if (ent->qp_num == qp_num) { - pthread_mutex_unlock (&qpreg->lock); - return; - } - ent = (struct _qpent *) GF_CALLOC (1, sizeof (*ent), gf_ibv_mt_qpent); - ERR_ABORT (ent); - /* TODO: ref reg->peer */ - ent->peer = peer; - ent->next = &qpreg->ents[hash]; - ent->prev = ent->next->prev; - ent->next->prev = ent; - ent->prev->next = ent; - ent->qp_num = qp_num; - qpreg->count++; - pthread_mutex_unlock (&qpreg->lock); -} - - -static void -ib_verbs_unregister_peer (ib_verbs_device_t *device, - int32_t qp_num) -{ - struct _qpent *ent; - ib_verbs_qpreg_t *qpreg = &device->qpreg; - int32_t hash = qp_num % 42; - - pthread_mutex_lock (&qpreg->lock); - ent = qpreg->ents[hash].next; - while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) - ent = ent->next; - if (ent->qp_num != qp_num) { - pthread_mutex_unlock (&qpreg->lock); - return; - } - ent->prev->next = ent->next; - ent->next->prev = ent->prev; - /* TODO: unref reg->peer */ - GF_FREE (ent); - qpreg->count--; - pthread_mutex_unlock (&qpreg->lock); -} - - -static ib_verbs_peer_t * -__ib_verbs_lookup_peer (ib_verbs_device_t *device, int32_t qp_num) -{ - struct _qpent *ent = NULL; - ib_verbs_peer_t *peer = NULL; - ib_verbs_qpreg_t *qpreg = NULL; - int32_t hash = 0; - - qpreg = &device->qpreg; - hash = qp_num % 42; - ent = qpreg->ents[hash].next; - 
while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) - ent = ent->next; - - if (ent != &qpreg->ents[hash]) { - peer = ent->peer; - } - - return peer; -} - -/* -static ib_verbs_peer_t * -ib_verbs_lookup_peer (ib_verbs_device_t *device, - int32_t qp_num) -{ - ib_verbs_qpreg_t *qpreg = NULL; - ib_verbs_peer_t *peer = NULL; - - qpreg = &device->qpreg; - pthread_mutex_lock (&qpreg->lock); - { - peer = __ib_verbs_lookup_peer (device, qp_num); - } - pthread_mutex_unlock (&qpreg->lock); - - return peer; -} -*/ - - -static void -__ib_verbs_destroy_qp (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - - if (priv->peer.qp) { - ib_verbs_unregister_peer (priv->device, priv->peer.qp->qp_num); - ibv_destroy_qp (priv->peer.qp); - } - priv->peer.qp = NULL; - - return; -} - - -static int32_t -ib_verbs_create_qp (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - ib_verbs_options_t *options = &priv->options; - ib_verbs_device_t *device = priv->device; - int32_t ret = 0; - ib_verbs_peer_t *peer; - - peer = &priv->peer; - struct ibv_qp_init_attr init_attr = { - .send_cq = device->send_cq, - .recv_cq = device->recv_cq, - .srq = device->srq, - .cap = { - .max_send_wr = peer->send_count, - .max_recv_wr = peer->recv_count, - .max_send_sge = 1, - .max_recv_sge = 1 - }, - .qp_type = IBV_QPT_RC - }; - - struct ibv_qp_attr attr = { - .qp_state = IBV_QPS_INIT, - .pkey_index = 0, - .port_num = options->port, - .qp_access_flags = 0 - }; - - peer->qp = ibv_create_qp (device->pd, &init_attr); - if (!peer->qp) { - gf_log ("transport/ib-verbs", - GF_LOG_CRITICAL, - "%s: could not create QP", - this->xl->name); - ret = -1; - goto out; - } else if (ibv_modify_qp (peer->qp, &attr, - IBV_QP_STATE | - IBV_QP_PKEY_INDEX | - IBV_QP_PORT | - IBV_QP_ACCESS_FLAGS)) { - gf_log ("transport/ib-verbs", - GF_LOG_ERROR, - "%s: failed to modify QP to INIT state", - this->xl->name); - ret = -1; - goto out; - } - - peer->local_lid = ib_verbs_get_local_lid (device->context, - 
options->port); - peer->local_qpn = peer->qp->qp_num; - peer->local_psn = lrand48 () & 0xffffff; - - ib_verbs_register_peer (device, peer->qp->qp_num, peer); - -out: - if (ret == -1) - __ib_verbs_destroy_qp (this); - - return ret; -} - - -static void -ib_verbs_destroy_posts (transport_t *this) -{ - -} - - -static int32_t -__ib_verbs_create_posts (transport_t *this, - int32_t count, - int32_t size, - ib_verbs_queue_t *q) -{ - int32_t i; - int32_t ret = 0; - ib_verbs_private_t *priv = this->private; - ib_verbs_device_t *device = priv->device; - - for (i=0 ; ixl->name); - ret = -1; - break; - } - - ib_verbs_put_post (q, post); - } - return ret; -} - - -static int32_t -ib_verbs_create_posts (transport_t *this) -{ - int32_t i, ret; - ib_verbs_post_t *post = NULL; - ib_verbs_private_t *priv = this->private; - ib_verbs_options_t *options = &priv->options; - ib_verbs_device_t *device = priv->device; - - ret = __ib_verbs_create_posts (this, options->send_count, - options->send_size, - &device->sendq); - if (!ret) - ret = __ib_verbs_create_posts (this, options->recv_count, - options->recv_size, - &device->recvq); - - if (!ret) { - for (i=0 ; irecv_count ; i++) { - post = ib_verbs_get_post (&device->recvq); - if (ib_verbs_post_recv (device->srq, post) != 0) { - ret = -1; - break; - } - } - } - - if (ret) - ib_verbs_destroy_posts (this); - - return ret; -} - - -static int32_t -ib_verbs_connect_qp (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - ib_verbs_options_t *options = &priv->options; - struct ibv_qp_attr attr = { - .qp_state = IBV_QPS_RTR, - .path_mtu = options->mtu, - .dest_qp_num = priv->peer.remote_qpn, - .rq_psn = priv->peer.remote_psn, - .max_dest_rd_atomic = 1, - .min_rnr_timer = 12, - .ah_attr = { - .is_global = 0, - .dlid = priv->peer.remote_lid, - .sl = 0, - .src_path_bits = 0, - .port_num = options->port - } - }; - if (ibv_modify_qp (priv->peer.qp, &attr, - IBV_QP_STATE | - IBV_QP_AV | - IBV_QP_PATH_MTU | - IBV_QP_DEST_QPN | - IBV_QP_RQ_PSN 
| - IBV_QP_MAX_DEST_RD_ATOMIC | - IBV_QP_MIN_RNR_TIMER)) { - gf_log ("transport/ib-verbs", - GF_LOG_CRITICAL, - "Failed to modify QP to RTR\n"); - return -1; - } - - /* TODO: make timeout and retry_cnt configurable from options */ - attr.qp_state = IBV_QPS_RTS; - attr.timeout = 14; - attr.retry_cnt = 7; - attr.rnr_retry = 7; - attr.sq_psn = priv->peer.local_psn; - attr.max_rd_atomic = 1; - if (ibv_modify_qp (priv->peer.qp, &attr, - IBV_QP_STATE | - IBV_QP_TIMEOUT | - IBV_QP_RETRY_CNT | - IBV_QP_RNR_RETRY | - IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC)) { - gf_log ("transport/ib-verbs", - GF_LOG_CRITICAL, - "Failed to modify QP to RTS\n"); - return -1; - } - - return 0; -} - -static int32_t -__ib_verbs_teardown (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - - __ib_verbs_destroy_qp (this); - - if (!list_empty (&priv->peer.ioq)) { - __ib_verbs_ioq_flush (&priv->peer); - } - - /* TODO: decrement cq size */ - return 0; -} - -/* - * return value: - * 0 = success (completed) - * -1 = error - * > 0 = incomplete - */ - -static int -__tcp_rwv (transport_t *this, struct iovec *vector, int count, - struct iovec **pending_vector, int *pending_count, - int write) -{ - ib_verbs_private_t *priv = NULL; - int sock = -1; - int ret = -1; - struct iovec *opvector = vector; - int opcount = count; - int moved = 0; - - priv = this->private; - sock = priv->sock; - - while (opcount) - { - if (write) - { - ret = writev (sock, opvector, opcount); - - if (ret == 0 || (ret == -1 && errno == EAGAIN)) - { - /* done for now */ - break; - } - } - else - { - ret = readv (sock, opvector, opcount); - - if (ret == -1 && errno == EAGAIN) - { - /* done for now */ - break; - } - } - - if (ret == 0) - { - gf_log (this->xl->name, GF_LOG_DEBUG, - "EOF from peer %s", this->peerinfo.identifier); - opcount = -1; - errno = ENOTCONN; - break; - } - - if (ret == -1) - { - if (errno == EINTR) - continue; - - gf_log (this->xl->name, GF_LOG_DEBUG, - "%s failed (%s)", write ? 
"writev" : "readv", - strerror (errno)); - if (write && !priv->connected && - (errno == ECONNREFUSED)) - gf_log (this->xl->name, GF_LOG_ERROR, - "possible mismatch of 'transport-type'" - " in protocol server and client. " - "check volume file"); - opcount = -1; - break; - } - - moved = 0; - - while (moved < ret) - { - if ((ret - moved) >= opvector[0].iov_len) - { - moved += opvector[0].iov_len; - opvector++; - opcount--; - } - else - { - opvector[0].iov_len -= (ret - moved); - opvector[0].iov_base += (ret - moved); - moved += (ret - moved); - } - while (opcount && !opvector[0].iov_len) - { - opvector++; - opcount--; - } - } - } - - if (pending_vector) - *pending_vector = opvector; - - if (pending_count) - *pending_count = opcount; - - return opcount; -} - - -static int -__tcp_readv (transport_t *this, struct iovec *vector, int count, - struct iovec **pending_vector, int *pending_count) -{ - int ret = -1; - - ret = __tcp_rwv (this, vector, count, - pending_vector, pending_count, 0); - - return ret; -} - - -static int -__tcp_writev (transport_t *this, struct iovec *vector, int count, - struct iovec **pending_vector, int *pending_count) -{ - int ret = -1; - ib_verbs_private_t *priv = this->private; - - ret = __tcp_rwv (this, vector, count, pending_vector, - pending_count, 1); - - if (ret > 0) { - /* TODO: Avoid multiple calls when socket is already - registered for POLLOUT */ - priv->idx = event_select_on (this->xl->ctx->event_pool, - priv->sock, priv->idx, -1, 1); - } else if (ret == 0) { - priv->idx = event_select_on (this->xl->ctx->event_pool, - priv->sock, - priv->idx, -1, 0); - } - - return ret; -} - - -static void * -ib_verbs_recv_completion_proc (void *data) -{ - struct ibv_comp_channel *chan = data; - ib_verbs_private_t *priv = NULL; - ib_verbs_device_t *device; - ib_verbs_post_t *post; - ib_verbs_peer_t *peer; - struct ibv_cq *event_cq; - struct ibv_wc wc; - void *event_ctx; - int32_t ret = 0; - - - while (1) { - ret = ibv_get_cq_event (chan, &event_cq, 
&event_ctx); - if (ret) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "ibv_get_cq_event failed, terminating recv " - "thread %d (%d)", ret, errno); - continue; - } - - device = event_ctx; - - ret = ibv_req_notify_cq (event_cq, 0); - if (ret) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "ibv_req_notify_cq on %s failed, terminating " - "recv thread: %d (%d)", - device->device_name, ret, errno); - continue; - } - - device = (ib_verbs_device_t *) event_ctx; - - while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) { - post = (ib_verbs_post_t *) (long) wc.wr_id; - - pthread_mutex_lock (&device->qpreg.lock); - { - peer = __ib_verbs_lookup_peer (device, - wc.qp_num); - - /* - * keep a refcount on transport so that it - * doesnot get freed because of some error - * indicated by wc.status till we are done - * with usage of peer and thereby that of trans. - */ - if (peer != NULL) { - transport_ref (peer->trans); - } - } - pthread_mutex_unlock (&device->qpreg.lock); - - if (wc.status != IBV_WC_SUCCESS) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "recv work request on `%s' returned " - "error (%d)", - device->device_name, - wc.status); - if (peer) { - transport_unref (peer->trans); - transport_disconnect (peer->trans); - } - - if (post) { - ib_verbs_post_recv (device->srq, post); - } - continue; - } - - if (peer) { - priv = peer->trans->private; - - pthread_mutex_lock (&priv->recv_mutex); - { - while (priv->data_ptr) - pthread_cond_wait (&priv->recv_cond, - &priv->recv_mutex); - - priv->data_ptr = post->buf; - priv->data_offset = 0; - priv->data_len = wc.byte_len; - - /*pthread_cond_broadcast (&priv->recv_cond);*/ - } - pthread_mutex_unlock (&priv->recv_mutex); - - if ((ret = xlator_notify (peer->trans->xl, GF_EVENT_POLLIN, - peer->trans, NULL)) == -1) { - gf_log ("transport/ib-verbs", - GF_LOG_DEBUG, - "pollin notification to %s " - "failed, disconnecting " - "transport", - peer->trans->xl->name); - transport_disconnect (peer->trans); - } - - transport_unref 
(peer->trans); - } else { - gf_log ("transport/ib-verbs", - GF_LOG_DEBUG, - "could not lookup peer for qp_num: %d", - wc.qp_num); - } - ib_verbs_post_recv (device->srq, post); - } - - if (ret < 0) { - gf_log ("transport/ib-verbs", - GF_LOG_ERROR, - "ibv_poll_cq on `%s' returned error " - "(ret = %d, errno = %d)", - device->device_name, ret, errno); - continue; - } - ibv_ack_cq_events (event_cq, 1); - } - return NULL; -} - - -static void * -ib_verbs_send_completion_proc (void *data) -{ - struct ibv_comp_channel *chan = data; - ib_verbs_post_t *post; - ib_verbs_peer_t *peer; - struct ibv_cq *event_cq; - void *event_ctx; - ib_verbs_device_t *device; - struct ibv_wc wc; - int32_t ret; - - while (1) { - ret = ibv_get_cq_event (chan, &event_cq, &event_ctx); - if (ret) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "ibv_get_cq_event on failed, terminating " - "send thread: %d (%d)", ret, errno); - continue; - } - - device = event_ctx; - - ret = ibv_req_notify_cq (event_cq, 0); - if (ret) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "ibv_req_notify_cq on %s failed, terminating " - "send thread: %d (%d)", - device->device_name, ret, errno); - continue; - } - - while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) { - post = (ib_verbs_post_t *) (long) wc.wr_id; - - pthread_mutex_lock (&device->qpreg.lock); - { - peer = __ib_verbs_lookup_peer (device, - wc.qp_num); - - /* - * keep a refcount on transport so that it - * doesnot get freed because of some error - * indicated by wc.status till we are done - * with usage of peer and thereby that of trans. 
- */ - if (peer != NULL) { - transport_ref (peer->trans); - } - } - pthread_mutex_unlock (&device->qpreg.lock); - - if (wc.status != IBV_WC_SUCCESS) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "send work request on `%s' returned " - "error wc.status = %d, wc.vendor_err " - "= %d, post->buf = %p, wc.byte_len = " - "%d, post->reused = %d", - device->device_name, wc.status, - wc.vendor_err, - post->buf, wc.byte_len, post->reused); - if (wc.status == IBV_WC_RETRY_EXC_ERR) - gf_log ("ib-verbs", GF_LOG_ERROR, - "connection between client and" - " server not working. check by" - " running 'ibv_srq_pingpong'. " - "also make sure subnet manager" - " is running (eg: 'opensm'), " - "or check if ib-verbs port is " - "valid (or active) by running " - " 'ibv_devinfo'. contact " - "Gluster Support Team if " - "the problem persists."); - if (peer) - transport_disconnect (peer->trans); - } - - if (post) { - ib_verbs_put_post (&device->sendq, post); - } - - if (peer) { - int quota_ret = ib_verbs_quota_put (peer); - if (quota_ret < 0) { - gf_log ("ib-verbs", GF_LOG_DEBUG, - "failed to send message"); - - } - - transport_unref (peer->trans); - } else { - gf_log ("transport/ib-verbs", GF_LOG_DEBUG, - "could not lookup peer for qp_num: %d", - wc.qp_num); - } - } - - if (ret < 0) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "ibv_poll_cq on `%s' returned error (ret = %d," - " errno = %d)", - device->device_name, ret, errno); - continue; - } - ibv_ack_cq_events (event_cq, 1); - } - - return NULL; -} - -static void -ib_verbs_options_init (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - ib_verbs_options_t *options = &priv->options; - int32_t mtu; - data_t *temp; - - /* TODO: validate arguments from options below */ - - options->send_size = this->xl->ctx->page_size * 4; /* 512 KB */ - options->recv_size = this->xl->ctx->page_size * 4; /* 512 KB */ - options->send_count = 32; - options->recv_count = 32; - - temp = dict_get (this->xl->options, - 
"transport.ib-verbs.work-request-send-count"); - if (temp) - options->send_count = data_to_int32 (temp); - - temp = dict_get (this->xl->options, - "transport.ib-verbs.work-request-recv-count"); - if (temp) - options->recv_count = data_to_int32 (temp); - - options->port = 0; - temp = dict_get (this->xl->options, - "transport.ib-verbs.port"); - if (temp) - options->port = data_to_uint64 (temp); - - options->mtu = mtu = IBV_MTU_2048; - temp = dict_get (this->xl->options, - "transport.ib-verbs.mtu"); - if (temp) - mtu = data_to_int32 (temp); - switch (mtu) { - case 256: options->mtu = IBV_MTU_256; - break; - case 512: options->mtu = IBV_MTU_512; - break; - case 1024: options->mtu = IBV_MTU_1024; - break; - case 2048: options->mtu = IBV_MTU_2048; - break; - case 4096: options->mtu = IBV_MTU_4096; - break; - default: - if (temp) - gf_log ("transport/ib-verbs", GF_LOG_WARNING, - "%s: unrecognized MTU value '%s', defaulting " - "to '2048'", this->xl->name, - data_to_str (temp)); - else - gf_log ("transport/ib-verbs", GF_LOG_TRACE, - "%s: defaulting MTU to '2048'", - this->xl->name); - options->mtu = IBV_MTU_2048; - break; - } - - temp = dict_get (this->xl->options, - "transport.ib-verbs.device-name"); - if (temp) - options->device_name = gf_strdup (temp->data); - - return; -} - -static void -ib_verbs_queue_init (ib_verbs_queue_t *queue) -{ - pthread_mutex_init (&queue->lock, NULL); - - queue->active_posts.next = &queue->active_posts; - queue->active_posts.prev = &queue->active_posts; - queue->passive_posts.next = &queue->passive_posts; - queue->passive_posts.prev = &queue->passive_posts; -} - - -static ib_verbs_device_t * -ib_verbs_get_device (transport_t *this, - struct ibv_context *ibctx) -{ - glusterfs_ctx_t *ctx = this->xl->ctx; - ib_verbs_private_t *priv = this->private; - ib_verbs_options_t *options = &priv->options; - char *device_name = priv->options.device_name; - uint32_t port = priv->options.port; - - uint8_t active_port = 0; - int32_t ret = 0; - int32_t i = 0; 
- - ib_verbs_device_t *trav; - - trav = ctx->ib; - while (trav) { - if ((!strcmp (trav->device_name, device_name)) && - (trav->port == port)) - break; - trav = trav->next; - } - - if (!trav) { - - trav = GF_CALLOC (1, sizeof (*trav), - gf_ibv_mt_ib_verbs_device_t); - ERR_ABORT (trav); - priv->device = trav; - - trav->context = ibctx; - - ret = ib_get_active_port (trav->context); - - if (ret < 0) { - if (!port) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "Failed to find any active ports and " - "none specified in volume file," - " exiting"); - return NULL; - } - } - - active_port = ret; - - if (port) { - ret = ib_check_active_port (trav->context, port); - if (ret < 0) { - gf_log ("transport/ib-verbs", GF_LOG_WARNING, - "On device %s: provided port:%u is " - "found to be offline, continuing to " - "use the same port", device_name, port); - } - } else { - priv->options.port = active_port; - port = active_port; - gf_log ("transport/ib-verbs", GF_LOG_TRACE, - "Port unspecified in volume file using active " - "port: %u", port); - } - - trav->device_name = gf_strdup (device_name); - trav->port = port; - - trav->next = ctx->ib; - ctx->ib = trav; - - trav->send_chan = ibv_create_comp_channel (trav->context); - if (!trav->send_chan) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "%s: could not create send completion channel", - device_name); - /* TODO: cleanup current mess */ - return NULL; - } - - trav->recv_chan = ibv_create_comp_channel (trav->context); - if (!trav->recv_chan) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "could not create recv completion channel"); - /* TODO: cleanup current mess */ - return NULL; - } - - if (ib_verbs_create_cq (this) < 0) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "%s: could not create CQ", - this->xl->name); - return NULL; - } - - /* protection domain */ - trav->pd = ibv_alloc_pd (trav->context); - - if (!trav->pd) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "%s: could not allocate protection domain", - 
this->xl->name); - return NULL; - } - - struct ibv_srq_init_attr attr = { - .attr = { - .max_wr = options->recv_count, - .max_sge = 1 - } - }; - trav->srq = ibv_create_srq (trav->pd, &attr); - - if (!trav->srq) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "%s: could not create SRQ", - this->xl->name); - return NULL; - } - - /* queue init */ - ib_verbs_queue_init (&trav->sendq); - ib_verbs_queue_init (&trav->recvq); - - if (ib_verbs_create_posts (this) < 0) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "%s: could not allocate posts", - this->xl->name); - return NULL; - } - - /* completion threads */ - ret = pthread_create (&trav->send_thread, - NULL, - ib_verbs_send_completion_proc, - trav->send_chan); - if (ret) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "could not create send completion thread"); - return NULL; - } - ret = pthread_create (&trav->recv_thread, - NULL, - ib_verbs_recv_completion_proc, - trav->recv_chan); - if (ret) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "could not create recv completion thread"); - return NULL; - } - - /* qpreg */ - pthread_mutex_init (&trav->qpreg.lock, NULL); - for (i=0; i<42; i++) { - trav->qpreg.ents[i].next = &trav->qpreg.ents[i]; - trav->qpreg.ents[i].prev = &trav->qpreg.ents[i]; - } - } - return trav; -} - -static int32_t -ib_verbs_init (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - ib_verbs_options_t *options = &priv->options; - struct ibv_device **dev_list; - struct ibv_context *ib_ctx = NULL; - int32_t ret = 0; - - ib_verbs_options_init (this); - - { - dev_list = ibv_get_device_list (NULL); - - if (!dev_list) { - gf_log ("transport/ib-verbs", - GF_LOG_CRITICAL, - "Failed to get IB devices"); - ret = -1; - goto cleanup; - } - - if (!*dev_list) { - gf_log ("transport/ib-verbs", - GF_LOG_CRITICAL, - "No IB devices found"); - ret = -1; - goto cleanup; - } - - if (!options->device_name) { - if (*dev_list) { - options->device_name = - gf_strdup (ibv_get_device_name (*dev_list)); - 
} else { - gf_log ("transport/ib-verbs", GF_LOG_CRITICAL, - "IB device list is empty. Check for " - "'ib_uverbs' module"); - return -1; - goto cleanup; - } - } - - while (*dev_list) { - if (!strcmp (ibv_get_device_name (*dev_list), - options->device_name)) { - ib_ctx = ibv_open_device (*dev_list); - - if (!ib_ctx) { - gf_log ("transport/ib-verbs", - GF_LOG_ERROR, - "Failed to get infiniband" - "device context"); - ret = -1; - goto cleanup; - } - break; - } - ++dev_list; - } - - priv->device = ib_verbs_get_device (this, ib_ctx); - - if (!priv->device) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "could not create ib_verbs device for %s", - priv->device->device_name); - ret = -1; - goto cleanup; - } - } - - priv->peer.trans = this; - INIT_LIST_HEAD (&priv->peer.ioq); - - pthread_mutex_init (&priv->read_mutex, NULL); - pthread_mutex_init (&priv->write_mutex, NULL); - pthread_mutex_init (&priv->recv_mutex, NULL); - pthread_cond_init (&priv->recv_cond, NULL); - -cleanup: - if (-1 == ret) { - if (ib_ctx) - ibv_close_device (ib_ctx); - } - - if (dev_list) - ibv_free_device_list (dev_list); - - return ret; -} - - -static int32_t -ib_verbs_disconnect (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - int32_t ret = 0; - - pthread_mutex_lock (&priv->write_mutex); - { - ret = __ib_verbs_disconnect (this); - } - pthread_mutex_unlock (&priv->write_mutex); - - return ret; -} - - -static int32_t -__tcp_connect_finish (int fd) -{ - int ret = -1; - int optval = 0; - socklen_t optlen = sizeof (int); - - ret = getsockopt (fd, SOL_SOCKET, SO_ERROR, - (void *)&optval, &optlen); - - if (ret == 0 && optval) - { - errno = optval; - ret = -1; - } - - return ret; -} - -static inline void -ib_verbs_fill_handshake_data (char *buf, struct ib_verbs_nbio *nbio, - ib_verbs_private_t *priv) -{ - sprintf (buf, - "QP1:RECV_BLKSIZE=%08x:SEND_BLKSIZE=%08x\n" - "QP1:LID=%04x:QPN=%06x:PSN=%06x\n", - priv->peer.recv_size, - priv->peer.send_size, - priv->peer.local_lid, - 
priv->peer.local_qpn, - priv->peer.local_psn); - - nbio->vector.iov_base = buf; - nbio->vector.iov_len = strlen (buf) + 1; - nbio->count = 1; - return; -} - -static inline void -ib_verbs_fill_handshake_ack (char *buf, struct ib_verbs_nbio *nbio) -{ - sprintf (buf, "DONE\n"); - nbio->vector.iov_base = buf; - nbio->vector.iov_len = strlen (buf) + 1; - nbio->count = 1; - return; -} - -static int -ib_verbs_handshake_pollin (transport_t *this) -{ - int ret = 0; - ib_verbs_private_t *priv = this->private; - char *buf = priv->handshake.incoming.buf; - int32_t recv_buf_size, send_buf_size; - socklen_t sock_len; - - if (priv->handshake.incoming.state == IB_VERBS_HANDSHAKE_COMPLETE) { - return -1; - } - - pthread_mutex_lock (&priv->write_mutex); - { - while (priv->handshake.incoming.state != IB_VERBS_HANDSHAKE_COMPLETE) - { - switch (priv->handshake.incoming.state) - { - case IB_VERBS_HANDSHAKE_START: - buf = priv->handshake.incoming.buf = GF_CALLOC (1, 256, gf_ibv_mt_char); - ib_verbs_fill_handshake_data (buf, &priv->handshake.incoming, priv); - buf[0] = 0; - priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVING_DATA; - break; - - case IB_VERBS_HANDSHAKE_RECEIVING_DATA: - ret = __tcp_readv (this, - &priv->handshake.incoming.vector, - priv->handshake.incoming.count, - &priv->handshake.incoming.pending_vector, - &priv->handshake.incoming.pending_count); - if (ret == -1) { - goto unlock; - } - - if (ret > 0) { - gf_log (this->xl->name, GF_LOG_TRACE, - "partial header read on NB socket. 
continue later"); - goto unlock; - } - - if (!ret) { - priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVED_DATA; - } - break; - - case IB_VERBS_HANDSHAKE_RECEIVED_DATA: - ret = sscanf (buf, - "QP1:RECV_BLKSIZE=%08x:SEND_BLKSIZE=%08x\n" - "QP1:LID=%04x:QPN=%06x:PSN=%06x\n", - &recv_buf_size, - &send_buf_size, - &priv->peer.remote_lid, - &priv->peer.remote_qpn, - &priv->peer.remote_psn); - - if ((ret != 5) && (strncmp (buf, "QP1:", 4))) { - gf_log ("transport/ib-verbs", - GF_LOG_CRITICAL, - "%s: remote-host(%s)'s " - "transport type is different", - this->xl->name, - this->peerinfo.identifier); - ret = -1; - goto unlock; - } - - if (recv_buf_size < priv->peer.recv_size) - priv->peer.recv_size = recv_buf_size; - if (send_buf_size < priv->peer.send_size) - priv->peer.send_size = send_buf_size; - - gf_log ("transport/ib-verbs", GF_LOG_TRACE, - "%s: transacted recv_size=%d " - "send_size=%d", - this->xl->name, priv->peer.recv_size, - priv->peer.send_size); - - priv->peer.quota = priv->peer.send_count; - - if (ib_verbs_connect_qp (this)) { - gf_log ("transport/ib-verbs", - GF_LOG_ERROR, - "%s: failed to connect with " - "remote QP", this->xl->name); - ret = -1; - goto unlock; - } - ib_verbs_fill_handshake_ack (buf, &priv->handshake.incoming); - buf[0] = 0; - priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVING_ACK; - break; - - case IB_VERBS_HANDSHAKE_RECEIVING_ACK: - ret = __tcp_readv (this, - &priv->handshake.incoming.vector, - priv->handshake.incoming.count, - &priv->handshake.incoming.pending_vector, - &priv->handshake.incoming.pending_count); - if (ret == -1) { - goto unlock; - } - - if (ret > 0) { - gf_log (this->xl->name, GF_LOG_TRACE, - "partial header read on NB " - "socket. 
continue later"); - goto unlock; - } - - if (!ret) { - priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVED_ACK; - } - break; - - case IB_VERBS_HANDSHAKE_RECEIVED_ACK: - if (strncmp (buf, "DONE", 4)) { - gf_log ("transport/ib-verbs", - GF_LOG_DEBUG, - "%s: handshake-3 did not " - "return 'DONE' (%s)", - this->xl->name, buf); - ret = -1; - goto unlock; - } - ret = 0; - priv->connected = 1; - sock_len = sizeof (struct sockaddr_storage); - getpeername (priv->sock, - (struct sockaddr *) &this->peerinfo.sockaddr, - &sock_len); - - GF_FREE (priv->handshake.incoming.buf); - priv->handshake.incoming.buf = NULL; - priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_COMPLETE; - } - } - } -unlock: - pthread_mutex_unlock (&priv->write_mutex); - - if (ret == -1) { - transport_disconnect (this); - } else { - ret = 0; - } - - if (!ret && priv->connected) { - ret = xlator_notify (this->xl, GF_EVENT_CHILD_UP, this); - } - - return ret; -} - -static int -ib_verbs_handshake_pollout (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - char *buf = priv->handshake.outgoing.buf; - int32_t ret = 0; - - if (priv->handshake.outgoing.state == IB_VERBS_HANDSHAKE_COMPLETE) { - return 0; - } - - pthread_mutex_unlock (&priv->write_mutex); - { - while (priv->handshake.outgoing.state != IB_VERBS_HANDSHAKE_COMPLETE) - { - switch (priv->handshake.outgoing.state) - { - case IB_VERBS_HANDSHAKE_START: - buf = priv->handshake.outgoing.buf = GF_CALLOC (1, 256, gf_ibv_mt_char); - ib_verbs_fill_handshake_data (buf, &priv->handshake.outgoing, priv); - priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENDING_DATA; - break; - - case IB_VERBS_HANDSHAKE_SENDING_DATA: - ret = __tcp_writev (this, - &priv->handshake.outgoing.vector, - priv->handshake.outgoing.count, - &priv->handshake.outgoing.pending_vector, - &priv->handshake.outgoing.pending_count); - if (ret == -1) { - goto unlock; - } - - if (ret > 0) { - gf_log (this->xl->name, GF_LOG_TRACE, - "partial header read on NB socket. 
continue later"); - goto unlock; - } - - if (!ret) { - priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENT_DATA; - } - break; - - case IB_VERBS_HANDSHAKE_SENT_DATA: - ib_verbs_fill_handshake_ack (buf, &priv->handshake.outgoing); - priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENDING_ACK; - break; - - case IB_VERBS_HANDSHAKE_SENDING_ACK: - ret = __tcp_writev (this, - &priv->handshake.outgoing.vector, - priv->handshake.outgoing.count, - &priv->handshake.outgoing.pending_vector, - &priv->handshake.outgoing.pending_count); - - if (ret == -1) { - goto unlock; - } - - if (ret > 0) { - gf_log (this->xl->name, GF_LOG_TRACE, - "partial header read on NB " - "socket. continue later"); - goto unlock; - } - - if (!ret) { - GF_FREE (priv->handshake.outgoing.buf); - priv->handshake.outgoing.buf = NULL; - priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_COMPLETE; - } - break; - } - } - } -unlock: - pthread_mutex_unlock (&priv->write_mutex); - - if (ret == -1) { - transport_disconnect (this); - } else { - ret = 0; - } - - return ret; -} - -static int -ib_verbs_handshake_pollerr (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - int32_t ret = 0; - char need_unref = 0; - - gf_log ("transport/ib-verbs", GF_LOG_DEBUG, - "%s: peer disconnected, cleaning up", - this->xl->name); - - pthread_mutex_lock (&priv->write_mutex); - { - __ib_verbs_teardown (this); - - if (priv->sock != -1) { - event_unregister (this->xl->ctx->event_pool, - priv->sock, priv->idx); - need_unref = 1; - - if (close (priv->sock) != 0) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "close () - error: %s", - strerror (errno)); - ret = -errno; - } - priv->tcp_connected = priv->connected = 0; - priv->sock = -1; - } - - if (priv->handshake.incoming.buf) { - GF_FREE (priv->handshake.incoming.buf); - priv->handshake.incoming.buf = NULL; - } - - priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; - - if (priv->handshake.outgoing.buf) { - GF_FREE (priv->handshake.outgoing.buf); - 
priv->handshake.outgoing.buf = NULL; - } - - priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; - } - pthread_mutex_unlock (&priv->write_mutex); - - xlator_notify (this->xl, GF_EVENT_POLLERR, this, NULL); - - if (need_unref) - transport_unref (this); - - return 0; -} - - -static int -tcp_connect_finish (transport_t *this) -{ - ib_verbs_private_t *priv = this->private; - int error = 0, ret = 0; - - pthread_mutex_lock (&priv->write_mutex); - { - ret = __tcp_connect_finish (priv->sock); - - if (!ret) { - this->myinfo.sockaddr_len = - sizeof (this->myinfo.sockaddr); - ret = getsockname (priv->sock, - (struct sockaddr *)&this->myinfo.sockaddr, - &this->myinfo.sockaddr_len); - if (ret == -1) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "getsockname on new client-socket %d " - "failed (%s)", - priv->sock, strerror (errno)); - close (priv->sock); - error = 1; - goto unlock; - } - - get_transport_identifiers (this); - priv->tcp_connected = 1; - } - - if (ret == -1 && errno != EINPROGRESS) { - gf_log (this->xl->name, GF_LOG_ERROR, - "tcp connect to %s failed (%s)", - this->peerinfo.identifier, strerror (errno)); - error = 1; - } - } -unlock: - pthread_mutex_unlock (&priv->write_mutex); - - if (error) { - transport_disconnect (this); - } - - return ret; -} - -static int -ib_verbs_event_handler (int fd, int idx, void *data, - int poll_in, int poll_out, int poll_err) -{ - transport_t *this = data; - ib_verbs_private_t *priv = this->private; - ib_verbs_options_t *options = NULL; - int ret = 0; - - if (!priv->tcp_connected) { - ret = tcp_connect_finish (this); - if (priv->tcp_connected) { - options = &priv->options; - - priv->peer.send_count = options->send_count; - priv->peer.recv_count = options->recv_count; - priv->peer.send_size = options->send_size; - priv->peer.recv_size = options->recv_size; - - if ((ret = ib_verbs_create_qp (this)) < 0) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "%s: could not create QP", - this->xl->name); - transport_disconnect (this); - 
} - } - } - - if (!ret && poll_out && priv->tcp_connected) { - ret = ib_verbs_handshake_pollout (this); - } - - if (!ret && poll_in && priv->tcp_connected) { - if (priv->handshake.incoming.state == IB_VERBS_HANDSHAKE_COMPLETE) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "%s: pollin received on tcp socket (peer: %s) " - "after handshake is complete", - this->xl->name, this->peerinfo.identifier); - ib_verbs_handshake_pollerr (this); - return 0; - } - ret = ib_verbs_handshake_pollin (this); - } - - if (ret < 0 || poll_err) { - ret = ib_verbs_handshake_pollerr (this); - } - - return 0; -} - -static int -__tcp_nonblock (int fd) -{ - int flags = 0; - int ret = -1; - - flags = fcntl (fd, F_GETFL); - - if (flags != -1) - ret = fcntl (fd, F_SETFL, flags | O_NONBLOCK); - - return ret; -} - -static int32_t -ib_verbs_connect (struct transport *this) -{ - dict_t *options = this->xl->options; - - ib_verbs_private_t *priv = this->private; - - int32_t ret = 0; - gf_boolean_t non_blocking = 1; - struct sockaddr_storage sockaddr; - socklen_t sockaddr_len = 0; - - if (priv->connected) { - return 0; - } - - if (dict_get (options, "non-blocking-io")) { - char *nb_connect = data_to_str (dict_get (this->xl->options, - "non-blocking-io")); - - if (gf_string2boolean (nb_connect, &non_blocking) == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "'non-blocking-io' takes only boolean " - "options, not taking any action"); - non_blocking = 1; - } - } - - ret = ibverbs_client_get_remote_sockaddr (this, (struct sockaddr *)&sockaddr, - &sockaddr_len); - if (ret != 0) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "cannot get remote address to connect"); - return ret; - } - - pthread_mutex_lock (&priv->write_mutex); - { - if (priv->sock != -1) { - ret = 0; - goto unlock; - } - - priv->sock = socket (((struct sockaddr *)&sockaddr)->sa_family, - SOCK_STREAM, 0); - - if (priv->sock == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "socket () - error: %s", strerror (errno)); - ret = -errno; - goto 
unlock; - } - - gf_log (this->xl->name, GF_LOG_TRACE, - "socket fd = %d", priv->sock); - - memcpy (&this->peerinfo.sockaddr, &sockaddr, sockaddr_len); - this->peerinfo.sockaddr_len = sockaddr_len; - - ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = - ((struct sockaddr *)&this->peerinfo.sockaddr)->sa_family; - - if (non_blocking) - { - ret = __tcp_nonblock (priv->sock); - - if (ret == -1) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "could not set socket %d to non " - "blocking mode (%s)", - priv->sock, strerror (errno)); - close (priv->sock); - priv->sock = -1; - goto unlock; - } - } - - ret = client_bind (this, - (struct sockaddr *)&this->myinfo.sockaddr, - &this->myinfo.sockaddr_len, priv->sock); - if (ret == -1) - { - gf_log (this->xl->name, GF_LOG_WARNING, - "client bind failed: %s", strerror (errno)); - close (priv->sock); - priv->sock = -1; - goto unlock; - } - - ret = connect (priv->sock, - (struct sockaddr *)&this->peerinfo.sockaddr, - this->peerinfo.sockaddr_len); - if (ret == -1 && errno != EINPROGRESS) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "connection attempt failed (%s)", - strerror (errno)); - close (priv->sock); - priv->sock = -1; - goto unlock; - } - - priv->tcp_connected = priv->connected = 0; - - transport_ref (this); - - priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; - priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; - - priv->idx = event_register (this->xl->ctx->event_pool, - priv->sock, ib_verbs_event_handler, - this, 1, 1); - } -unlock: - pthread_mutex_unlock (&priv->write_mutex); - - return ret; -} - -static int -ib_verbs_server_event_handler (int fd, int idx, void *data, - int poll_in, int poll_out, int poll_err) -{ - int32_t main_sock = -1; - transport_t *this, *trans = data; - ib_verbs_private_t *priv = NULL; - ib_verbs_private_t *trans_priv = (ib_verbs_private_t *) trans->private; - ib_verbs_options_t *options = NULL; - - if (!poll_in) - return 0; - - this = GF_CALLOC (1, sizeof (transport_t), - 
gf_ibv_mt_transport_t); - ERR_ABORT (this); - priv = GF_CALLOC (1, sizeof (ib_verbs_private_t), - gf_ibv_mt_ib_verbs_private_t); - ERR_ABORT (priv); - this->private = priv; - /* Copy all the ib_verbs related values in priv, from trans_priv - as other than QP, all the values remain same */ - priv->device = trans_priv->device; - priv->options = trans_priv->options; - options = &priv->options; - - this->ops = trans->ops; - this->xl = trans->xl; - this->init = trans->init; - this->fini = trans->fini; - - memcpy (&this->myinfo.sockaddr, &trans->myinfo.sockaddr, - trans->myinfo.sockaddr_len); - this->myinfo.sockaddr_len = trans->myinfo.sockaddr_len; - - main_sock = (trans_priv)->sock; - this->peerinfo.sockaddr_len = sizeof (this->peerinfo.sockaddr); - priv->sock = accept (main_sock, - (struct sockaddr *)&this->peerinfo.sockaddr, - &this->peerinfo.sockaddr_len); - if (priv->sock == -1) { - gf_log ("ib-verbs/server", GF_LOG_ERROR, - "accept() failed: %s", - strerror (errno)); - GF_FREE (this->private); - GF_FREE (this); - return -1; - } - - priv->peer.trans = this; - transport_ref (this); - - get_transport_identifiers (this); - - priv->tcp_connected = 1; - priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; - priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; - - priv->peer.send_count = options->send_count; - priv->peer.recv_count = options->recv_count; - priv->peer.send_size = options->send_size; - priv->peer.recv_size = options->recv_size; - INIT_LIST_HEAD (&priv->peer.ioq); - - if (ib_verbs_create_qp (this) < 0) { - gf_log ("transport/ib-verbs", GF_LOG_ERROR, - "%s: could not create QP", - this->xl->name); - transport_disconnect (this); - return -1; - } - - priv->idx = event_register (this->xl->ctx->event_pool, priv->sock, - ib_verbs_event_handler, this, 1, 1); - - pthread_mutex_init (&priv->read_mutex, NULL); - pthread_mutex_init (&priv->write_mutex, NULL); - pthread_mutex_init (&priv->recv_mutex, NULL); - /* pthread_cond_init (&priv->recv_cond, NULL); 
*/ - - return 0; -} - -static int32_t -ib_verbs_listen (transport_t *this) -{ - struct sockaddr_storage sockaddr; - socklen_t sockaddr_len; - ib_verbs_private_t *priv = this->private; - int opt = 1, ret = 0; - char service[NI_MAXSERV], host[NI_MAXHOST]; - - memset (&sockaddr, 0, sizeof (sockaddr)); - ret = ibverbs_server_get_local_sockaddr (this, - (struct sockaddr *)&sockaddr, - &sockaddr_len); - if (ret != 0) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "cannot find network address of server to bind to"); - goto err; - } - - priv->sock = socket (((struct sockaddr *)&sockaddr)->sa_family, - SOCK_STREAM, 0); - if (priv->sock == -1) { - gf_log ("ib-verbs/server", GF_LOG_CRITICAL, - "init: failed to create socket, error: %s", - strerror (errno)); - GF_FREE (this->private); - ret = -1; - goto err; - } - - memcpy (&this->myinfo.sockaddr, &sockaddr, sockaddr_len); - this->myinfo.sockaddr_len = sockaddr_len; - - ret = getnameinfo ((struct sockaddr *)&this->myinfo.sockaddr, - this->myinfo.sockaddr_len, - host, sizeof (host), - service, sizeof (service), - NI_NUMERICHOST); - if (ret != 0) { - gf_log (this->xl->name, GF_LOG_ERROR, - "getnameinfo failed (%s)", gai_strerror (ret)); - goto err; - } - sprintf (this->myinfo.identifier, "%s:%s", host, service); - - setsockopt (priv->sock, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof (opt)); - if (bind (priv->sock, - (struct sockaddr *)&sockaddr, - sockaddr_len) != 0) { - ret = -1; - gf_log ("ib-verbs/server", GF_LOG_ERROR, - "init: failed to bind to socket for %s (%s)", - this->myinfo.identifier, strerror (errno)); - goto err; - } - - if (listen (priv->sock, 10) != 0) { - gf_log ("ib-verbs/server", GF_LOG_ERROR, - "init: listen () failed on socket for %s (%s)", - this->myinfo.identifier, strerror (errno)); - ret = -1; - goto err; - } - - /* Register the main socket */ - priv->idx = event_register (this->xl->ctx->event_pool, priv->sock, - ib_verbs_server_event_handler, - transport_ref (this), 1, 0); - -err: - return ret; -} - -struct 
transport_ops tops = { - .receive = ib_verbs_receive, - .submit = ib_verbs_submit, - .connect = ib_verbs_connect, - .disconnect = ib_verbs_disconnect, - .listen = ib_verbs_listen, -}; - -int32_t -init (transport_t *this) -{ - ib_verbs_private_t *priv = GF_CALLOC (1, sizeof (*priv), - gf_ibv_mt_ib_verbs_private_t); - this->private = priv; - priv->sock = -1; - - if (ib_verbs_init (this)) { - gf_log (this->xl->name, GF_LOG_ERROR, - "Failed to initialize IB Device"); - return -1; - } - - return 0; -} - -void -fini (struct transport *this) -{ - /* TODO: verify this function does graceful finish */ - ib_verbs_private_t *priv = this->private; - this->private = NULL; - - pthread_mutex_destroy (&priv->recv_mutex); - pthread_mutex_destroy (&priv->write_mutex); - pthread_mutex_destroy (&priv->read_mutex); - /* pthread_cond_destroy (&priv->recv_cond); */ - - gf_log (this->xl->name, GF_LOG_TRACE, - "called fini on transport: %p", - this); - GF_FREE (priv); - return; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_common_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -/* TODO: expand each option */ -struct volume_options options[] = { - { .key = {"transport.ib-verbs.port", - "ib-verbs-port"}, - .type = GF_OPTION_TYPE_INT, - .min = 1, - .max = 4, - .description = "check the option by 'ibv_devinfo'" - }, - { .key = {"transport.ib-verbs.mtu", - "ib-verbs-mtu"}, - .type = GF_OPTION_TYPE_INT, - }, - { .key = {"transport.ib-verbs.device-name", - "ib-verbs-device-name"}, - .type = GF_OPTION_TYPE_ANY, - .description = "check by 'ibv_devinfo'" - }, - { .key = {"transport.ib-verbs.work-request-send-count", - "ib-verbs-work-request-send-count"}, - .type = GF_OPTION_TYPE_INT, - }, - { .key = {"transport.ib-verbs.work-request-recv-count", - "ib-verbs-work-request-recv-count"}, - .type = GF_OPTION_TYPE_INT, - 
}, - { .key = {"remote-port", - "transport.remote-port", - "transport.ib-verbs.remote-port"}, - .type = GF_OPTION_TYPE_INT - }, - { .key = {"transport.ib-verbs.listen-port", "listen-port"}, - .type = GF_OPTION_TYPE_INT - }, - { .key = {"transport.ib-verbs.connect-path", "connect-path"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"transport.ib-verbs.bind-path", "bind-path"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"transport.ib-verbs.listen-path", "listen-path"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"transport.address-family", - "address-family"}, - .value = {"inet", "inet6", "inet/inet6", "inet6/inet", - "unix", "inet-sdp" }, - .type = GF_OPTION_TYPE_STR - }, - { .key = {NULL} } -}; diff --git a/transport/ib-verbs/src/ib-verbs.h b/transport/ib-verbs/src/ib-verbs.h deleted file mode 100644 index c385b62e..00000000 --- a/transport/ib-verbs/src/ib-verbs.h +++ /dev/null @@ -1,220 +0,0 @@ -/* - Copyright (c) 2006-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . 
-*/ - -#ifndef _XPORT_IB_VERBS_H -#define _XPORT_IB_VERBS_H - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#ifndef MAX_IOVEC -#define MAX_IOVEC 16 -#endif /* MAX_IOVEC */ - -#include "xlator.h" -#include "event.h" -#include "ib-verbs-mem-types.h" - -#include -#include -#include -#include - -#define GF_DEFAULT_IBVERBS_LISTEN_PORT 6997 - -/* options per transport end point */ -struct _ib_verbs_options { - int32_t port; - char *device_name; - enum ibv_mtu mtu; - int32_t send_count; - int32_t recv_count; - uint64_t recv_size; - uint64_t send_size; -}; -typedef struct _ib_verbs_options ib_verbs_options_t; - - -struct _ib_verbs_header { - char colonO[3]; - uint32_t size1; - uint32_t size2; - char version; -} __attribute__((packed)); -typedef struct _ib_verbs_header ib_verbs_header_t; - -struct _ib_verbs_ioq { - union { - struct list_head list; - struct { - struct _ib_verbs_ioq *next; - struct _ib_verbs_ioq *prev; - }; - }; - ib_verbs_header_t header; - struct iovec vector[MAX_IOVEC]; - int count; - char *buf; - struct iobref *iobref; -}; -typedef struct _ib_verbs_ioq ib_verbs_ioq_t; - -/* represents one communication peer, two per transport_t */ -struct _ib_verbs_peer { - transport_t *trans; - struct ibv_qp *qp; - - int32_t recv_count; - int32_t send_count; - int32_t recv_size; - int32_t send_size; - - int32_t quota; - union { - struct list_head ioq; - struct { - ib_verbs_ioq_t *ioq_next; - ib_verbs_ioq_t *ioq_prev; - }; - }; - - /* QP attributes, needed to connect with remote QP */ - int32_t local_lid; - int32_t local_psn; - int32_t local_qpn; - int32_t remote_lid; - int32_t remote_psn; - int32_t remote_qpn; -}; -typedef struct _ib_verbs_peer ib_verbs_peer_t; - - -struct _ib_verbs_post { - struct _ib_verbs_post *next, *prev; - struct ibv_mr *mr; - char *buf; - int32_t buf_size; - char aux; - int32_t reused; - pthread_barrier_t wait; -}; -typedef struct _ib_verbs_post ib_verbs_post_t; - - -struct _ib_verbs_queue { - ib_verbs_post_t 
active_posts, passive_posts; - int32_t active_count, passive_count; - pthread_mutex_t lock; -}; -typedef struct _ib_verbs_queue ib_verbs_queue_t; - - -struct _ib_verbs_qpreg { - pthread_mutex_t lock; - int32_t count; - struct _qpent { - struct _qpent *next, *prev; - int32_t qp_num; - ib_verbs_peer_t *peer; - } ents[42]; -}; -typedef struct _ib_verbs_qpreg ib_verbs_qpreg_t; - -/* context per device, stored in global glusterfs_ctx_t->ib */ -struct _ib_verbs_device { - struct _ib_verbs_device *next; - const char *device_name; - struct ibv_context *context; - int32_t port; - struct ibv_pd *pd; - struct ibv_srq *srq; - ib_verbs_qpreg_t qpreg; - struct ibv_comp_channel *send_chan, *recv_chan; - struct ibv_cq *send_cq, *recv_cq; - ib_verbs_queue_t sendq, recvq; - pthread_t send_thread, recv_thread; -}; -typedef struct _ib_verbs_device ib_verbs_device_t; - -typedef enum { - IB_VERBS_HANDSHAKE_START = 0, - IB_VERBS_HANDSHAKE_SENDING_DATA, - IB_VERBS_HANDSHAKE_RECEIVING_DATA, - IB_VERBS_HANDSHAKE_SENT_DATA, - IB_VERBS_HANDSHAKE_RECEIVED_DATA, - IB_VERBS_HANDSHAKE_SENDING_ACK, - IB_VERBS_HANDSHAKE_RECEIVING_ACK, - IB_VERBS_HANDSHAKE_RECEIVED_ACK, - IB_VERBS_HANDSHAKE_COMPLETE, -} ib_verbs_handshake_state_t; - -struct ib_verbs_nbio { - int state; - char *buf; - int count; - struct iovec vector; - struct iovec *pending_vector; - int pending_count; -}; - - -struct _ib_verbs_private { - int32_t sock; - int32_t idx; - unsigned char connected; - unsigned char tcp_connected; - unsigned char ib_connected; - in_addr_t addr; - unsigned short port; - - /* IB Verbs Driver specific variables, pointers */ - ib_verbs_peer_t peer; - ib_verbs_device_t *device; - ib_verbs_options_t options; - - /* Used by trans->op->receive */ - char *data_ptr; - int32_t data_offset; - int32_t data_len; - - /* Mutex */ - pthread_mutex_t read_mutex; - pthread_mutex_t write_mutex; - pthread_barrier_t handshake_barrier; - char handshake_ret; - - pthread_mutex_t recv_mutex; - pthread_cond_t recv_cond; - - /* used 
during ib_verbs_handshake */ - struct { - struct ib_verbs_nbio incoming; - struct ib_verbs_nbio outgoing; - int state; - ib_verbs_header_t header; - char *buf; - size_t size; - } handshake; -}; -typedef struct _ib_verbs_private ib_verbs_private_t; - -#endif /* _XPORT_IB_VERBS_H */ diff --git a/transport/ib-verbs/src/name.c b/transport/ib-verbs/src/name.c deleted file mode 100644 index a3e18481..00000000 --- a/transport/ib-verbs/src/name.c +++ /dev/null @@ -1,712 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . 
-*/ - -#include -#include -#include -#include -#include - -#ifdef CLIENT_PORT_CEILING -#undef CLIENT_PORT_CEILING -#endif - -#define CLIENT_PORT_CEILING 1024 - -#ifndef AF_INET_SDP -#define AF_INET_SDP 27 -#endif - -#include "transport.h" -#include "ib-verbs.h" - -int32_t -gf_resolve_ip6 (const char *hostname, - uint16_t port, - int family, - void **dnscache, - struct addrinfo **addr_info); - -static int32_t -af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr, - socklen_t sockaddr_len, int ceiling) -{ - int32_t ret = -1; - /* struct sockaddr_in sin = {0, }; */ - uint16_t port = ceiling - 1; - - while (port) - { - switch (sockaddr->sa_family) - { - case AF_INET6: - ((struct sockaddr_in6 *)sockaddr)->sin6_port = htons (port); - break; - - case AF_INET_SDP: - case AF_INET: - ((struct sockaddr_in *)sockaddr)->sin_port = htons (port); - break; - } - - ret = bind (fd, sockaddr, sockaddr_len); - - if (ret == 0) - break; - - if (ret == -1 && errno == EACCES) - break; - - port--; - } - - return ret; -} - -static int32_t -af_unix_client_bind (transport_t *this, - struct sockaddr *sockaddr, - socklen_t sockaddr_len, - int sock) -{ - data_t *path_data = NULL; - struct sockaddr_un *addr = NULL; - int32_t ret = -1; - - path_data = dict_get (this->xl->options, - "transport.ib-verbs.bind-path"); - if (path_data) { - char *path = data_to_str (path_data); - if (!path || strlen (path) > UNIX_PATH_MAX) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "transport.ib-verbs.bind-path not specfied " - "for unix socket, letting connect to assign " - "default value"); - goto err; - } - - addr = (struct sockaddr_un *) sockaddr; - strcpy (addr->sun_path, path); - ret = bind (sock, (struct sockaddr *)addr, sockaddr_len); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "cannot bind to unix-domain socket %d (%s)", - sock, strerror (errno)); - goto err; - } - } - -err: - return ret; -} - -static int32_t -client_fill_address_family (transport_t *this, struct sockaddr 
*sockaddr) -{ - data_t *address_family_data = NULL; - - address_family_data = dict_get (this->xl->options, - "transport.address-family"); - if (!address_family_data) { - data_t *remote_host_data = NULL, *connect_path_data = NULL; - remote_host_data = dict_get (this->xl->options, "remote-host"); - connect_path_data = dict_get (this->xl->options, - "transport.ib-verbs.connect-path"); - - if (!(remote_host_data || connect_path_data) || - (remote_host_data && connect_path_data)) { - gf_log (this->xl->name, GF_LOG_ERROR, - "address-family not specified and not able to " - "determine the same from other options " - "(remote-host:%s and connect-path:%s)", - data_to_str (remote_host_data), - data_to_str (connect_path_data)); - return -1; - } - - if (remote_host_data) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "address-family not specified, guessing it " - "to be inet/inet6"); - sockaddr->sa_family = AF_UNSPEC; - } else { - gf_log (this->xl->name, GF_LOG_DEBUG, - "address-family not specified, guessing it " - "to be unix"); - sockaddr->sa_family = AF_UNIX; - } - - } else { - char *address_family = data_to_str (address_family_data); - if (!strcasecmp (address_family, "unix")) { - sockaddr->sa_family = AF_UNIX; - } else if (!strcasecmp (address_family, "inet")) { - sockaddr->sa_family = AF_INET; - } else if (!strcasecmp (address_family, "inet6")) { - sockaddr->sa_family = AF_INET6; - } else if (!strcasecmp (address_family, "inet-sdp")) { - sockaddr->sa_family = AF_INET_SDP; - } else if (!strcasecmp (address_family, "inet/inet6") - || !strcasecmp (address_family, "inet6/inet")) { - sockaddr->sa_family = AF_UNSPEC; - } else { - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address-family (%s) specified", - address_family); - return -1; - } - } - - return 0; -} - -static int32_t -af_inet_client_get_remote_sockaddr (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len) -{ - dict_t *options = this->xl->options; - data_t *remote_host_data = NULL; - data_t 
*remote_port_data = NULL; - char *remote_host = NULL; - uint16_t remote_port = 0; - struct addrinfo *addr_info = NULL; - int32_t ret = 0; - - remote_host_data = dict_get (options, "remote-host"); - if (remote_host_data == NULL) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "option remote-host missing in volume %s", - this->xl->name); - ret = -1; - goto err; - } - - remote_host = data_to_str (remote_host_data); - if (remote_host == NULL) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "option remote-host has data NULL in volume %s", - this->xl->name); - ret = -1; - goto err; - } - - remote_port_data = dict_get (options, "remote-port"); - if (remote_port_data == NULL) - { - gf_log (this->xl->name, GF_LOG_DEBUG, - "option remote-port missing in volume %s. " - "Defaulting to %d", - this->xl->name, GF_DEFAULT_IBVERBS_LISTEN_PORT); - - remote_port = GF_DEFAULT_IBVERBS_LISTEN_PORT; - } - else - { - remote_port = data_to_uint16 (remote_port_data); - } - - if (remote_port == (uint16_t)-1) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "option remote-port has invalid port in volume %s", - this->xl->name); - ret = -1; - goto err; - } - - /* TODO: gf_resolve is a blocking call. 
kick in some - non blocking dns techniques */ - ret = gf_resolve_ip6 (remote_host, remote_port, - sockaddr->sa_family, - &this->dnscache, &addr_info); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "DNS resolution failed on host %s", remote_host); - goto err; - } - - memcpy (sockaddr, addr_info->ai_addr, addr_info->ai_addrlen); - *sockaddr_len = addr_info->ai_addrlen; - -err: - return ret; -} - -static int32_t -af_unix_client_get_remote_sockaddr (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len) -{ - struct sockaddr_un *sockaddr_un = NULL; - char *connect_path = NULL; - data_t *connect_path_data = NULL; - int32_t ret = 0; - - connect_path_data = dict_get (this->xl->options, - "transport.ib-verbs.connect-path"); - if (!connect_path_data) { - gf_log (this->xl->name, GF_LOG_ERROR, - "option transport.ib-verbs.connect-path not " - "specified for address-family unix"); - ret = -1; - goto err; - } - - connect_path = data_to_str (connect_path_data); - if (!connect_path) { - gf_log (this->xl->name, GF_LOG_ERROR, - "connect-path is null-string"); - ret = -1; - goto err; - } - - if (strlen (connect_path) > UNIX_PATH_MAX) { - gf_log (this->xl->name, GF_LOG_ERROR, - "connect-path value length %"GF_PRI_SIZET" > " - "%d octets", strlen (connect_path), UNIX_PATH_MAX); - ret = -1; - goto err; - } - - gf_log (this->xl->name, - GF_LOG_DEBUG, - "using connect-path %s", connect_path); - sockaddr_un = (struct sockaddr_un *)sockaddr; - strcpy (sockaddr_un->sun_path, connect_path); - *sockaddr_len = sizeof (struct sockaddr_un); - -err: - return ret; -} - -static int32_t -af_unix_server_get_local_sockaddr (transport_t *this, - struct sockaddr *addr, - socklen_t *addr_len) -{ - data_t *listen_path_data = NULL; - char *listen_path = NULL; - int32_t ret = 0; - struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; - - - listen_path_data = dict_get (this->xl->options, - "transport.ib-verbs.listen-path"); - if (!listen_path_data) { - gf_log 
(this->xl->name, GF_LOG_ERROR, - "missing option listen-path"); - ret = -1; - goto err; - } - - listen_path = data_to_str (listen_path_data); - -#ifndef UNIX_PATH_MAX -#define UNIX_PATH_MAX 108 -#endif - - if (strlen (listen_path) > UNIX_PATH_MAX) { - gf_log (this->xl->name, GF_LOG_ERROR, - "option listen-path has value length %"GF_PRI_SIZET" > %d", - strlen (listen_path), UNIX_PATH_MAX); - ret = -1; - goto err; - } - - sunaddr->sun_family = AF_UNIX; - strcpy (sunaddr->sun_path, listen_path); - *addr_len = sizeof (struct sockaddr_un); - -err: - return ret; -} - -static int32_t -af_inet_server_get_local_sockaddr (transport_t *this, - struct sockaddr *addr, - socklen_t *addr_len) -{ - struct addrinfo hints, *res = 0; - data_t *listen_port_data = NULL, *listen_host_data = NULL; - uint16_t listen_port = -1; - char service[NI_MAXSERV], *listen_host = NULL; - dict_t *options = NULL; - int32_t ret = 0; - - options = this->xl->options; - - listen_port_data = dict_get (options, "transport.ib-verbs.listen-port"); - listen_host_data = dict_get (options, "transport.ib-verbs.bind-address"); - - if (listen_port_data) - { - listen_port = data_to_uint16 (listen_port_data); - } else { - if (addr->sa_family == AF_INET6) { - struct sockaddr_in6 *in = (struct sockaddr_in6 *) addr; - in->sin6_addr = in6addr_any; - in->sin6_port = htons(listen_port); - *addr_len = sizeof(struct sockaddr_in6); - goto out; - } else if (addr->sa_family == AF_INET) { - struct sockaddr_in *in = (struct sockaddr_in *) addr; - in->sin_addr.s_addr = htonl(INADDR_ANY); - in->sin_port = htons(listen_port); - *addr_len = sizeof(struct sockaddr_in); - goto out; - } - } - - if (listen_port == (uint16_t) -1) - listen_port = GF_DEFAULT_IBVERBS_LISTEN_PORT; - - - if (listen_host_data) - { - listen_host = data_to_str (listen_host_data); - } - - memset (service, 0, sizeof (service)); - sprintf (service, "%d", listen_port); - - memset (&hints, 0, sizeof (hints)); - hints.ai_family = addr->sa_family; - hints.ai_socktype = 
SOCK_STREAM; - hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE; - - ret = getaddrinfo(listen_host, service, &hints, &res); - if (ret != 0) { - gf_log (this->xl->name, - GF_LOG_ERROR, - "getaddrinfo failed for host %s, service %s (%s)", - listen_host, service, gai_strerror (ret)); - ret = -1; - goto out; - } - - memcpy (addr, res->ai_addr, res->ai_addrlen); - *addr_len = res->ai_addrlen; - - freeaddrinfo (res); - -out: - return ret; -} - -int32_t -client_bind (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len, - int sock) -{ - int ret = 0; - - *sockaddr_len = sizeof (struct sockaddr_in6); - switch (sockaddr->sa_family) - { - case AF_INET_SDP: - case AF_INET: - *sockaddr_len = sizeof (struct sockaddr_in); - - case AF_INET6: - ret = af_inet_bind_to_port_lt_ceiling (sock, sockaddr, - *sockaddr_len, - CLIENT_PORT_CEILING); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_WARNING, - "cannot bind inet socket (%d) to port " - "less than %d (%s)", - sock, CLIENT_PORT_CEILING, strerror (errno)); - ret = 0; - } - break; - - case AF_UNIX: - *sockaddr_len = sizeof (struct sockaddr_un); - ret = af_unix_client_bind (this, (struct sockaddr *)sockaddr, - *sockaddr_len, sock); - break; - - default: - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address family %d", sockaddr->sa_family); - ret = -1; - break; - } - - return ret; -} - -int32_t -ibverbs_client_get_remote_sockaddr (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len) -{ - int32_t ret = 0; - char is_inet_sdp = 0; - - ret = client_fill_address_family (this, sockaddr); - if (ret) { - ret = -1; - goto err; - } - - switch (sockaddr->sa_family) - { - case AF_INET_SDP: - sockaddr->sa_family = AF_INET; - is_inet_sdp = 1; - - case AF_INET: - case AF_INET6: - case AF_UNSPEC: - ret = af_inet_client_get_remote_sockaddr (this, - sockaddr, - sockaddr_len); - - if (is_inet_sdp) { - sockaddr->sa_family = AF_INET_SDP; - } - - break; - - case AF_UNIX: - ret = 
af_unix_client_get_remote_sockaddr (this, - sockaddr, - sockaddr_len); - break; - - default: - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address-family %d", sockaddr->sa_family); - ret = -1; - } - -err: - return ret; -} - -int32_t -ibverbs_server_get_local_sockaddr (transport_t *this, - struct sockaddr *addr, - socklen_t *addr_len) -{ - data_t *address_family_data = NULL; - int32_t ret = 0; - char is_inet_sdp = 0; - - address_family_data = dict_get (this->xl->options, - "transport.address-family"); - if (address_family_data) { - char *address_family = NULL; - address_family = data_to_str (address_family_data); - - if (!strcasecmp (address_family, "inet")) { - addr->sa_family = AF_INET; - } else if (!strcasecmp (address_family, "inet6")) { - addr->sa_family = AF_INET6; - } else if (!strcasecmp (address_family, "inet-sdp")) { - addr->sa_family = AF_INET_SDP; - } else if (!strcasecmp (address_family, "unix")) { - addr->sa_family = AF_UNIX; - } else if (!strcasecmp (address_family, "inet/inet6") - || !strcasecmp (address_family, "inet6/inet")) { - addr->sa_family = AF_UNSPEC; - } else { - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address family (%s) specified", - address_family); - ret = -1; - goto err; - } - } else { - gf_log (this->xl->name, GF_LOG_DEBUG, - "option address-family not specified, defaulting " - "to inet/inet6"); - addr->sa_family = AF_UNSPEC; - } - - switch (addr->sa_family) - { - case AF_INET_SDP: - is_inet_sdp = 1; - addr->sa_family = AF_INET; - - case AF_INET: - case AF_INET6: - case AF_UNSPEC: - ret = af_inet_server_get_local_sockaddr (this, addr, addr_len); - if (is_inet_sdp && !ret) { - addr->sa_family = AF_INET_SDP; - } - break; - - case AF_UNIX: - ret = af_unix_server_get_local_sockaddr (this, addr, addr_len); - break; - } - -err: - return ret; -} - -int32_t -fill_inet6_inet_identifiers (transport_t *this, struct sockaddr_storage *addr, - int32_t addr_len, char *identifier) -{ - int32_t ret = 0, tmpaddr_len = 0; - char 
service[NI_MAXSERV], host[NI_MAXHOST]; - struct sockaddr_storage tmpaddr; - - memset (&tmpaddr, 0, sizeof (tmpaddr)); - tmpaddr = *addr; - tmpaddr_len = addr_len; - - if (((struct sockaddr *) &tmpaddr)->sa_family == AF_INET6) { - int32_t one_to_four, four_to_eight, twelve_to_sixteen; - int16_t eight_to_ten, ten_to_twelve; - - one_to_four = four_to_eight = twelve_to_sixteen = 0; - eight_to_ten = ten_to_twelve = 0; - - one_to_four = ((struct sockaddr_in6 *) - &tmpaddr)->sin6_addr.s6_addr32[0]; - four_to_eight = ((struct sockaddr_in6 *) - &tmpaddr)->sin6_addr.s6_addr32[1]; -#ifdef GF_SOLARIS_HOST_OS - eight_to_ten = S6_ADDR16(((struct sockaddr_in6 *) - &tmpaddr)->sin6_addr)[4]; -#else - eight_to_ten = ((struct sockaddr_in6 *) - &tmpaddr)->sin6_addr.s6_addr16[4]; -#endif - -#ifdef GF_SOLARIS_HOST_OS - ten_to_twelve = S6_ADDR16(((struct sockaddr_in6 *) - &tmpaddr)->sin6_addr)[5]; -#else - ten_to_twelve = ((struct sockaddr_in6 *) - &tmpaddr)->sin6_addr.s6_addr16[5]; -#endif - twelve_to_sixteen = ((struct sockaddr_in6 *) - &tmpaddr)->sin6_addr.s6_addr32[3]; - - /* ipv4 mapped ipv6 address has - bits 0-80: 0 - bits 80-96: 0xffff - bits 96-128: ipv4 address - */ - - if (one_to_four == 0 && - four_to_eight == 0 && - eight_to_ten == 0 && - ten_to_twelve == -1) { - struct sockaddr_in *in_ptr = (struct sockaddr_in *)&tmpaddr; - memset (&tmpaddr, 0, sizeof (tmpaddr)); - - in_ptr->sin_family = AF_INET; - in_ptr->sin_port = ((struct sockaddr_in6 *)addr)->sin6_port; - in_ptr->sin_addr.s_addr = twelve_to_sixteen; - tmpaddr_len = sizeof (*in_ptr); - } - } - - ret = getnameinfo ((struct sockaddr *) &tmpaddr, - tmpaddr_len, - host, sizeof (host), - service, sizeof (service), - NI_NUMERICHOST | NI_NUMERICSERV); - if (ret != 0) { - gf_log (this->xl->name, - GF_LOG_ERROR, - "getnameinfo failed (%s)", gai_strerror (ret)); - } - - sprintf (identifier, "%s:%s", host, service); - - return ret; -} - -int32_t -get_transport_identifiers (transport_t *this) -{ - int32_t ret = 0; - char 
is_inet_sdp = 0; - - switch (((struct sockaddr *) &this->myinfo.sockaddr)->sa_family) - { - case AF_INET_SDP: - is_inet_sdp = 1; - ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET; - - case AF_INET: - case AF_INET6: - { - ret = fill_inet6_inet_identifiers (this, - &this->myinfo.sockaddr, - this->myinfo.sockaddr_len, - this->myinfo.identifier); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "can't fill inet/inet6 identifier for server"); - goto err; - } - - ret = fill_inet6_inet_identifiers (this, - &this->peerinfo.sockaddr, - this->peerinfo.sockaddr_len, - this->peerinfo.identifier); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "can't fill inet/inet6 identifier for client"); - goto err; - } - - if (is_inet_sdp) { - ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET_SDP; - } - } - break; - - case AF_UNIX: - { - struct sockaddr_un *sunaddr = NULL; - - sunaddr = (struct sockaddr_un *) &this->myinfo.sockaddr; - strcpy (this->myinfo.identifier, sunaddr->sun_path); - - sunaddr = (struct sockaddr_un *) &this->peerinfo.sockaddr; - strcpy (this->peerinfo.identifier, sunaddr->sun_path); - } - break; - - default: - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address family (%d)", - ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family); - ret = -1; - break; - } - -err: - return ret; -} diff --git a/transport/ib-verbs/src/name.h b/transport/ib-verbs/src/name.h deleted file mode 100644 index 4f0f4771..00000000 --- a/transport/ib-verbs/src/name.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. - This file is part of GlusterFS. 
- - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - -#ifndef _IB_VERBS_NAME_H -#define _IB_VERBS_NAME_H - -#include -#include - -#include "compat.h" - -int32_t -client_bind (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len, - int sock); - -int32_t -ibverbs_client_get_remote_sockaddr (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len); - -int32_t -ibverbs_server_get_local_sockaddr (transport_t *this, - struct sockaddr *addr, - socklen_t *addr_len); - -int32_t -get_transport_identifiers (transport_t *this); - -#endif /* _IB_VERBS_NAME_H */ diff --git a/transport/socket/Makefile.am b/transport/socket/Makefile.am deleted file mode 100644 index f963effe..00000000 --- a/transport/socket/Makefile.am +++ /dev/null @@ -1 +0,0 @@ -SUBDIRS = src \ No newline at end of file diff --git a/transport/socket/src/Makefile.am b/transport/socket/src/Makefile.am deleted file mode 100644 index 1832587a..00000000 --- a/transport/socket/src/Makefile.am +++ /dev/null @@ -1,14 +0,0 @@ -noinst_HEADERS = socket.h name.h - -transport_LTLIBRARIES = socket.la -transportdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/transport - -socket_la_LDFLAGS = -module -avoidversion - -socket_la_SOURCES = socket.c name.c -socket_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la - -AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ - -I$(top_srcdir)/libglusterfs/src -shared 
-nostartfiles $(GF_CFLAGS) - -CLEANFILES = *~ diff --git a/transport/socket/src/name.c b/transport/socket/src/name.c deleted file mode 100644 index 120a669c..00000000 --- a/transport/socket/src/name.c +++ /dev/null @@ -1,737 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - -#include -#include -#include -#include -#include -#include - -#ifdef CLIENT_PORT_CEILING -#undef CLIENT_PORT_CEILING -#endif - -#define CLIENT_PORT_CEILING 1024 - -#ifndef AF_INET_SDP -#define AF_INET_SDP 27 -#endif - -#include "transport.h" -#include "socket.h" - -int32_t -gf_resolve_ip6 (const char *hostname, - uint16_t port, - int family, - void **dnscache, - struct addrinfo **addr_info); - -static int32_t -af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr, - socklen_t sockaddr_len, int ceiling) -{ - int32_t ret = -1; - /* struct sockaddr_in sin = {0, }; */ - uint16_t port = ceiling - 1; - - while (port) - { - switch (sockaddr->sa_family) - { - case AF_INET6: - ((struct sockaddr_in6 *)sockaddr)->sin6_port = htons (port); - break; - - case AF_INET_SDP: - case AF_INET: - ((struct sockaddr_in *)sockaddr)->sin_port = htons (port); - break; - } - - ret = bind (fd, sockaddr, sockaddr_len); - - if (ret == 0) - break; - - if (ret == -1 && errno == EACCES) - break; - - port--; - } - - return ret; -} - -static int32_t -af_unix_client_bind (transport_t 
*this, - struct sockaddr *sockaddr, - socklen_t sockaddr_len, - int sock) -{ - data_t *path_data = NULL; - struct sockaddr_un *addr = NULL; - int32_t ret = 0; - - path_data = dict_get (this->xl->options, "transport.socket.bind-path"); - if (path_data) { - char *path = data_to_str (path_data); - if (!path || strlen (path) > UNIX_PATH_MAX) { - gf_log (this->xl->name, GF_LOG_TRACE, - "bind-path not specfied for unix socket, " - "letting connect to assign default value"); - goto err; - } - - addr = (struct sockaddr_un *) sockaddr; - strcpy (addr->sun_path, path); - ret = bind (sock, (struct sockaddr *)addr, sockaddr_len); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "cannot bind to unix-domain socket %d (%s)", - sock, strerror (errno)); - goto err; - } - } else { - gf_log (this->xl->name, GF_LOG_TRACE, - "bind-path not specfied for unix socket, " - "letting connect to assign default value"); - } - -err: - return ret; -} - -int32_t -client_fill_address_family (transport_t *this, sa_family_t *sa_family) -{ - data_t *address_family_data = NULL; - int32_t ret = -1; - - if (sa_family == NULL) { - goto out; - } - - address_family_data = dict_get (this->xl->options, - "transport.address-family"); - if (!address_family_data) { - data_t *remote_host_data = NULL, *connect_path_data = NULL; - remote_host_data = dict_get (this->xl->options, "remote-host"); - connect_path_data = dict_get (this->xl->options, - "transport.socket.connect-path"); - - if (!(remote_host_data || connect_path_data) || - (remote_host_data && connect_path_data)) { - gf_log (this->xl->name, GF_LOG_ERROR, - "transport.address-family not specified and " - "not able to determine the " - "same from other options (remote-host:%s and " - "transport.unix.connect-path:%s)", - data_to_str (remote_host_data), - data_to_str (connect_path_data)); - goto out; - } - - if (remote_host_data) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "address-family not specified, guessing it " - "to be inet/inet6"); - 
*sa_family = AF_UNSPEC; - } else { - gf_log (this->xl->name, GF_LOG_DEBUG, - "address-family not specified, guessing it " - "to be unix"); - *sa_family = AF_UNIX; - } - - } else { - char *address_family = data_to_str (address_family_data); - if (!strcasecmp (address_family, "unix")) { - *sa_family = AF_UNIX; - } else if (!strcasecmp (address_family, "inet")) { - *sa_family = AF_INET; - } else if (!strcasecmp (address_family, "inet6")) { - *sa_family = AF_INET6; - } else if (!strcasecmp (address_family, "inet-sdp")) { - *sa_family = AF_INET_SDP; - } else if (!strcasecmp (address_family, "inet/inet6") - || !strcasecmp (address_family, "inet6/inet")) { - *sa_family = AF_UNSPEC; - } else { - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address-family (%s) specified", - address_family); - goto out; - } - } - - ret = 0; - -out: - return ret; -} - -static int32_t -af_inet_client_get_remote_sockaddr (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len) -{ - dict_t *options = this->xl->options; - data_t *remote_host_data = NULL; - data_t *remote_port_data = NULL; - char *remote_host = NULL; - uint16_t remote_port = 0; - struct addrinfo *addr_info = NULL; - int32_t ret = 0; - - remote_host_data = dict_get (options, "remote-host"); - if (remote_host_data == NULL) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "option remote-host missing in volume %s", this->xl->name); - ret = -1; - goto err; - } - - remote_host = data_to_str (remote_host_data); - if (remote_host == NULL) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "option remote-host has data NULL in volume %s", this->xl->name); - ret = -1; - goto err; - } - - remote_port_data = dict_get (options, "remote-port"); - if (remote_port_data == NULL) - { - gf_log (this->xl->name, GF_LOG_TRACE, - "option remote-port missing in volume %s. 
Defaulting to %d", - this->xl->name, GF_DEFAULT_SOCKET_LISTEN_PORT); - - remote_port = GF_DEFAULT_SOCKET_LISTEN_PORT; - } - else - { - remote_port = data_to_uint16 (remote_port_data); - } - - if (remote_port == (uint16_t)-1) - { - gf_log (this->xl->name, GF_LOG_ERROR, - "option remote-port has invalid port in volume %s", - this->xl->name); - ret = -1; - goto err; - } - - /* TODO: gf_resolve is a blocking call. kick in some - non blocking dns techniques */ - ret = gf_resolve_ip6 (remote_host, remote_port, - sockaddr->sa_family, &this->dnscache, &addr_info); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "DNS resolution failed on host %s", remote_host); - goto err; - } - - memcpy (sockaddr, addr_info->ai_addr, addr_info->ai_addrlen); - *sockaddr_len = addr_info->ai_addrlen; - -err: - return ret; -} - -static int32_t -af_unix_client_get_remote_sockaddr (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len) -{ - struct sockaddr_un *sockaddr_un = NULL; - char *connect_path = NULL; - data_t *connect_path_data = NULL; - int32_t ret = 0; - - connect_path_data = dict_get (this->xl->options, - "transport.socket.connect-path"); - if (!connect_path_data) { - gf_log (this->xl->name, GF_LOG_ERROR, - "option transport.unix.connect-path not specified for " - "address-family unix"); - ret = -1; - goto err; - } - - connect_path = data_to_str (connect_path_data); - if (!connect_path) { - gf_log (this->xl->name, GF_LOG_ERROR, - "transport.unix.connect-path is null-string"); - ret = -1; - goto err; - } - - if (strlen (connect_path) > UNIX_PATH_MAX) { - gf_log (this->xl->name, GF_LOG_ERROR, - "connect-path value length %"GF_PRI_SIZET" > %d octets", - strlen (connect_path), UNIX_PATH_MAX); - ret = -1; - goto err; - } - - gf_log (this->xl->name, GF_LOG_TRACE, - "using connect-path %s", connect_path); - sockaddr_un = (struct sockaddr_un *)sockaddr; - strcpy (sockaddr_un->sun_path, connect_path); - *sockaddr_len = sizeof (struct sockaddr_un); - -err: - 
return ret; -} - -static int32_t -af_unix_server_get_local_sockaddr (transport_t *this, - struct sockaddr *addr, - socklen_t *addr_len) -{ - data_t *listen_path_data = NULL; - char *listen_path = NULL; - int32_t ret = 0; - struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; - - - listen_path_data = dict_get (this->xl->options, - "transport.socket.listen-path"); - if (!listen_path_data) { - gf_log (this->xl->name, GF_LOG_ERROR, - "missing option transport.socket.listen-path"); - ret = -1; - goto err; - } - - listen_path = data_to_str (listen_path_data); - -#ifndef UNIX_PATH_MAX -#define UNIX_PATH_MAX 108 -#endif - - if (strlen (listen_path) > UNIX_PATH_MAX) { - gf_log (this->xl->name, GF_LOG_ERROR, - "option transport.unix.listen-path has value length " - "%"GF_PRI_SIZET" > %d", - strlen (listen_path), UNIX_PATH_MAX); - ret = -1; - goto err; - } - - sunaddr->sun_family = AF_UNIX; - strcpy (sunaddr->sun_path, listen_path); - *addr_len = sizeof (struct sockaddr_un); - -err: - return ret; -} - -static int32_t -af_inet_server_get_local_sockaddr (transport_t *this, - struct sockaddr *addr, - socklen_t *addr_len) -{ - struct addrinfo hints, *res = 0; - data_t *listen_port_data = NULL, *listen_host_data = NULL; - uint16_t listen_port = -1; - char service[NI_MAXSERV], *listen_host = NULL; - dict_t *options = NULL; - int32_t ret = 0; - - options = this->xl->options; - - listen_port_data = dict_get (options, "transport.socket.listen-port"); - listen_host_data = dict_get (options, "transport.socket.bind-address"); - - if (listen_port_data) - { - listen_port = data_to_uint16 (listen_port_data); - } - - if (listen_port == (uint16_t) -1) - listen_port = GF_DEFAULT_SOCKET_LISTEN_PORT; - - - if (listen_host_data) - { - listen_host = data_to_str (listen_host_data); - } else { - if (addr->sa_family == AF_INET6) { - struct sockaddr_in6 *in = (struct sockaddr_in6 *) addr; - in->sin6_addr = in6addr_any; - in->sin6_port = htons(listen_port); - *addr_len = sizeof(struct 
sockaddr_in6); - goto out; - } else if (addr->sa_family == AF_INET) { - struct sockaddr_in *in = (struct sockaddr_in *) addr; - in->sin_addr.s_addr = htonl(INADDR_ANY); - in->sin_port = htons(listen_port); - *addr_len = sizeof(struct sockaddr_in); - goto out; - } - } - - memset (service, 0, sizeof (service)); - sprintf (service, "%d", listen_port); - - memset (&hints, 0, sizeof (hints)); - hints.ai_family = addr->sa_family; - hints.ai_socktype = SOCK_STREAM; - hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE; - - ret = getaddrinfo(listen_host, service, &hints, &res); - if (ret != 0) { - gf_log (this->xl->name, GF_LOG_ERROR, - "getaddrinfo failed for host %s, service %s (%s)", - listen_host, service, gai_strerror (ret)); - ret = -1; - goto out; - } - - memcpy (addr, res->ai_addr, res->ai_addrlen); - *addr_len = res->ai_addrlen; - - freeaddrinfo (res); - -out: - return ret; -} - -int32_t -client_bind (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len, - int sock) -{ - int ret = 0; - - *sockaddr_len = sizeof (struct sockaddr_in6); - switch (sockaddr->sa_family) - { - case AF_INET_SDP: - case AF_INET: - *sockaddr_len = sizeof (struct sockaddr_in); - - case AF_INET6: - ret = af_inet_bind_to_port_lt_ceiling (sock, sockaddr, - *sockaddr_len, CLIENT_PORT_CEILING); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_WARNING, - "cannot bind inet socket (%d) to port less than %d (%s)", - sock, CLIENT_PORT_CEILING, strerror (errno)); - ret = 0; - } - break; - - case AF_UNIX: - *sockaddr_len = sizeof (struct sockaddr_un); - ret = af_unix_client_bind (this, (struct sockaddr *)sockaddr, - *sockaddr_len, sock); - break; - - default: - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address family %d", sockaddr->sa_family); - ret = -1; - break; - } - - return ret; -} - -int32_t -socket_client_get_remote_sockaddr (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len, - sa_family_t *sa_family) -{ - int32_t ret = 0; - - if ((sockaddr == NULL) 
|| (sockaddr_len == NULL) - || (sa_family == NULL)) { - ret = -1; - goto err; - } - - - ret = client_fill_address_family (this, &sockaddr->sa_family); - if (ret) { - ret = -1; - goto err; - } - - *sa_family = sockaddr->sa_family; - - switch (sockaddr->sa_family) - { - case AF_INET_SDP: - sockaddr->sa_family = AF_INET; - - case AF_INET: - case AF_INET6: - case AF_UNSPEC: - ret = af_inet_client_get_remote_sockaddr (this, sockaddr, - sockaddr_len); - break; - - case AF_UNIX: - ret = af_unix_client_get_remote_sockaddr (this, sockaddr, - sockaddr_len); - break; - - default: - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address-family %d", sockaddr->sa_family); - ret = -1; - } - - if (*sa_family == AF_UNSPEC) { - *sa_family = sockaddr->sa_family; - } - -err: - return ret; -} - - -int32_t -server_fill_address_family (transport_t *this, sa_family_t *sa_family) -{ - data_t *address_family_data = NULL; - int32_t ret = -1; - - if (sa_family == NULL) { - goto out; - } - - address_family_data = dict_get (this->xl->options, - "transport.address-family"); - if (address_family_data) { - char *address_family = NULL; - address_family = data_to_str (address_family_data); - - if (!strcasecmp (address_family, "inet")) { - *sa_family = AF_INET; - } else if (!strcasecmp (address_family, "inet6")) { - *sa_family = AF_INET6; - } else if (!strcasecmp (address_family, "inet-sdp")) { - *sa_family = AF_INET_SDP; - } else if (!strcasecmp (address_family, "unix")) { - *sa_family = AF_UNIX; - } else if (!strcasecmp (address_family, "inet/inet6") - || !strcasecmp (address_family, "inet6/inet")) { - *sa_family = AF_UNSPEC; - } else { - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address family (%s) specified", address_family); - goto out; - } - } else { - gf_log (this->xl->name, GF_LOG_DEBUG, - "option address-family not specified, defaulting to inet/inet6"); - *sa_family = AF_UNSPEC; - } - - ret = 0; -out: - return ret; -} - - -int32_t -socket_server_get_local_sockaddr (transport_t 
*this, struct sockaddr *addr, - socklen_t *addr_len, sa_family_t *sa_family) -{ - int32_t ret = -1; - - if ((addr == NULL) || (addr_len == NULL) || (sa_family == NULL)) { - goto err; - } - - ret = server_fill_address_family (this, &addr->sa_family); - if (ret == -1) { - goto err; - } - - *sa_family = addr->sa_family; - - switch (addr->sa_family) - { - case AF_INET_SDP: - addr->sa_family = AF_INET; - - case AF_INET: - case AF_INET6: - case AF_UNSPEC: - ret = af_inet_server_get_local_sockaddr (this, addr, addr_len); - break; - - case AF_UNIX: - ret = af_unix_server_get_local_sockaddr (this, addr, addr_len); - break; - } - - if (*sa_family == AF_UNSPEC) { - *sa_family = addr->sa_family; - } - -err: - return ret; -} - -int32_t -fill_inet6_inet_identifiers (transport_t *this, struct sockaddr_storage *addr, - int32_t addr_len, char *identifier) -{ - int32_t ret = 0, tmpaddr_len = 0; - char service[NI_MAXSERV], host[NI_MAXHOST]; - struct sockaddr_storage tmpaddr; - - memset (&tmpaddr, 0, sizeof (tmpaddr)); - tmpaddr = *addr; - tmpaddr_len = addr_len; - - if (((struct sockaddr *) &tmpaddr)->sa_family == AF_INET6) { - int32_t one_to_four, four_to_eight, twelve_to_sixteen; - int16_t eight_to_ten, ten_to_twelve; - - one_to_four = four_to_eight = twelve_to_sixteen = 0; - eight_to_ten = ten_to_twelve = 0; - - one_to_four = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[0]; - four_to_eight = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[1]; -#ifdef GF_SOLARIS_HOST_OS - eight_to_ten = S6_ADDR16(((struct sockaddr_in6 *) &tmpaddr)->sin6_addr)[4]; -#else - eight_to_ten = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr16[4]; -#endif - -#ifdef GF_SOLARIS_HOST_OS - ten_to_twelve = S6_ADDR16(((struct sockaddr_in6 *) &tmpaddr)->sin6_addr)[5]; -#else - ten_to_twelve = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr16[5]; -#endif - - twelve_to_sixteen = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[3]; - - /* ipv4 mapped ipv6 address has - bits 
0-80: 0 - bits 80-96: 0xffff - bits 96-128: ipv4 address - */ - - if (one_to_four == 0 && - four_to_eight == 0 && - eight_to_ten == 0 && - ten_to_twelve == -1) { - struct sockaddr_in *in_ptr = (struct sockaddr_in *)&tmpaddr; - memset (&tmpaddr, 0, sizeof (tmpaddr)); - - in_ptr->sin_family = AF_INET; - in_ptr->sin_port = ((struct sockaddr_in6 *)addr)->sin6_port; - in_ptr->sin_addr.s_addr = twelve_to_sixteen; - tmpaddr_len = sizeof (*in_ptr); - } - } - - ret = getnameinfo ((struct sockaddr *) &tmpaddr, - tmpaddr_len, - host, sizeof (host), - service, sizeof (service), - NI_NUMERICHOST | NI_NUMERICSERV); - if (ret != 0) { - gf_log (this->xl->name, GF_LOG_ERROR, - "getnameinfo failed (%s)", gai_strerror (ret)); - } - - sprintf (identifier, "%s:%s", host, service); - - return ret; -} - -int32_t -get_transport_identifiers (transport_t *this) -{ - int32_t ret = 0; - char is_inet_sdp = 0; - - switch (((struct sockaddr *) &this->myinfo.sockaddr)->sa_family) - { - case AF_INET_SDP: - is_inet_sdp = 1; - ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET; - - case AF_INET: - case AF_INET6: - { - ret = fill_inet6_inet_identifiers (this, - &this->myinfo.sockaddr, - this->myinfo.sockaddr_len, - this->myinfo.identifier); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "cannot fill inet/inet6 identifier for server"); - goto err; - } - - ret = fill_inet6_inet_identifiers (this, - &this->peerinfo.sockaddr, - this->peerinfo.sockaddr_len, - this->peerinfo.identifier); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "cannot fill inet/inet6 identifier for client"); - goto err; - } - - if (is_inet_sdp) { - ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET_SDP; - } - } - break; - - case AF_UNIX: - { - struct sockaddr_un *sunaddr = NULL; - - sunaddr = (struct sockaddr_un *) &this->myinfo.sockaddr; - strcpy 
(this->myinfo.identifier, sunaddr->sun_path); - - sunaddr = (struct sockaddr_un *) &this->peerinfo.sockaddr; - strcpy (this->peerinfo.identifier, sunaddr->sun_path); - } - break; - - default: - gf_log (this->xl->name, GF_LOG_ERROR, - "unknown address family (%d)", - ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family); - ret = -1; - break; - } - -err: - return ret; -} diff --git a/transport/socket/src/name.h b/transport/socket/src/name.h deleted file mode 100644 index f50a7b7f..00000000 --- a/transport/socket/src/name.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . 
-*/ - -#ifndef _SOCKET_NAME_H -#define _SOCKET_NAME_H - -#include "compat.h" - -int32_t -client_bind (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len, - int sock); - -int32_t -socket_client_get_remote_sockaddr (transport_t *this, - struct sockaddr *sockaddr, - socklen_t *sockaddr_len, - sa_family_t *sa_family); - -int32_t -socket_server_get_local_sockaddr (transport_t *this, struct sockaddr *addr, - socklen_t *addr_len, sa_family_t *sa_family); - -int32_t -get_transport_identifiers (transport_t *this); - -#endif /* _SOCKET_NAME_H */ diff --git a/transport/socket/src/socket-mem-types.h b/transport/socket/src/socket-mem-types.h deleted file mode 100644 index f50f4a75..00000000 --- a/transport/socket/src/socket-mem-types.h +++ /dev/null @@ -1,36 +0,0 @@ - -/* - Copyright (c) 2008-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . 
-*/ - - -#ifndef __SOCKET_MEM_TYPES_H__ -#define __SOCKET_MEM_TYPES_H__ - -#include "mem-types.h" - -enum gf_socket_mem_types_ { - gf_socket_mt_socket_private_t = gf_common_mt_end + 1, - gf_socket_mt_ioq, - gf_socket_mt_transport_t, - gf_socket_mt_socket_local_t, - gf_socket_mt_char, - gf_socket_mt_end -}; -#endif - diff --git a/transport/socket/src/socket.c b/transport/socket/src/socket.c deleted file mode 100644 index 7f7f8093..00000000 --- a/transport/socket/src/socket.c +++ /dev/null @@ -1,1552 +0,0 @@ -/* - Copyright (c) 2008-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "socket.h" -#include "name.h" -#include "dict.h" -#include "transport.h" -#include "logging.h" -#include "xlator.h" -#include "byte-order.h" -#include "common-utils.h" -#include "compat-errno.h" - -#include -#include -#include - - -#define GF_LOG_ERRNO(errno) ((errno == ENOTCONN) ? 
GF_LOG_DEBUG : GF_LOG_ERROR) -#define SA(ptr) ((struct sockaddr *)ptr) - -int socket_init (transport_t *this); - -/* - * return value: - * 0 = success (completed) - * -1 = error - * > 0 = incomplete - */ - -int -__socket_rwv (transport_t *this, struct iovec *vector, int count, - struct iovec **pending_vector, int *pending_count, - int write) -{ - socket_private_t *priv = NULL; - int sock = -1; - int ret = -1; - struct iovec *opvector = NULL; - int opcount = 0; - int moved = 0; - - priv = this->private; - sock = priv->sock; - - opvector = vector; - opcount = count; - - while (opcount) { - if (write) { - ret = writev (sock, opvector, opcount); - - if (ret == 0 || (ret == -1 && errno == EAGAIN)) { - /* done for now */ - break; - } - } else { - ret = readv (sock, opvector, opcount); - - if (ret == -1 && errno == EAGAIN) { - /* done for now */ - break; - } - } - - if (ret == 0) { - /* Mostly due to 'umount' in client */ - gf_log (this->xl->name, GF_LOG_TRACE, - "EOF from peer %s", this->peerinfo.identifier); - opcount = -1; - errno = ENOTCONN; - break; - } - - if (ret == -1) { - if (errno == EINTR) - continue; - - gf_log (this->xl->name, GF_LOG_TRACE, - "%s failed (%s)", write ? 
"writev" : "readv", - strerror (errno)); - opcount = -1; - break; - } - - moved = 0; - - while (moved < ret) { - if ((ret - moved) >= opvector[0].iov_len) { - moved += opvector[0].iov_len; - opvector++; - opcount--; - } else { - opvector[0].iov_len -= (ret - moved); - opvector[0].iov_base += (ret - moved); - moved += (ret - moved); - } - while (opcount && !opvector[0].iov_len) { - opvector++; - opcount--; - } - } - } - - if (pending_vector) - *pending_vector = opvector; - - if (pending_count) - *pending_count = opcount; - - return opcount; -} - - -int -__socket_readv (transport_t *this, struct iovec *vector, int count, - struct iovec **pending_vector, int *pending_count) -{ - int ret = -1; - - ret = __socket_rwv (this, vector, count, - pending_vector, pending_count, 0); - - return ret; -} - - -int -__socket_writev (transport_t *this, struct iovec *vector, int count, - struct iovec **pending_vector, int *pending_count) -{ - int ret = -1; - - ret = __socket_rwv (this, vector, count, - pending_vector, pending_count, 1); - - return ret; -} - - -int -__socket_disconnect (transport_t *this) -{ - socket_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - if (priv->sock != -1) { - ret = shutdown (priv->sock, SHUT_RDWR); - priv->connected = -1; - gf_log (this->xl->name, GF_LOG_TRACE, - "shutdown() returned %d. 
set connection state to -1", - ret); - } - - return ret; -} - - -int -__socket_server_bind (transport_t *this) -{ - socket_private_t *priv = NULL; - int ret = -1; - int opt = 1; - - priv = this->private; - - ret = setsockopt (priv->sock, SOL_SOCKET, SO_REUSEADDR, - &opt, sizeof (opt)); - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "setsockopt() for SO_REUSEADDR failed (%s)", - strerror (errno)); - } - - ret = bind (priv->sock, (struct sockaddr *)&this->myinfo.sockaddr, - this->myinfo.sockaddr_len); - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "binding to %s failed: %s", - this->myinfo.identifier, strerror (errno)); - if (errno == EADDRINUSE) { - gf_log (this->xl->name, GF_LOG_ERROR, - "Port is already in use"); - } - } - - return ret; -} - - -int -__socket_nonblock (int fd) -{ - int flags = 0; - int ret = -1; - - flags = fcntl (fd, F_GETFL); - - if (flags != -1) - ret = fcntl (fd, F_SETFL, flags | O_NONBLOCK); - - return ret; -} - - -int -__socket_nodelay (int fd) -{ - int on = 1; - int ret = -1; - - ret = setsockopt (fd, IPPROTO_TCP, TCP_NODELAY, - &on, sizeof (on)); - if (!ret) - gf_log ("", GF_LOG_TRACE, - "NODELAY enabled for socket %d", fd); - - return ret; -} - -int -__socket_connect_finish (int fd) -{ - int ret = -1; - int optval = 0; - socklen_t optlen = sizeof (int); - - ret = getsockopt (fd, SOL_SOCKET, SO_ERROR, (void *)&optval, &optlen); - - if (ret == 0 && optval) { - errno = optval; - ret = -1; - } - - return ret; -} - - -void -__socket_reset (transport_t *this) -{ - socket_private_t *priv = NULL; - - priv = this->private; - - /* TODO: use mem-pool on incoming data */ - - if (priv->incoming.hdr_p) - GF_FREE (priv->incoming.hdr_p); - - if (priv->incoming.iobuf) - iobuf_unref (priv->incoming.iobuf); - - memset (&priv->incoming, 0, sizeof (priv->incoming)); - - event_unregister (this->xl->ctx->event_pool, priv->sock, priv->idx); - close (priv->sock); - priv->sock = -1; - priv->idx = -1; - priv->connected = -1; -} - - 
-struct ioq * -__socket_ioq_new (transport_t *this, char *buf, int len, - struct iovec *vector, int count, struct iobref *iobref) -{ - socket_private_t *priv = NULL; - struct ioq *entry = NULL; - - priv = this->private; - - /* TODO: use mem-pool */ - entry = GF_CALLOC (1, sizeof (*entry), - gf_common_mt_ioq); - if (!entry) - return NULL; - - assert (count <= (MAX_IOVEC-2)); - - entry->header.colonO[0] = ':'; - entry->header.colonO[1] = 'O'; - entry->header.colonO[2] = '\0'; - entry->header.version = 42; - entry->header.size1 = hton32 (len); - entry->header.size2 = hton32 (iov_length (vector, count)); - - entry->vector[0].iov_base = &entry->header; - entry->vector[0].iov_len = sizeof (entry->header); - entry->count++; - - entry->vector[1].iov_base = buf; - entry->vector[1].iov_len = len; - entry->count++; - - if (vector && count) { - memcpy (&entry->vector[2], vector, sizeof (*vector) * count); - entry->count += count; - } - - entry->pending_vector = entry->vector; - entry->pending_count = entry->count; - - if (iobref) - entry->iobref = iobref_ref (iobref); - - entry->buf = buf; - - INIT_LIST_HEAD (&entry->list); - - return entry; -} - - -void -__socket_ioq_entry_free (struct ioq *entry) -{ - list_del_init (&entry->list); - if (entry->iobref) - iobref_unref (entry->iobref); - - /* TODO: use mem-pool */ - GF_FREE (entry->buf); - - /* TODO: use mem-pool */ - GF_FREE (entry); -} - - -void -__socket_ioq_flush (transport_t *this) -{ - socket_private_t *priv = NULL; - struct ioq *entry = NULL; - - priv = this->private; - - while (!list_empty (&priv->ioq)) { - entry = priv->ioq_next; - __socket_ioq_entry_free (entry); - } - - return; -} - - -int -__socket_ioq_churn_entry (transport_t *this, struct ioq *entry) -{ - int ret = -1; - - ret = __socket_writev (this, entry->pending_vector, - entry->pending_count, - &entry->pending_vector, - &entry->pending_count); - - if (ret == 0) { - /* current entry was completely written */ - assert (entry->pending_count == 0); - 
__socket_ioq_entry_free (entry); - } - - return ret; -} - - -int -__socket_ioq_churn (transport_t *this) -{ - socket_private_t *priv = NULL; - int ret = 0; - struct ioq *entry = NULL; - - priv = this->private; - - while (!list_empty (&priv->ioq)) { - /* pick next entry */ - entry = priv->ioq_next; - - ret = __socket_ioq_churn_entry (this, entry); - - if (ret != 0) - break; - } - - if (list_empty (&priv->ioq)) { - /* all pending writes done, not interested in POLLOUT */ - priv->idx = event_select_on (this->xl->ctx->event_pool, - priv->sock, priv->idx, -1, 0); - } - - return ret; -} - - -int -socket_event_poll_err (transport_t *this) -{ - socket_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - pthread_mutex_lock (&priv->lock); - { - __socket_ioq_flush (this); - __socket_reset (this); - } - pthread_mutex_unlock (&priv->lock); - - xlator_notify (this->xl, GF_EVENT_POLLERR, this); - - return ret; -} - - -int -socket_event_poll_out (transport_t *this) -{ - socket_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - pthread_mutex_lock (&priv->lock); - { - if (priv->connected == 1) { - ret = __socket_ioq_churn (this); - - if (ret == -1) { - __socket_disconnect (this); - } - } - } - pthread_mutex_unlock (&priv->lock); - - xlator_notify (this->xl, GF_EVENT_POLLOUT, this); - - return ret; -} - - -int -__socket_proto_validate_header (transport_t *this, - struct socket_header *header, - size_t *size1_p, size_t *size2_p) -{ - size_t size1 = 0; - size_t size2 = 0; - - if (strcmp (header->colonO, ":O")) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "socket header signature does not match :O (%x.%x.%x)", - header->colonO[0], header->colonO[1], - header->colonO[2]); - return -1; - } - - if (header->version != 42) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "socket header version does not match 42 != %d", - header->version); - return -1; - } - - size1 = ntoh32 (header->size1); - size2 = ntoh32 (header->size2); - - if (size1 <= 0 || size1 > 1048576) 
{ - gf_log (this->xl->name, GF_LOG_DEBUG, - "socket header has incorrect size1=%"GF_PRI_SIZET, - size1); - return -1; - } - - if (size2 > (131072)) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "socket header has incorrect size2=%"GF_PRI_SIZET, - size2); - return -1; - } - - if (size1_p) - *size1_p = size1; - - if (size2_p) - *size2_p = size2; - - return 0; -} - - - -/* socket protocol state machine */ - -int -__socket_proto_state_machine (transport_t *this) -{ - int ret = -1; - socket_private_t *priv = NULL; - size_t size1 = 0; - size_t size2 = 0; - int previous_state = -1; - struct socket_header *hdr = NULL; - struct iobuf *iobuf = NULL; - - - priv = this->private; - - while (priv->incoming.state != SOCKET_PROTO_STATE_COMPLETE) { - /* debug check against infinite loops */ - if (previous_state == priv->incoming.state) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "state did not change! (%d) breaking", - previous_state); - ret = -1; - goto unlock; - } - previous_state = priv->incoming.state; - - switch (priv->incoming.state) { - - case SOCKET_PROTO_STATE_NADA: - priv->incoming.pending_vector = - priv->incoming.vector; - - priv->incoming.pending_vector->iov_base = - &priv->incoming.header; - - priv->incoming.pending_vector->iov_len = - sizeof (struct socket_header); - - priv->incoming.state = - SOCKET_PROTO_STATE_HEADER_COMING; - break; - - case SOCKET_PROTO_STATE_HEADER_COMING: - - ret = __socket_readv (this, - priv->incoming.pending_vector, 1, - &priv->incoming.pending_vector, - NULL); - if (ret == 0) { - priv->incoming.state = - SOCKET_PROTO_STATE_HEADER_CAME; - break; - } - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_TRACE, - "read (%s) in state %d (%s)", - strerror (errno), - SOCKET_PROTO_STATE_HEADER_COMING, - this->peerinfo.identifier); - goto unlock; - } - - if (ret > 0) { - gf_log (this->xl->name, GF_LOG_TRACE, - "partial header read on NB socket."); - goto unlock; - } - break; - - case SOCKET_PROTO_STATE_HEADER_CAME: - hdr = &priv->incoming.header; - 
ret = __socket_proto_validate_header (this, hdr, - &size1, &size2); - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "socket header validate failed (%s). " - "possible mismatch of transport-type " - "between server and client volumes, " - "or version mismatch", - this->peerinfo.identifier); - goto unlock; - } - - priv->incoming.hdrlen = size1; - priv->incoming.buflen = size2; - - /* TODO: use mem-pool */ - priv->incoming.hdr_p = GF_MALLOC (size1, - gf_common_mt_char); - if (size2) { - /* TODO: sanity check size2 < page size - */ - iobuf = iobuf_get (this->xl->ctx->iobuf_pool); - if (!iobuf) { - gf_log (this->xl->name, GF_LOG_ERROR, - "unable to allocate IO buffer " - "for peer %s", - this->peerinfo.identifier); - ret = -ENOMEM; - goto unlock; - } - priv->incoming.iobuf = iobuf; - priv->incoming.buf_p = iobuf->ptr; - } - - priv->incoming.vector[0].iov_base = - priv->incoming.hdr_p; - - priv->incoming.vector[0].iov_len = size1; - - priv->incoming.vector[1].iov_base = - priv->incoming.buf_p; - - priv->incoming.vector[1].iov_len = size2; - priv->incoming.count = size2 ? 
2 : 1; - - priv->incoming.pending_vector = - priv->incoming.vector; - - priv->incoming.pending_count = - priv->incoming.count; - - priv->incoming.state = - SOCKET_PROTO_STATE_DATA_COMING; - break; - - case SOCKET_PROTO_STATE_DATA_COMING: - - ret = __socket_readv (this, - priv->incoming.pending_vector, - priv->incoming.pending_count, - &priv->incoming.pending_vector, - &priv->incoming.pending_count); - if (ret == 0) { - priv->incoming.state = - SOCKET_PROTO_STATE_DATA_CAME; - break; - } - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "read (%s) in state %d (%s)", - strerror (errno), - SOCKET_PROTO_STATE_DATA_COMING, - this->peerinfo.identifier); - goto unlock; - } - - if (ret > 0) { - gf_log (this->xl->name, GF_LOG_TRACE, - "partial data read on NB socket"); - goto unlock; - } - break; - - case SOCKET_PROTO_STATE_DATA_CAME: - memset (&priv->incoming.vector, 0, - sizeof (priv->incoming.vector)); - priv->incoming.pending_vector = NULL; - priv->incoming.pending_count = 0; - priv->incoming.state = SOCKET_PROTO_STATE_COMPLETE; - break; - - case SOCKET_PROTO_STATE_COMPLETE: - /* not reached */ - break; - - default: - gf_log (this->xl->name, GF_LOG_DEBUG, - "undefined state reached: %d", - priv->incoming.state); - goto unlock; - } - } -unlock: - - return ret; -} - - -int -socket_proto_state_machine (transport_t *this) -{ - socket_private_t *priv = NULL; - int ret = 0; - - priv = this->private; - - pthread_mutex_lock (&priv->lock); - { - ret = __socket_proto_state_machine (this); - } - pthread_mutex_unlock (&priv->lock); - - return ret; -} - - -int -socket_event_poll_in (transport_t *this) -{ - int ret = -1; - - ret = socket_proto_state_machine (this); - - /* call POLLIN on xlator even if complete block is not received, - just to keep the last_received timestamp ticking */ - - if (ret == 0) - ret = xlator_notify (this->xl, GF_EVENT_POLLIN, this); - - return ret; -} - - -int -socket_connect_finish (transport_t *this) -{ - int ret = -1; - socket_private_t *priv 
= NULL; - int event = -1; - char notify_xlator = 0; - - priv = this->private; - - pthread_mutex_lock (&priv->lock); - { - if (priv->connected) - goto unlock; - - ret = __socket_connect_finish (priv->sock); - - if (ret == -1 && errno == EINPROGRESS) - ret = 1; - - if (ret == -1 && errno != EINPROGRESS) { - if (!priv->connect_finish_log) { - gf_log (this->xl->name, GF_LOG_ERROR, - "connection to %s failed (%s)", - this->peerinfo.identifier, - strerror (errno)); - priv->connect_finish_log = 1; - } - __socket_disconnect (this); - notify_xlator = 1; - event = GF_EVENT_POLLERR; - goto unlock; - } - - if (ret == 0) { - notify_xlator = 1; - - this->myinfo.sockaddr_len = - sizeof (this->myinfo.sockaddr); - - ret = getsockname (priv->sock, - SA (&this->myinfo.sockaddr), - &this->myinfo.sockaddr_len); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "getsockname on (%d) failed (%s)", - priv->sock, strerror (errno)); - __socket_disconnect (this); - event = GF_EVENT_POLLERR; - goto unlock; - } - - priv->connected = 1; - priv->connect_finish_log = 0; - event = GF_EVENT_CHILD_UP; - get_transport_identifiers (this); - } - } -unlock: - pthread_mutex_unlock (&priv->lock); - - if (notify_xlator) - xlator_notify (this->xl, event, this); - - return 0; -} - - -int -socket_event_handler (int fd, int idx, void *data, - int poll_in, int poll_out, int poll_err) -{ - transport_t *this = NULL; - socket_private_t *priv = NULL; - int ret = 0; - - this = data; - priv = this->private; - - pthread_mutex_lock (&priv->lock); - { - priv->idx = idx; - } - pthread_mutex_unlock (&priv->lock); - - if (!priv->connected) { - ret = socket_connect_finish (this); - } - - if (!ret && poll_out) { - ret = socket_event_poll_out (this); - } - - if (!ret && poll_in) { - ret = socket_event_poll_in (this); - } - - if (ret < 0 || poll_err) { - socket_event_poll_err (this); - transport_unref (this); - } - - return 0; -} - - -int -socket_server_event_handler (int fd, int idx, void *data, - int poll_in, int 
poll_out, int poll_err) -{ - transport_t *this = NULL; - socket_private_t *priv = NULL; - int ret = 0; - int new_sock = -1; - transport_t *new_trans = NULL; - struct sockaddr_storage new_sockaddr = {0, }; - socklen_t addrlen = sizeof (new_sockaddr); - socket_private_t *new_priv = NULL; - glusterfs_ctx_t *ctx = NULL; - - this = data; - priv = this->private; - ctx = this->xl->ctx; - - pthread_mutex_lock (&priv->lock); - { - priv->idx = idx; - - if (poll_in) { - new_sock = accept (priv->sock, SA (&new_sockaddr), - &addrlen); - - if (new_sock == -1) - goto unlock; - - if (!priv->bio) { - ret = __socket_nonblock (new_sock); - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "NBIO on %d failed (%s)", - new_sock, strerror (errno)); - close (new_sock); - goto unlock; - } - } - - if (priv->nodelay) { - ret = __socket_nodelay (new_sock); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "setsockopt() failed for " - "NODELAY (%s)", - strerror (errno)); - } - } - - new_trans = GF_CALLOC (1, sizeof (*new_trans), - gf_common_mt_transport_t); - new_trans->xl = this->xl; - new_trans->fini = this->fini; - - memcpy (&new_trans->peerinfo.sockaddr, &new_sockaddr, - addrlen); - new_trans->peerinfo.sockaddr_len = addrlen; - - new_trans->myinfo.sockaddr_len = - sizeof (new_trans->myinfo.sockaddr); - - ret = getsockname (new_sock, - SA (&new_trans->myinfo.sockaddr), - &new_trans->myinfo.sockaddr_len); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "getsockname on %d failed (%s)", - new_sock, strerror (errno)); - close (new_sock); - goto unlock; - } - - get_transport_identifiers (new_trans); - socket_init (new_trans); - new_trans->ops = this->ops; - new_trans->init = this->init; - new_trans->fini = this->fini; - - new_priv = new_trans->private; - - pthread_mutex_lock (&new_priv->lock); - { - new_priv->sock = new_sock; - new_priv->connected = 1; - - transport_ref (new_trans); - new_priv->idx = - event_register (ctx->event_pool, - new_sock, - 
socket_event_handler, - new_trans, 1, 0); - - if (new_priv->idx == -1) - ret = -1; - } - pthread_mutex_unlock (&new_priv->lock); - } - } -unlock: - pthread_mutex_unlock (&priv->lock); - - return ret; -} - - -int -socket_disconnect (transport_t *this) -{ - socket_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - pthread_mutex_lock (&priv->lock); - { - ret = __socket_disconnect (this); - } - pthread_mutex_unlock (&priv->lock); - - return ret; -} - - -int -socket_connect (transport_t *this) -{ - int ret = -1; - int sock = -1; - socket_private_t *priv = NULL; - struct sockaddr_storage sockaddr = {0, }; - socklen_t sockaddr_len = 0; - glusterfs_ctx_t *ctx = NULL; - sa_family_t sa_family = {0, }; - - priv = this->private; - ctx = this->xl->ctx; - - if (!priv) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "connect() called on uninitialized transport"); - goto err; - } - - pthread_mutex_lock (&priv->lock); - { - sock = priv->sock; - } - pthread_mutex_unlock (&priv->lock); - - if (sock != -1) { - gf_log (this->xl->name, GF_LOG_TRACE, - "connect () called on transport already connected"); - ret = 0; - goto err; - } - - ret = socket_client_get_remote_sockaddr (this, SA (&sockaddr), - &sockaddr_len, &sa_family); - if (ret == -1) { - /* logged inside client_get_remote_sockaddr */ - goto err; - } - - pthread_mutex_lock (&priv->lock); - { - if (priv->sock != -1) { - gf_log (this->xl->name, GF_LOG_TRACE, - "connect() -- already connected"); - goto unlock; - } - - memcpy (&this->peerinfo.sockaddr, &sockaddr, sockaddr_len); - this->peerinfo.sockaddr_len = sockaddr_len; - - priv->sock = socket (sa_family, SOCK_STREAM, 0); - if (priv->sock == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "socket creation failed (%s)", - strerror (errno)); - goto unlock; - } - - /* Cant help if setting socket options fails. We can continue - * working nonetheless. 
- */ - if (setsockopt (priv->sock, SOL_SOCKET, SO_RCVBUF, - &priv->windowsize, - sizeof (priv->windowsize)) < 0) { - gf_log (this->xl->name, GF_LOG_ERROR, - "setting receive window size failed: %d: %d: " - "%s", priv->sock, priv->windowsize, - strerror (errno)); - } - - if (setsockopt (priv->sock, SOL_SOCKET, SO_SNDBUF, - &priv->windowsize, - sizeof (priv->windowsize)) < 0) { - gf_log (this->xl->name, GF_LOG_ERROR, - "setting send window size failed: %d: %d: " - "%s", priv->sock, priv->windowsize, - strerror (errno)); - } - - - if (priv->nodelay && priv->lowlat) { - ret = __socket_nodelay (priv->sock); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "setsockopt() failed for NODELAY (%s)", - strerror (errno)); - } - } - - if (!priv->bio) { - ret = __socket_nonblock (priv->sock); - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "NBIO on %d failed (%s)", - priv->sock, strerror (errno)); - close (priv->sock); - priv->sock = -1; - goto unlock; - } - } - - SA (&this->myinfo.sockaddr)->sa_family = - SA (&this->peerinfo.sockaddr)->sa_family; - - ret = client_bind (this, SA (&this->myinfo.sockaddr), - &this->myinfo.sockaddr_len, priv->sock); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_WARNING, - "client bind failed: %s", strerror (errno)); - close (priv->sock); - priv->sock = -1; - goto unlock; - } - - ret = connect (priv->sock, SA (&this->peerinfo.sockaddr), - this->peerinfo.sockaddr_len); - - if (ret == -1 && errno != EINPROGRESS) { - gf_log (this->xl->name, GF_LOG_ERROR, - "connection attempt failed (%s)", - strerror (errno)); - close (priv->sock); - priv->sock = -1; - goto unlock; - } - - priv->connected = 0; - - transport_ref (this); - - priv->idx = event_register (ctx->event_pool, priv->sock, - socket_event_handler, this, 1, 1); - if (priv->idx == -1) - ret = -1; - } -unlock: - pthread_mutex_unlock (&priv->lock); - -err: - return ret; -} - - -int -socket_listen (transport_t *this) -{ - socket_private_t * priv = NULL; - int ret = -1; 
- int sock = -1; - struct sockaddr_storage sockaddr; - socklen_t sockaddr_len; - peer_info_t *myinfo = NULL; - glusterfs_ctx_t *ctx = NULL; - sa_family_t sa_family = {0, }; - - priv = this->private; - myinfo = &this->myinfo; - ctx = this->xl->ctx; - - pthread_mutex_lock (&priv->lock); - { - sock = priv->sock; - } - pthread_mutex_unlock (&priv->lock); - - if (sock != -1) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "alreading listening"); - return ret; - } - - ret = socket_server_get_local_sockaddr (this, SA (&sockaddr), - &sockaddr_len, &sa_family); - if (ret == -1) { - return ret; - } - - pthread_mutex_lock (&priv->lock); - { - if (priv->sock != -1) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "already listening"); - goto unlock; - } - - memcpy (&myinfo->sockaddr, &sockaddr, sockaddr_len); - myinfo->sockaddr_len = sockaddr_len; - - priv->sock = socket (sa_family, SOCK_STREAM, 0); - - if (priv->sock == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "socket creation failed (%s)", - strerror (errno)); - goto unlock; - } - - /* Cant help if setting socket options fails. We can continue - * working nonetheless. 
- */ - if (setsockopt (priv->sock, SOL_SOCKET, SO_RCVBUF, - &priv->windowsize, - sizeof (priv->windowsize)) < 0) { - gf_log (this->xl->name, GF_LOG_ERROR, - "setting receive window size failed: %d: %d: " - "%s", priv->sock, priv->windowsize, - strerror (errno)); - } - - if (setsockopt (priv->sock, SOL_SOCKET, SO_SNDBUF, - &priv->windowsize, - sizeof (priv->windowsize)) < 0) { - gf_log (this->xl->name, GF_LOG_ERROR, - "setting send window size failed: %d: %d: " - "%s", priv->sock, priv->windowsize, - strerror (errno)); - } - - if (priv->nodelay) { - ret = __socket_nodelay (priv->sock); - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "setsockopt() failed for NODELAY (%s)", - strerror (errno)); - } - } - - if (!priv->bio) { - ret = __socket_nonblock (priv->sock); - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "NBIO on %d failed (%s)", - priv->sock, strerror (errno)); - close (priv->sock); - priv->sock = -1; - goto unlock; - } - } - - ret = __socket_server_bind (this); - - if (ret == -1) { - /* logged inside __socket_server_bind() */ - close (priv->sock); - priv->sock = -1; - goto unlock; - } - - ret = listen (priv->sock, 10); - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "could not set socket %d to listen mode (%s)", - priv->sock, strerror (errno)); - close (priv->sock); - priv->sock = -1; - goto unlock; - } - - transport_ref (this); - - priv->idx = event_register (ctx->event_pool, priv->sock, - socket_server_event_handler, - this, 1, 0); - - if (priv->idx == -1) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "could not register socket %d with events", - priv->sock); - ret = -1; - close (priv->sock); - priv->sock = -1; - goto unlock; - } - } -unlock: - pthread_mutex_unlock (&priv->lock); - - return ret; -} - - -int -socket_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, - struct iobuf **iobuf_p) -{ - socket_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - pthread_mutex_lock (&priv->lock); - { 
- if (priv->connected != 1) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "socket not connected to receive"); - goto unlock; - } - - if (!hdr_p || !hdrlen_p || !iobuf_p) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "bad parameters %p %p %p", - hdr_p, hdrlen_p, iobuf_p); - goto unlock; - } - - if (priv->incoming.state == SOCKET_PROTO_STATE_COMPLETE) { - *hdr_p = priv->incoming.hdr_p; - *hdrlen_p = priv->incoming.hdrlen; - *iobuf_p = priv->incoming.iobuf; - - memset (&priv->incoming, 0, sizeof (priv->incoming)); - priv->incoming.state = SOCKET_PROTO_STATE_NADA; - - ret = 0; - } - } -unlock: - pthread_mutex_unlock (&priv->lock); - - return ret; -} - - -/* TODO: implement per transfer limit */ -int -socket_submit (transport_t *this, char *buf, int len, - struct iovec *vector, int count, - struct iobref *iobref) -{ - socket_private_t *priv = NULL; - int ret = -1; - char need_poll_out = 0; - char need_append = 1; - struct ioq *entry = NULL; - glusterfs_ctx_t *ctx = NULL; - - priv = this->private; - ctx = this->xl->ctx; - - pthread_mutex_lock (&priv->lock); - { - if (priv->connected != 1) { - if (!priv->submit_log && !priv->connect_finish_log) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "not connected (priv->connected = %d)", - priv->connected); - priv->submit_log = 1; - } - goto unlock; - } - - priv->submit_log = 0; - entry = __socket_ioq_new (this, buf, len, vector, count, iobref); - if (!entry) - goto unlock; - - if (list_empty (&priv->ioq)) { - ret = __socket_ioq_churn_entry (this, entry); - - if (ret == 0) - need_append = 0; - - if (ret > 0) - need_poll_out = 1; - } - - if (need_append) { - list_add_tail (&entry->list, &priv->ioq); - ret = 0; - } - - if (need_poll_out) { - /* first entry to wait. 
continue writing on POLLOUT */ - priv->idx = event_select_on (ctx->event_pool, - priv->sock, - priv->idx, -1, 1); - } - } -unlock: - pthread_mutex_unlock (&priv->lock); - - return ret; -} - - -struct transport_ops tops = { - .listen = socket_listen, - .connect = socket_connect, - .disconnect = socket_disconnect, - .submit = socket_submit, - .receive = socket_receive -}; - - -int -socket_init (transport_t *this) -{ - socket_private_t *priv = NULL; - gf_boolean_t tmp_bool = 0; - uint64_t windowsize = GF_DEFAULT_SOCKET_WINDOW_SIZE; - char *optstr = NULL; - - if (this->private) { - gf_log (this->xl->name, GF_LOG_DEBUG, - "double init attempted"); - return -1; - } - - priv = GF_CALLOC (1, sizeof (*priv), - gf_common_mt_socket_private_t); - if (!priv) { - gf_log (this->xl->name, GF_LOG_ERROR, - "calloc (1, %"GF_PRI_SIZET") returned NULL", - sizeof (*priv)); - return -1; - } - - pthread_mutex_init (&priv->lock, NULL); - - priv->sock = -1; - priv->idx = -1; - priv->connected = -1; - - INIT_LIST_HEAD (&priv->ioq); - - if (dict_get (this->xl->options, "non-blocking-io")) { - optstr = data_to_str (dict_get (this->xl->options, - "non-blocking-io")); - - if (gf_string2boolean (optstr, &tmp_bool) == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "'non-blocking-io' takes only boolean options," - " not taking any action"); - tmp_bool = 1; - } - priv->bio = 0; - if (!tmp_bool) { - priv->bio = 1; - gf_log (this->xl->name, GF_LOG_WARNING, - "disabling non-blocking IO"); - } - } - - optstr = NULL; - - // By default, we enable NODELAY - priv->nodelay = 1; - if (dict_get (this->xl->options, "transport.socket.nodelay")) { - optstr = data_to_str (dict_get (this->xl->options, - "transport.socket.nodelay")); - - if (gf_string2boolean (optstr, &tmp_bool) == -1) { - gf_log (this->xl->name, GF_LOG_ERROR, - "'transport.socket.nodelay' takes only " - "boolean options, not taking any action"); - tmp_bool = 1; - } - if (!tmp_bool) { - priv->nodelay = 0; - gf_log (this->xl->name, GF_LOG_DEBUG, - 
"disabling nodelay"); - } - } - - - optstr = NULL; - if (dict_get_str (this->xl->options, "transport.window-size", - &optstr) == 0) { - if (gf_string2bytesize (optstr, &windowsize) != 0) { - gf_log (this->xl->name, GF_LOG_ERROR, - "invalid number format: %s", optstr); - return -1; - } - } - - optstr = NULL; - - if (dict_get_str (this->xl->options, "transport.socket.lowlat", - &optstr) == 0) { - priv->lowlat = 1; - } - - priv->windowsize = (int)windowsize; - this->private = priv; - - return 0; -} - - -void -fini (transport_t *this) -{ - socket_private_t *priv = this->private; - - gf_log (this->xl->name, GF_LOG_TRACE, - "transport %p destroyed", this); - - pthread_mutex_destroy (&priv->lock); - GF_FREE (priv); -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - if (!this) - return ret; - - ret = xlator_mem_acct_init (this, gf_common_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } - - return ret; -} - -int32_t -init (transport_t *this) -{ - int ret = -1; - - ret = socket_init (this); - - if (ret == -1) { - gf_log (this->xl->name, GF_LOG_DEBUG, "socket_init() failed"); - } - - return ret; -} - -struct volume_options options[] = { - { .key = {"remote-port", - "transport.remote-port", - "transport.socket.remote-port"}, - .type = GF_OPTION_TYPE_INT - }, - { .key = {"transport.socket.listen-port", "listen-port"}, - .type = GF_OPTION_TYPE_INT - }, - { .key = {"transport.socket.bind-address", "bind-address" }, - .type = GF_OPTION_TYPE_INTERNET_ADDRESS - }, - { .key = {"transport.socket.connect-path", "connect-path"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"transport.socket.bind-path", "bind-path"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"transport.socket.listen-path", "listen-path"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = { "transport.address-family", - "address-family" }, - .value = {"inet", "inet6", "inet/inet6", "inet6/inet", - "unix", "inet-sdp" }, - .type = 
GF_OPTION_TYPE_STR - }, - - { .key = {"non-blocking-io"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"transport.window-size"}, - .type = GF_OPTION_TYPE_SIZET, - .min = GF_MIN_SOCKET_WINDOW_SIZE, - .max = GF_MAX_SOCKET_WINDOW_SIZE, - }, - { .key = {"transport.socket.nodelay"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {"transport.socket.lowlat"}, - .type = GF_OPTION_TYPE_BOOL - }, - { .key = {NULL} } -}; - diff --git a/transport/socket/src/socket.h b/transport/socket/src/socket.h deleted file mode 100644 index bc6d3b27..00000000 --- a/transport/socket/src/socket.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - Copyright (c) 2006-2009 Gluster, Inc. - This file is part of GlusterFS. - - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - . -*/ - -#ifndef _SOCKET_H -#define _SOCKET_H - - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include "event.h" -#include "transport.h" -#include "logging.h" -#include "dict.h" -#include "mem-pool.h" -#include "socket-mem-types.h" - -#ifndef MAX_IOVEC -#define MAX_IOVEC 16 -#endif /* MAX_IOVEC */ - -#define GF_DEFAULT_SOCKET_LISTEN_PORT 6996 - -/* This is the size set through setsockopt for - * both the TCP receive window size and the - * send buffer size. - * Till the time iobuf size becomes configurable, this size is set to include - * two iobufs + the GlusterFS protocol headers. - * Linux allows us to over-ride the max values for the system. - * Should we over-ride them? 
Because if we set a value larger than the default - * setsockopt will fail. Having larger values might be beneficial for - * IB links. - */ -#define GF_DEFAULT_SOCKET_WINDOW_SIZE (512 * GF_UNIT_KB) -#define GF_MAX_SOCKET_WINDOW_SIZE (1 * GF_UNIT_MB) -#define GF_MIN_SOCKET_WINDOW_SIZE (128 * GF_UNIT_KB) - -typedef enum { - SOCKET_PROTO_STATE_NADA = 0, - SOCKET_PROTO_STATE_HEADER_COMING, - SOCKET_PROTO_STATE_HEADER_CAME, - SOCKET_PROTO_STATE_DATA_COMING, - SOCKET_PROTO_STATE_DATA_CAME, - SOCKET_PROTO_STATE_COMPLETE, -} socket_proto_state_t; - -struct socket_header { - char colonO[3]; - uint32_t size1; - uint32_t size2; - char version; -} __attribute__((packed)); - - -struct ioq { - union { - struct list_head list; - struct { - struct ioq *next; - struct ioq *prev; - }; - }; - struct socket_header header; - struct iovec vector[MAX_IOVEC]; - int count; - struct iovec *pending_vector; - int pending_count; - char *buf; - struct iobref *iobref; -}; - - -typedef struct { - int32_t sock; - int32_t idx; - unsigned char connected; // -1 = not connected. 0 = in progress. 
1 = connected - char bio; - char connect_finish_log; - char submit_log; - union { - struct list_head ioq; - struct { - struct ioq *ioq_next; - struct ioq *ioq_prev; - }; - }; - struct { - int state; - struct socket_header header; - char *hdr_p; - size_t hdrlen; - struct iobuf *iobuf; - char *buf_p; - size_t buflen; - struct iovec vector[2]; - int count; - struct iovec *pending_vector; - int pending_count; - } incoming; - pthread_mutex_t lock; - int windowsize; - char lowlat; - char nodelay; -} socket_private_t; - - -#endif diff --git a/xlators/nfs/lib/src/rpcsvc.h b/xlators/nfs/lib/src/rpcsvc.h index 2746288f..6e6dc9bc 100644 --- a/xlators/nfs/lib/src/rpcsvc.h +++ b/xlators/nfs/lib/src/rpcsvc.h @@ -27,7 +27,6 @@ #endif #include "event.h" -#include "transport.h" #include "logging.h" #include "dict.h" #include "mem-pool.h" diff --git a/xlators/protocol/Makefile.am b/xlators/protocol/Makefile.am index 745e277c..bef0c662 100644 --- a/xlators/protocol/Makefile.am +++ b/xlators/protocol/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = client server +SUBDIRS = lib transport client server auth CLEANFILES = diff --git a/xlators/protocol/auth/Makefile.am b/xlators/protocol/auth/Makefile.am new file mode 100644 index 00000000..6bd54eee --- /dev/null +++ b/xlators/protocol/auth/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = addr login + +CLEANFILES = diff --git a/xlators/protocol/auth/addr/Makefile.am b/xlators/protocol/auth/addr/Makefile.am new file mode 100644 index 00000000..d471a3f9 --- /dev/null +++ b/xlators/protocol/auth/addr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/protocol/auth/addr/src/Makefile.am b/xlators/protocol/auth/addr/src/Makefile.am new file mode 100644 index 00000000..9b053a84 --- /dev/null +++ b/xlators/protocol/auth/addr/src/Makefile.am @@ -0,0 +1,14 @@ +auth_LTLIBRARIES = addr.la +authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth + +addr_la_LDFLAGS = -module -avoidversion + +addr_la_SOURCES = addr.c +addr_la_LIBADD = 
$(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/xlators/protocol/lib/src/libgfproto.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ + -I$(top_srcdir)/xlators/protocol/lib/src + +CLEANFILES = diff --git a/xlators/protocol/auth/addr/src/addr.c b/xlators/protocol/auth/addr/src/addr.c new file mode 100644 index 00000000..a8803a39 --- /dev/null +++ b/xlators/protocol/auth/addr/src/addr.c @@ -0,0 +1,224 @@ +/* + Copyright (c) 2007-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include "authenticate.h" +#include "dict.h" + +#define ADDR_DELIMITER " ," +#define PRIVILEGED_PORT_CEILING 1024 + +#ifndef AF_INET_SDP +#define AF_INET_SDP 27 +#endif
+
+/*
+ * Match peer_addr against a list of fnmatch(3) patterns separated by the
+ * characters in ADDR_DELIMITER.  A pattern prefixed with '!' is negated: it
+ * hits when the remainder of the pattern does NOT match.  `verb` only labels
+ * the per-pattern debug message ("rejected" / "allowed").
+ *
+ * Returns 1 on the first hit, 0 if nothing hits (or the list is NULL) and
+ * -1 if the list could not be copied for tokenising.
+ */
+static int
+addr_list_matches (const char *log_domain, const char *verb,
+                   const char *addr_list, const char *peer_addr)
+{
+        char *addr_cpy = NULL;
+        char *addr_str = NULL;
+        char *tmp      = NULL;
+        int   hit      = 0;
+
+        if (!addr_list)
+                return 0;
+
+        /* strtok_r() writes into its input, so tokenise a private copy */
+        addr_cpy = strdup (addr_list);
+        if (!addr_cpy)
+                return -1;
+
+        for (addr_str = strtok_r (addr_cpy, ADDR_DELIMITER, &tmp);
+             addr_str;
+             addr_str = strtok_r (NULL, ADDR_DELIMITER, &tmp)) {
+                int negate  = 0;
+                int nomatch = 0;
+
+                gf_log (log_domain, GF_LOG_DEBUG,
+                        "%s = \"%s\", received addr = \"%s\"",
+                        verb, addr_str, peer_addr);
+
+                if (addr_str[0] == '!') {
+                        negate = 1;
+                        addr_str++;
+                }
+
+                /* fnmatch() returns 0 when the pattern matches */
+                nomatch = fnmatch (addr_str, peer_addr, 0);
+                if (negate ? nomatch : !nomatch) {
+                        hit = 1;
+                        break;
+                }
+        }
+
+        free (addr_cpy);
+        return hit;
+}
+
+/*
+ * Address based authentication.
+ *
+ * Looks up auth.addr.<remote-subvolume>.allow and .reject in config_params
+ * (falling back to auth.ip.<remote-subvolume>.allow when no auth.addr allow
+ * list exists) and matches the address part of the "peer-info" identifier
+ * found in input_params against them.  The reject list is consulted first.
+ * For inet peers the port after the last ':' of the identifier must be below
+ * PRIVILEGED_PORT_CEILING.
+ *
+ * Returns AUTH_REJECT on a reject hit (or when the reject list cannot be
+ * evaluated), AUTH_ACCEPT on an allow hit, AUTH_DONT_CARE otherwise,
+ * including every other error path.
+ */
+auth_result_t
+gf_auth (dict_t *input_params, dict_t *config_params)
+{
+        int              ret            = 0;
+        char            *name           = NULL;
+        char            *searchstr      = NULL;
+        char             peer_addr[UNIX_PATH_MAX];
+        data_t          *peer_info_data = NULL;
+        peer_info_t     *peer_info      = NULL;
+        data_t          *allow_addr     = NULL;
+        data_t          *reject_addr    = NULL;
+        struct sockaddr *sa             = NULL;
+
+        name = data_to_str (dict_get (input_params, "remote-subvolume"));
+        if (!name) {
+                gf_log ("authenticate/addr",
+                        GF_LOG_ERROR,
+                        "remote-subvolume not specified");
+                return AUTH_DONT_CARE;
+        }
+
+        ret = asprintf (&searchstr, "auth.addr.%s.allow", name);
+        if (-1 == ret) {
+                gf_log ("auth/addr", GF_LOG_ERROR,
+                        "asprintf failed while setting search string");
+                return AUTH_DONT_CARE;
+        }
+        allow_addr = dict_get (config_params, searchstr);
+        free (searchstr);
+
+        ret = asprintf (&searchstr, "auth.addr.%s.reject", name);
+        if (-1 == ret) {
+                gf_log ("auth/addr", GF_LOG_ERROR,
+                        "asprintf failed while setting search string");
+                return AUTH_DONT_CARE;
+        }
+        reject_addr = dict_get (config_params, searchstr);
+        free (searchstr);
+
+        if (!allow_addr) {
+                /* TODO: backward compatibility */
+                ret = asprintf (&searchstr, "auth.ip.%s.allow", name);
+                if (-1 == ret) {
+                        gf_log ("auth/addr", GF_LOG_ERROR,
+                                "asprintf failed while setting search string");
+                        return AUTH_DONT_CARE;
+                }
+                allow_addr = dict_get (config_params, searchstr);
+                free (searchstr);
+        }
+
+        if (!(allow_addr || reject_addr)) {
+                gf_log ("auth/addr", GF_LOG_DEBUG,
+                        "none of the options auth.addr.%s.allow or "
+                        "auth.addr.%s.reject specified, returning auth_dont_care",
+                        name, name);
+                return AUTH_DONT_CARE;
+        }
+
+        peer_info_data = dict_get (input_params, "peer-info");
+        if (!peer_info_data) {
+                gf_log ("authenticate/addr",
+                        GF_LOG_ERROR,
+                        "peer-info not present");
+                return AUTH_DONT_CARE;
+        }
+
+        peer_info = data_to_ptr (peer_info_data);
+        if (!peer_info) {
+                gf_log ("authenticate/addr",
+                        GF_LOG_ERROR,
+                        "peer-info not present");
+                return AUTH_DONT_CARE;
+        }
+
+        sa = (struct sockaddr *) &peer_info->sockaddr;
+
+        switch (sa->sa_family)
+        {
+        case AF_INET_SDP:
+                /* SDP peers carry the same "<addr>:<port>" identifier as
+                 * inet peers; sa_family is not needed to parse it, so it is
+                 * left untouched. */
+                /* fallthrough */
+        case AF_INET:
+        case AF_INET6:
+        {
+                char *service   = NULL;
+                int   peer_port = 0;
+
+                /* bounded copy: peer_addr is only UNIX_PATH_MAX bytes */
+                snprintf (peer_addr, sizeof (peer_addr), "%s",
+                          peer_info->identifier);
+
+                service = strrchr (peer_addr, ':');
+                if (!service) {
+                        gf_log ("auth/addr", GF_LOG_ERROR,
+                                "peer identifier \"%s\" has no port separator",
+                                peer_addr);
+                        return AUTH_DONT_CARE;
+                }
+                *service = '\0';
+                service ++;
+
+                peer_port = atoi (service);
+                if (peer_port < 0 || peer_port >= PRIVILEGED_PORT_CEILING) {
+                        gf_log ("auth/addr", GF_LOG_ERROR,
+                                "client is bound to port %d which is not privileged",
+                                peer_port);
+                        return AUTH_DONT_CARE;
+                }
+                break;
+        }
+
+        case AF_UNIX:
+                snprintf (peer_addr, sizeof (peer_addr), "%s",
+                          peer_info->identifier);
+                break;
+
+        default:
+                gf_log ("authenticate/addr", GF_LOG_ERROR,
+                        "unknown address family %d",
+                        sa->sa_family);
+                return AUTH_DONT_CARE;
+        }
+
+        if (reject_addr) {
+                ret = addr_list_matches (name, "rejected",
+                                         reject_addr->data, peer_addr);
+                /* fail closed: a reject list that cannot be evaluated must
+                 * not be bypassed by a later allow hit */
+                if (ret != 0)
+                        return AUTH_REJECT;
+        }
+
+        if (allow_addr) {
+                ret = addr_list_matches (name, "allowed",
+                                         allow_addr->data, peer_addr);
+                if (ret > 0)
+                        return AUTH_ACCEPT;
+        }
+
+        return AUTH_DONT_CARE;
+}
+
+/* Option keys this module accepts; '*' stands for the sub-volume name. */
+struct volume_options options[] = {
+        { .key   = {"auth.addr.*.allow"},
+          .type  = GF_OPTION_TYPE_ANY
+        },
+        { .key   = {"auth.addr.*.reject"},
+          .type  = GF_OPTION_TYPE_ANY
+        },
+        /* Backward compatibility */
+        { .key   = {"auth.ip.*.allow"},
+          .type  = GF_OPTION_TYPE_ANY
+        },
+        { .key = {NULL} }
+};
diff --git a/xlators/protocol/auth/login/Makefile.am b/xlators/protocol/auth/login/Makefile.am new file mode 100644 index 00000000..d471a3f9 --- /dev/null +++ b/xlators/protocol/auth/login/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/protocol/auth/login/src/Makefile.am b/xlators/protocol/auth/login/src/Makefile.am new file mode 100644 index 00000000..4a50e07d --- /dev/null +++ b/xlators/protocol/auth/login/src/Makefile.am @@ -0,0 +1,15 @@ +auth_LTLIBRARIES = login.la +authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth + +login_la_LDFLAGS = -module -avoidversion + +login_la_SOURCES = login.c +login_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/xlators/protocol/lib/src/libgfproto.la + + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ + 
-I$(top_srcdir)/xlators/protocol/lib/src + +CLEANFILES = diff --git a/xlators/protocol/auth/login/src/login.c b/xlators/protocol/auth/login/src/login.c new file mode 100644 index 00000000..0c85292f --- /dev/null +++ b/xlators/protocol/auth/login/src/login.c @@ -0,0 +1,114 @@ +/* + Copyright (c) 2007-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include "authenticate.h"
+
+/*
+ * Username/password authentication.
+ *
+ * input_params must carry "username", "password" and "remote-subvolume".
+ * The user name is matched, as an fnmatch(3) subject, against each pattern
+ * of the space/comma separated auth.login.<remote-subvolume>.allow list in
+ * config_params.  On the first matching pattern the supplied password is
+ * compared with auth.login.<username>.password and the search stops.
+ *
+ * Returns AUTH_ACCEPT when the password matches, AUTH_REJECT when it does
+ * not, when no password is configured for the user, or when
+ * "remote-subvolume" is missing, and AUTH_DONT_CARE in every other case.
+ */
+auth_result_t
+gf_auth (dict_t *input_params, dict_t *config_params)
+{
+        int            ret           = 0;
+        char          *username      = NULL;
+        char          *password      = NULL;
+        char          *brick_name    = NULL;
+        char          *searchstr     = NULL;
+        char          *username_cpy  = NULL;
+        char          *username_str  = NULL;
+        char          *tmp           = NULL;
+        data_t        *allow_user    = NULL;
+        data_t        *username_data = NULL;
+        data_t        *password_data = NULL;
+        auth_result_t  result        = AUTH_DONT_CARE;
+
+        username_data = dict_get (input_params, "username");
+        if (!username_data)
+                return AUTH_DONT_CARE;
+
+        username = data_to_str (username_data);
+
+        password_data = dict_get (input_params, "password");
+        if (!password_data)
+                return AUTH_DONT_CARE;
+
+        password = data_to_str (password_data);
+
+        /* fnmatch()/strcmp() below must never see a NULL string */
+        if (!username || !password)
+                return AUTH_DONT_CARE;
+
+        brick_name = data_to_str (dict_get (input_params, "remote-subvolume"));
+        if (!brick_name) {
+                gf_log ("auth/login",
+                        GF_LOG_ERROR,
+                        "remote-subvolume not specified");
+                return AUTH_REJECT;
+        }
+
+        ret = asprintf (&searchstr, "auth.login.%s.allow", brick_name);
+        if (-1 == ret) {
+                gf_log ("auth/login", GF_LOG_ERROR,
+                        "asprintf failed while setting search string");
+                return AUTH_DONT_CARE;
+        }
+
+        allow_user = dict_get (config_params, searchstr);
+        free (searchstr);
+
+        if (!allow_user || !allow_user->data)
+                return AUTH_DONT_CARE;
+
+        /* strtok_r() writes into its input, so tokenise a private copy */
+        username_cpy = strdup (allow_user->data);
+        if (!username_cpy)
+                return AUTH_DONT_CARE;
+
+        for (username_str = strtok_r (username_cpy, " ,", &tmp);
+             username_str;
+             username_str = strtok_r (NULL, " ,", &tmp)) {
+                data_t *passwd_data = NULL;
+                char   *expected    = NULL;
+
+                if (fnmatch (username_str, username, 0) != 0)
+                        continue;
+
+                ret = asprintf (&searchstr, "auth.login.%s.password",
+                                username);
+                if (-1 == ret) {
+                        gf_log ("auth/login", GF_LOG_ERROR,
+                                "asprintf failed while setting search string");
+                        /* result is still AUTH_DONT_CARE; leave through the
+                         * common exit so username_cpy is released */
+                        break;
+                }
+                passwd_data = dict_get (config_params, searchstr);
+                free (searchstr);
+
+                expected = passwd_data ? data_to_str (passwd_data) : NULL;
+                if (!expected) {
+                        gf_log ("auth/login",
+                                GF_LOG_DEBUG,
+                                "wrong username/password combination");
+                        result = AUTH_REJECT;
+                } else {
+                        result = !strcmp (expected, password) ?
+                                 AUTH_ACCEPT : AUTH_REJECT;
+                }
+                break;
+        }
+
+        free (username_cpy);
+
+        return result;
+}
+
+/* Option keys this module accepts; '*' stands for the sub-volume or user. */
+struct volume_options options[] = {
+        { .key   = {"auth.login.*.allow"},
+          .type  = GF_OPTION_TYPE_ANY
+        },
+        { .key   = {"auth.login.*.password"},
+          .type  = GF_OPTION_TYPE_ANY
+        },
+        { .key = {NULL} }
+};
diff --git a/xlators/protocol/client/src/Makefile.am b/xlators/protocol/client/src/Makefile.am index fb720942..722d62e3 100644 --- a/xlators/protocol/client/src/Makefile.am +++ b/xlators/protocol/client/src/Makefile.am @@ -5,12 +5,14 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/protocol client_la_LDFLAGS = -module -avoidversion client_la_SOURCES = client-protocol.c saved-frames.c -client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/xlators/protocol/lib/src/libgfproto.la noinst_HEADERS = client-protocol.h saved-frames.h AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ - -I$(top_srcdir)/libglusterfs/src -shared 
-nostartfiles $(GF_CFLAGS) + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ + -I$(top_srcdir)/xlators/protocol/lib/src CLEANFILES = diff --git a/xlators/protocol/lib/Makefile.am b/xlators/protocol/lib/Makefile.am new file mode 100644 index 00000000..d471a3f9 --- /dev/null +++ b/xlators/protocol/lib/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/protocol/lib/src/Makefile.am b/xlators/protocol/lib/src/Makefile.am new file mode 100644 index 00000000..d3d1aafe --- /dev/null +++ b/xlators/protocol/lib/src/Makefile.am @@ -0,0 +1,15 @@ +libgfproto_la_CFLAGS = -fPIC -Wall -g -shared -nostartfiles $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) + +libgfproto_la_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE \ + -D$(GF_HOST_OS) -DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" \ + -DTRANSPORTDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/transport\" \ + -I$(CONTRIBDIR)/rbtree -I$(top_srcdir)/libglusterfs/src/ + +libgfproto_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +lib_LTLIBRARIES = libgfproto.la + + +libgfproto_la_SOURCES = transport.c authenticate.c + +noinst_HEADERS = transport.h protocol.h authenticate.h diff --git a/xlators/protocol/lib/src/authenticate.c b/xlators/protocol/lib/src/authenticate.c new file mode 100644 index 00000000..eb0e2464 --- /dev/null +++ b/xlators/protocol/lib/src/authenticate.c @@ -0,0 +1,250 @@ +/* + Copyright (c) 2007-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include "authenticate.h"
+
+/*
+ * dict_foreach() callback: load the auth module named by `key`.
+ *
+ * Opens LIBDIR/<key>.so, resolves its "gf_auth" entry point and optional
+ * "options" table, and replaces the dict value for `key` with an
+ * auth_handle_t describing the module.  On any failure the value is replaced
+ * by an empty entry, everything acquired so far is released, and the int
+ * pointed to by `data` is set to -1.
+ */
+static void
+init (dict_t *this,
+      char *key,
+      data_t *value,
+      void *data)
+{
+        void          *handle       = NULL;
+        char          *auth_file    = NULL;
+        auth_handle_t *auth_handle  = NULL;
+        auth_fn_t      authenticate = NULL;
+        int           *error        = NULL;
+        int            ret          = 0;
+
+        /* It gets over written */
+        error = data;
+
+        if (!strncasecmp (key, "ip", strlen ("ip"))) {
+                gf_log ("authenticate", GF_LOG_ERROR,
+                        "AUTHENTICATION MODULE \"IP\" HAS BEEN REPLACED "
+                        "BY \"ADDR\"");
+                dict_set (this, key, data_from_dynptr (NULL, 0));
+                /* TODO: 1.3.x backward compatibility */
+                // *error = -1;
+                // return;
+                key = "addr";
+        }
+
+        ret = gf_asprintf (&auth_file, "%s/%s.so", LIBDIR, key);
+        if (-1 == ret) {
+                gf_log ("authenticate", GF_LOG_ERROR, "asprintf failed");
+                auth_file = NULL;
+                goto err;
+        }
+
+        handle = dlopen (auth_file, RTLD_LAZY);
+        if (!handle) {
+                gf_log ("authenticate", GF_LOG_ERROR, "dlopen(%s): %s\n",
+                        auth_file, dlerror ());
+                goto err;
+        }
+
+        authenticate = dlsym (handle, "gf_auth");
+        if (!authenticate) {
+                gf_log ("authenticate", GF_LOG_ERROR,
+                        "dlsym(gf_auth) on %s\n", dlerror ());
+                goto err;
+        }
+
+        auth_handle = GF_CALLOC (1, sizeof (*auth_handle),
+                                 gf_common_mt_auth_handle_t);
+        if (!auth_handle) {
+                gf_log ("authenticate", GF_LOG_ERROR, "Out of memory");
+                goto err;
+        }
+
+        auth_handle->vol_opt = GF_CALLOC (1, sizeof (volume_opt_list_t),
+                                          gf_common_mt_volume_opt_list_t);
+        if (!auth_handle->vol_opt) {
+                gf_log ("authenticate", GF_LOG_ERROR, "Out of memory");
+                goto err;
+        }
+
+        /* the option table is optional; a module without one is accepted */
+        auth_handle->vol_opt->given_opt = dlsym (handle, "options");
+        if (auth_handle->vol_opt->given_opt == NULL) {
+                gf_log ("authenticate", GF_LOG_DEBUG,
+                        "volume option validation not specified");
+        }
+
+        auth_handle->authenticate = authenticate;
+        auth_handle->handle = handle;
+
+        dict_set (this, key,
+                  data_from_dynptr (auth_handle, sizeof (*auth_handle)));
+        GF_FREE (auth_file);
+        return;
+
+err:
+        /* undo in reverse order of acquisition */
+        if (auth_handle)
+                GF_FREE (auth_handle);
+        if (handle)
+                dlclose (handle);
+        if (auth_file)
+                GF_FREE (auth_file);
+        dict_set (this, key, data_from_dynptr (NULL, 0));
+        *error = -1;
+}
+
+/*
+ * dict_foreach() callback: unload the module stored under `key`, if any.
+ * NOTE(review): handle->vol_opt is not released here; gf_auth_init() may
+ * have linked it into xl->volume_options -- confirm who owns it.
+ */
+static void
+fini (dict_t *this,
+      char *key,
+      data_t *value,
+      void *data)
+{
+        auth_handle_t *handle = data_to_ptr (value);
+        if (handle) {
+                dlclose (handle->handle);
+        }
+}
+
+/*
+ * Load every module named in auth_modules and validate the xlator's options
+ * against each module's option table.
+ *
+ * Returns 0 on success.  If any module fails to load or any validation
+ * fails, all modules are unloaded again and -1 is returned.
+ */
+int32_t
+gf_auth_init (xlator_t *xl, dict_t *auth_modules)
+{
+        int            ret    = 0;
+        auth_handle_t *handle = NULL;
+        data_pair_t   *pair   = NULL;
+
+        dict_foreach (auth_modules, init, &ret);
+
+        if (!ret) {
+                for (pair = auth_modules->members_list; pair;
+                     pair = pair->next) {
+                        handle = data_to_ptr (pair->value);
+                        if (!handle)
+                                continue;
+
+                        list_add_tail (&(handle->vol_opt->list),
+                                       &(xl->volume_options));
+                        if (-1 ==
+                            validate_xlator_volume_options (xl,
+                                                            handle->vol_opt->given_opt)) {
+                                gf_log ("authenticate", GF_LOG_ERROR,
+                                        "volume option validation "
+                                        "failed");
+                                ret = -1;
+                        }
+                }
+        }
+
+        if (ret) {
+                gf_log (xl->name, GF_LOG_ERROR, "authentication init failed");
+                dict_foreach (auth_modules, fini, &ret);
+                ret = -1;
+        }
+        return ret;
+}
+
+/* Arguments of the gf_authenticate() call in progress, handed to map()
+ * through file scope because the dict_foreach() callback has a single
+ * user pointer, which carries the result dict. */
+static dict_t *__input_params;
+static dict_t *__config_params;
+
+/*
+ * dict_foreach() callback: run one module's gf_auth and store its verdict
+ * under the module's key in the result dict passed as `data`.  Entries
+ * without a usable handle count as AUTH_DONT_CARE.
+ */
+void
+map (dict_t *this,
+     char *key,
+     data_t *value,
+     void *data)
+{
+        dict_t        *res          = data;
+        auth_fn_t      authenticate = NULL;
+        auth_handle_t *handle       = NULL;
+
+        if (value && (handle = data_to_ptr (value)) &&
+            (authenticate = handle->authenticate)) {
+                dict_set (res, key,
+                          int_to_data (authenticate (__input_params,
+                                                     __config_params)));
+        } else {
+                dict_set (res, key, int_to_data (AUTH_DONT_CARE));
+        }
+}
+
+/*
+ * dict_foreach() callback: fold one verdict into the int64_t pointed to by
+ * `data`.  AUTH_REJECT always wins; AUTH_ACCEPT only replaces
+ * AUTH_DONT_CARE; AUTH_DONT_CARE changes nothing.
+ */
+void
+reduce (dict_t *this,
+        char *key,
+        data_t *value,
+        void *data)
+{
+        int64_t  val = 0;
+        int64_t *res = data;
+        if (!data)
+                return;
+
+        val = data_to_int64 (value);
+        switch (val)
+        {
+        case AUTH_ACCEPT:
+                if (AUTH_DONT_CARE == *res)
+                        *res = AUTH_ACCEPT;
+                break;
+
+        case AUTH_REJECT:
+                *res = AUTH_REJECT;
+                break;
+
+        case AUTH_DONT_CARE:
+                break;
+        }
+}
+
+
+/*
+ * Ask every loaded module for a verdict and combine them.
+ *
+ * Returns AUTH_ACCEPT only if at least one module accepts and none rejects.
+ * When no module expresses an opinion the peer is rejected and its
+ * identifier (from "peer-info" in input_params, when present) is logged.
+ */
+auth_result_t
+gf_authenticate (dict_t *input_params,
+                 dict_t *config_params,
+                 dict_t *auth_modules)
+{
+        dict_t  *results = NULL;
+        int64_t  result  = AUTH_DONT_CARE;
+
+        results = get_new_dict ();
+        if (!results) {
+                /* without a result dict nothing can be evaluated */
+                gf_log ("auth", GF_LOG_ERROR, "Out of memory");
+                return AUTH_REJECT;
+        }
+
+        __input_params = input_params;
+        __config_params = config_params;
+
+        dict_foreach (auth_modules, map, results);
+
+        dict_foreach (results, reduce, &result);
+        if (AUTH_DONT_CARE == result) {
+                data_t *peerinfo_data = dict_get (input_params, "peer-info");
+                char   *name = NULL;
+
+                if (peerinfo_data) {
+                        peer_info_t *peerinfo = data_to_ptr (peerinfo_data);
+                        if (peerinfo)
+                                name = peerinfo->identifier;
+                }
+
+                /* never hand a NULL pointer to %s */
+                gf_log ("auth", GF_LOG_ERROR,
+                        "no authentication module is interested in "
+                        "accepting remote-client %s",
+                        name ? name : "(unknown)");
+                result = AUTH_REJECT;
+        }
+
+        dict_destroy (results);
+        return result;
+}
+
+/* Unload every module previously loaded by gf_auth_init(). */
+void
+gf_auth_fini (dict_t *auth_modules)
+{
+        int32_t dummy = 0;
+
+        dict_foreach (auth_modules, fini, &dummy);
+}
diff --git a/xlators/protocol/lib/src/authenticate.h b/xlators/protocol/lib/src/authenticate.h new file mode 100644 index 00000000..8931f62e --- /dev/null +++ b/xlators/protocol/lib/src/authenticate.h @@ -0,0 +1,61 @@ +/* + Copyright (c) 2007-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _AUTHENTICATE_H +#define _AUTHENTICATE_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include "dict.h" +#include "compat.h" +#include "list.h" +#include "transport.h" +#include "xlator.h" + +/* Verdict of an auth module; when verdicts are combined AUTH_REJECT overrides AUTH_ACCEPT. */ +typedef enum { + AUTH_ACCEPT, + AUTH_REJECT, + AUTH_DONT_CARE +} auth_result_t; + +/* Signature of the "gf_auth" symbol every auth module exports. */ +typedef auth_result_t (*auth_fn_t) (dict_t *input_params, + dict_t *config_params); + +/* One loaded auth module: its dlopen() handle, its gf_auth entry point and its option list. */ +typedef struct { + void *handle; + auth_fn_t authenticate; + volume_opt_list_t *vol_opt; +} auth_handle_t; + +/* Combined verdict of all modules in auth_modules for one connection request. */ +auth_result_t gf_authenticate (dict_t *input_params, + dict_t *config_params, + dict_t *auth_modules); +/* Load the modules named in auth_modules; returns 0 on success, -1 on failure. */ +int32_t gf_auth_init (xlator_t *xl, dict_t *auth_modules); +/* Unload the modules held in auth_modules. */ +void gf_auth_fini (dict_t *auth_modules); + +#endif /* _AUTHENTICATE_H */ diff --git a/xlators/protocol/lib/src/protocol.h b/xlators/protocol/lib/src/protocol.h new file mode 100644 index 00000000..6fd291bb --- /dev/null +++ b/xlators/protocol/lib/src/protocol.h @@ -0,0 +1,1114 @@ +/* + Copyright (c) 2006-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _PROTOCOL_H +#define _PROTOCOL_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "byte-order.h" +#include "iatt.h" + +/* Any changes in the protocol structure or adding new '[f,m]ops' needs to + * bump the protocol version by "0.1" + */ + +#define GF_PROTOCOL_VERSION "3.0" + +struct gf_stat { + uint64_t ino; + uint64_t size; + uint64_t blocks; + uint64_t dev; + uint32_t rdev; + uint32_t mode; + uint32_t nlink; + uint32_t uid; + uint32_t gid; + uint32_t blksize; + uint32_t atime; + uint32_t atime_nsec; + uint32_t mtime ; + uint32_t mtime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; +} __attribute__((packed)); + + +static inline void +gf_stat_to_stat (struct gf_stat *gf_stat, struct stat *stat) +{ + stat->st_dev = ntoh64 (gf_stat->dev); + stat->st_ino = ntoh64 (gf_stat->ino); + stat->st_mode = ntoh32 (gf_stat->mode); + stat->st_nlink = ntoh32 (gf_stat->nlink); + stat->st_uid = ntoh32 (gf_stat->uid); + stat->st_gid = ntoh32 (gf_stat->gid); + stat->st_rdev = ntoh32 (gf_stat->rdev); + stat->st_size = ntoh64 (gf_stat->size); + stat->st_blksize = ntoh32 (gf_stat->blksize); + stat->st_blocks = ntoh64 (gf_stat->blocks); + stat->st_atime = ntoh32 (gf_stat->atime); + stat->st_mtime = ntoh32 (gf_stat->mtime); + stat->st_ctime = ntoh32 (gf_stat->ctime); + ST_ATIM_NSEC_SET(stat, ntoh32 (gf_stat->atime_nsec)); + ST_MTIM_NSEC_SET(stat, ntoh32 (gf_stat->mtime_nsec)); + ST_CTIM_NSEC_SET(stat, ntoh32 (gf_stat->ctime_nsec)); +} + + +static inline void +gf_stat_from_stat (struct gf_stat *gf_stat, struct stat *stat) +{ + gf_stat->dev = hton64 (stat->st_dev); + gf_stat->ino = hton64 (stat->st_ino); + gf_stat->mode = hton32 (stat->st_mode); + gf_stat->nlink = hton32 (stat->st_nlink); + gf_stat->uid = hton32 (stat->st_uid); + gf_stat->gid = hton32 (stat->st_gid); + gf_stat->rdev = hton32 (stat->st_rdev); + gf_stat->size = hton64 (stat->st_size); + gf_stat->blksize = 
hton32 (stat->st_blksize);
+        gf_stat->blocks      = hton64 (stat->st_blocks);
+        gf_stat->atime       = hton32 (stat->st_atime);
+        gf_stat->mtime       = hton32 (stat->st_mtime);
+        gf_stat->ctime       = hton32 (stat->st_ctime);
+        gf_stat->atime_nsec  = hton32 (ST_ATIM_NSEC(stat));
+        gf_stat->mtime_nsec  = hton32 (ST_MTIM_NSEC(stat));
+        gf_stat->ctime_nsec  = hton32 (ST_CTIM_NSEC(stat));
+}
+
+
+/* Decode *gf_stat into *iatt.
+ *
+ * Each member is converted with the ntoh variant matching its width in
+ * struct gf_stat: ntoh64 for ino/dev/size/blocks, ntoh32 for the rest.
+ * The mode word is split into ia_type and ia_prot.
+ */
+static inline void
+gf_stat_to_iatt (struct gf_stat *gf_stat, struct iatt *iatt)
+{
+        iatt->ia_ino        = ntoh64 (gf_stat->ino);
+        iatt->ia_dev        = ntoh64 (gf_stat->dev);
+        iatt->ia_type       = ia_type_from_st_mode (ntoh32 (gf_stat->mode));
+        iatt->ia_prot       = ia_prot_from_st_mode (ntoh32 (gf_stat->mode));
+        iatt->ia_nlink      = ntoh32 (gf_stat->nlink);
+        iatt->ia_uid        = ntoh32 (gf_stat->uid);
+        iatt->ia_gid        = ntoh32 (gf_stat->gid);
+        /* rdev is a 32-bit member written with hton32; decoding it with
+         * ntoh64 would byte-swap the zero-extended value on little-endian
+         * hosts and yield garbage. */
+        iatt->ia_rdev       = ntoh32 (gf_stat->rdev);
+        iatt->ia_size       = ntoh64 (gf_stat->size);
+        iatt->ia_blksize    = ntoh32 (gf_stat->blksize);
+        iatt->ia_blocks     = ntoh64 (gf_stat->blocks);
+        iatt->ia_atime      = ntoh32 (gf_stat->atime);
+        iatt->ia_atime_nsec = ntoh32 (gf_stat->atime_nsec);
+        iatt->ia_mtime      = ntoh32 (gf_stat->mtime);
+        iatt->ia_mtime_nsec = ntoh32 (gf_stat->mtime_nsec);
+        iatt->ia_ctime      = ntoh32 (gf_stat->ctime);
+        iatt->ia_ctime_nsec = ntoh32 (gf_stat->ctime_nsec);
+
+        /* ia_gen is read from the same 'dev' member as ia_dev above. */
+        iatt->ia_gen        = ntoh64 (gf_stat->dev);
+}
+
+
+/* Encode *iatt into *gf_stat; ia_prot and ia_type are recombined into a
+ * single mode word with st_mode_from_ia().
+ */
+static inline void
+gf_stat_from_iatt (struct gf_stat *gf_stat, struct iatt *iatt)
+{
+        gf_stat->ino        = hton64 (iatt->ia_ino);
+        gf_stat->dev        = hton64 (iatt->ia_dev);
+        gf_stat->mode       = hton32 (st_mode_from_ia (iatt->ia_prot,
+                                                       iatt->ia_type));
+        gf_stat->nlink      = hton32 (iatt->ia_nlink);
+        gf_stat->uid        = hton32 (iatt->ia_uid);
+        gf_stat->gid        = hton32 (iatt->ia_gid);
+        gf_stat->rdev       = hton32 (iatt->ia_rdev);
+        gf_stat->size       = hton64 (iatt->ia_size);
+        gf_stat->blksize    = hton32 (iatt->ia_blksize);
+        gf_stat->blocks     = hton64 (iatt->ia_blocks);
+        gf_stat->atime      = hton32 (iatt->ia_atime);
+        gf_stat->atime_nsec = hton32 (iatt->ia_atime_nsec);
+        gf_stat->mtime      = hton32 (iatt->ia_mtime);
+        gf_stat->mtime_nsec = hton32 (iatt->ia_mtime_nsec);
+        gf_stat->ctime      = hton32 (iatt->ia_ctime);
+        gf_stat->ctime_nsec = hton32 (iatt->ia_ctime_nsec);
+
+        /* Overwrites the 'dev' value stored earlier in this function:
+         * the dev member ends up carrying ia_gen. */
+        gf_stat->dev        = hton64 (iatt->ia_gen);
+
+}
+
+
+/* Packed filesystem statistics; every member is 64 bits wide and is
+ * converted with hton64/ntoh64 by the two helpers below.
+ */
+struct gf_statfs {
+        uint64_t bsize;
+        uint64_t frsize;
+        uint64_t blocks;
+        uint64_t bfree;
+        uint64_t bavail;
+        uint64_t files;
+        uint64_t ffree;
+        uint64_t favail;
+        uint64_t fsid;
+        uint64_t flag;
+        uint64_t namemax;
+} __attribute__((packed));
+
+
+/* Decode *gf_stat into the caller's struct statvfs, member by member. */
+static inline void
+gf_statfs_to_statfs (struct gf_statfs *gf_stat, struct statvfs *stat)
+{
+        stat->f_bsize   = ntoh64 (gf_stat->bsize);
+        stat->f_frsize  = ntoh64 (gf_stat->frsize);
+        stat->f_blocks  = ntoh64 (gf_stat->blocks);
+        stat->f_bfree   = ntoh64 (gf_stat->bfree);
+        stat->f_bavail  = ntoh64 (gf_stat->bavail);
+        stat->f_files   = ntoh64 (gf_stat->files);
+        stat->f_ffree   = ntoh64 (gf_stat->ffree);
+        stat->f_favail  = ntoh64 (gf_stat->favail);
+        stat->f_fsid    = ntoh64 (gf_stat->fsid);
+        stat->f_flag    = ntoh64 (gf_stat->flag);
+        stat->f_namemax = ntoh64 (gf_stat->namemax);
+}
+
+
+/* Encode the caller's struct statvfs into *gf_stat, member by member. */
+static inline void
+gf_statfs_from_statfs (struct gf_statfs *gf_stat, struct statvfs *stat)
+{
+        gf_stat->bsize   = hton64 (stat->f_bsize);
+        gf_stat->frsize  = hton64 (stat->f_frsize);
+        gf_stat->blocks  = hton64 (stat->f_blocks);
+        gf_stat->bfree   = hton64 (stat->f_bfree);
+        gf_stat->bavail  = hton64 (stat->f_bavail);
+        gf_stat->files   = hton64 (stat->f_files);
+        gf_stat->ffree   = hton64 (stat->f_ffree);
+        gf_stat->favail  = hton64 (stat->f_favail);
+        gf_stat->fsid    = hton64 (stat->f_fsid);
+        gf_stat->flag    = hton64 (stat->f_flag);
+        gf_stat->namemax = hton64 (stat->f_namemax);
+}
+
+
+/* Packed record-lock description: 16-bit type and whence, 64-bit start
+ * and len, 32-bit pid.
+ */
+struct gf_flock {
+        uint16_t type;
+        uint16_t whence;
+        uint64_t start;
+        uint64_t len;
+        uint32_t pid;
+} __attribute__((packed));
+
+
+/* Decode *gf_flock into the caller's struct flock. */
+static inline void
+gf_flock_to_flock (struct gf_flock *gf_flock, struct flock *flock)
+{
+        flock->l_type   = ntoh16 (gf_flock->type);
+        flock->l_whence = ntoh16 (gf_flock->whence);
+        flock->l_start  = ntoh64 (gf_flock->start);
+        
flock->l_len    = ntoh64 (gf_flock->len);
+        flock->l_pid    = ntoh32 (gf_flock->pid);
+}
+
+
+/* Encode the caller's struct flock into *gf_flock: type and whence with
+ * hton16, start and len with hton64, pid with hton32.
+ */
+static inline void
+gf_flock_from_flock (struct gf_flock *gf_flock, struct flock *flock)
+{
+        gf_flock->type   = hton16 (flock->l_type);
+        gf_flock->whence = hton16 (flock->l_whence);
+        gf_flock->start  = hton64 (flock->l_start);
+        gf_flock->len    = hton64 (flock->l_len);
+        gf_flock->pid    = hton32 (flock->l_pid);
+}
+
+
+/* Packed timestamp with 32-bit seconds and 32-bit nanoseconds. */
+struct gf_timespec {
+        uint32_t tv_sec;
+        uint32_t tv_nsec;
+} __attribute__((packed));
+
+
+/* Decode exactly two timestamps (indices 0 and 1) from gf_ts into ts.
+ * Both arrays must therefore hold at least two elements.
+ */
+static inline void
+gf_timespec_to_timespec (struct gf_timespec *gf_ts, struct timespec *ts)
+{
+
+        ts[0].tv_sec  = ntoh32 (gf_ts[0].tv_sec);
+        ts[0].tv_nsec = ntoh32 (gf_ts[0].tv_nsec);
+        ts[1].tv_sec  = ntoh32 (gf_ts[1].tv_sec);
+        ts[1].tv_nsec = ntoh32 (gf_ts[1].tv_nsec);
+}
+
+
+/* Encode exactly two timestamps (indices 0 and 1) from ts into gf_ts.
+ * The values are narrowed to the 32-bit members of struct gf_timespec.
+ */
+static inline void
+gf_timespec_from_timespec (struct gf_timespec *gf_ts, struct timespec *ts)
+{
+        gf_ts[0].tv_sec  = hton32 (ts[0].tv_sec);
+        gf_ts[0].tv_nsec = hton32 (ts[0].tv_nsec);
+        gf_ts[1].tv_sec  = hton32 (ts[1].tv_sec);
+        gf_ts[1].tv_nsec = hton32 (ts[1].tv_nsec);
+}
+
+
+/* GF_O_* values (octal) that the XLATE_ and UNXLATE_ macros below map the
+ * host's O_* open flags to and from.
+ */
+#define GF_O_ACCMODE           003
+#define GF_O_RDONLY             00
+#define GF_O_WRONLY             01
+#define GF_O_RDWR               02
+#define GF_O_CREAT            0100
+#define GF_O_EXCL             0200
+#define GF_O_NOCTTY           0400
+#define GF_O_TRUNC           01000
+#define GF_O_APPEND          02000
+#define GF_O_NONBLOCK        04000
+#define GF_O_SYNC           010000
+#define GF_O_ASYNC          020000
+
+#define GF_O_DIRECT         040000
+#define GF_O_DIRECTORY     0200000
+#define GF_O_NOFOLLOW      0400000
+#define GF_O_NOATIME      01000000
+#define GF_O_CLOEXEC      02000000
+
+#define GF_O_LARGEFILE     0100000
+
+/* If host flag 'bit' is set in 'from', set GF_<bit> in 'to'.  'bit' must
+ * be the bare O_* name because it is token-pasted onto the GF_ prefix. */
+#define XLATE_BIT(from, to, bit)    do {                \
+                if (from & bit)                         \
+                        to = to | GF_##bit;             \
+        } while (0)
+
+/* Reverse of XLATE_BIT: if GF_<bit> is set in 'from', set 'bit' in 'to'. */
+#define UNXLATE_BIT(from, to, bit)  do {                \
+                if (from & GF_##bit)                    \
+                        to = to | bit;                  \
+        } while (0)
+
+/* Map the host access mode selected by O_ACCMODE onto GF_O_RDONLY,
+ * GF_O_WRONLY or GF_O_RDWR; any other value leaves 'to' untouched. */
+#define XLATE_ACCESSMODE(from, to) do {                 \
+                switch (from & O_ACCMODE) {             \
+                case O_RDONLY: to |= GF_O_RDONLY;       \
+                        break;                          \
+                case O_WRONLY: to |= GF_O_WRONLY;       \
+                        break;                          \
+                case O_RDWR: to |= GF_O_RDWR;           \
+                        break;                          \
+                }                                       \
+        } while (0)
+
+/* Map the GF_O_* access mode selected by GF_O_ACCMODE back onto the
+ * host's O_RDONLY, O_WRONLY or O_RDWR; other values leave 'to' untouched. */
+#define UNXLATE_ACCESSMODE(from, to) do {               \
+                switch (from & GF_O_ACCMODE) {          \
+                case GF_O_RDONLY: to |= O_RDONLY;       \
+                        break;                          \
+                case GF_O_WRONLY: to |= O_WRONLY;       \
+                        break;                          \
+                case GF_O_RDWR: to |= O_RDWR;           \
+                        break;                          \
+                }                                       \
+        } while (0)
+
+/* Translate host open flags into GF_O_* flags.
+ *
+ * The access mode is mapped first, then each O_* bit listed below.
+ * O_NOATIME and O_CLOEXEC are handled only when the host defines them.
+ * Host bits that are not listed are dropped, because the result starts
+ * at 0 and only listed bits are ever set.
+ */
+static inline uint32_t
+gf_flags_from_flags (uint32_t flags)
+{
+        uint32_t gf_flags = 0;
+
+        XLATE_ACCESSMODE (flags, gf_flags);
+
+        XLATE_BIT (flags, gf_flags, O_CREAT);
+        XLATE_BIT (flags, gf_flags, O_EXCL);
+        XLATE_BIT (flags, gf_flags, O_NOCTTY);
+        XLATE_BIT (flags, gf_flags, O_TRUNC);
+        XLATE_BIT (flags, gf_flags, O_APPEND);
+        XLATE_BIT (flags, gf_flags, O_NONBLOCK);
+        XLATE_BIT (flags, gf_flags, O_SYNC);
+        XLATE_BIT (flags, gf_flags, O_ASYNC);
+
+        XLATE_BIT (flags, gf_flags, O_DIRECT);
+        XLATE_BIT (flags, gf_flags, O_DIRECTORY);
+        XLATE_BIT (flags, gf_flags, O_NOFOLLOW);
+#ifdef O_NOATIME
+        XLATE_BIT (flags, gf_flags, O_NOATIME);
+#endif
+#ifdef O_CLOEXEC
+        XLATE_BIT (flags, gf_flags, O_CLOEXEC);
+#endif
+        XLATE_BIT (flags, gf_flags, O_LARGEFILE);
+
+        return gf_flags;
+}
+
+/* Translate GF_O_* flags back into host open flags; the exact mirror of
+ * gf_flags_from_flags(), with the same set of bits and the same guards.
+ */
+static inline uint32_t
+gf_flags_to_flags (uint32_t gf_flags)
+{
+        uint32_t flags = 0;
+
+        UNXLATE_ACCESSMODE (gf_flags, flags);
+
+        UNXLATE_BIT (gf_flags, flags, O_CREAT);
+        UNXLATE_BIT (gf_flags, flags, O_EXCL);
+        UNXLATE_BIT (gf_flags, flags, O_NOCTTY);
+        UNXLATE_BIT (gf_flags, flags, O_TRUNC);
+        UNXLATE_BIT (gf_flags, flags, O_APPEND);
+        UNXLATE_BIT (gf_flags, flags, O_NONBLOCK);
+        UNXLATE_BIT (gf_flags, flags, O_SYNC);
+        UNXLATE_BIT (gf_flags, flags, O_ASYNC);
+
+        UNXLATE_BIT (gf_flags, flags, O_DIRECT);
+        UNXLATE_BIT (gf_flags, flags, O_DIRECTORY);
+        UNXLATE_BIT (gf_flags, flags, O_NOFOLLOW);
+#ifdef O_NOATIME
+        UNXLATE_BIT (gf_flags, flags, O_NOATIME);
+#endif
+#ifdef O_CLOEXEC
+        UNXLATE_BIT (gf_flags, flags, O_CLOEXEC);
+#endif
+        UNXLATE_BIT (gf_flags, flags, O_LARGEFILE);
+
+        return flags;
+}
+
+
+/* stat request: fixed members followed by the path string. */
+typedef struct {
+        uint64_t ino;
+        uint64_t gen;
+        char     path[0];     /* NUL terminated */
+} __attribute__((packed)) gf_fop_stat_req_t;
+typedef 
struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_stat_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t size; + char path[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_readlink_req_t; +typedef struct { + struct gf_stat buf; + char path[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_readlink_rsp_t; + + +typedef struct { + uint64_t par; + uint64_t gen; + uint64_t dev; + uint32_t mode; + char path[0]; /* NULL terminated */ + char bname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_mknod_req_t; +typedef struct { + struct gf_stat stat; + struct gf_stat preparent; + struct gf_stat postparent; +} __attribute__((packed)) gf_fop_mknod_rsp_t; + + +typedef struct { + uint64_t par; + uint64_t gen; + uint32_t mode; + char path[0]; /* NULL terminated */ + char bname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_mkdir_req_t; +typedef struct { + struct gf_stat stat; + struct gf_stat preparent; + struct gf_stat postparent; +} __attribute__((packed)) gf_fop_mkdir_rsp_t; + + +typedef struct { + uint64_t par; + uint64_t gen; + char path[0]; /* NULL terminated */ + char bname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_unlink_req_t; +typedef struct { + struct gf_stat preparent; + struct gf_stat postparent; +} __attribute__((packed)) gf_fop_unlink_rsp_t; + + +typedef struct { + uint64_t par; + uint64_t gen; + char path[0]; + char bname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_rmdir_req_t; +typedef struct { + struct gf_stat preparent; + struct gf_stat postparent; +} __attribute__((packed)) gf_fop_rmdir_rsp_t; + + +typedef struct { + uint64_t par; + uint64_t gen; + char path[0]; + char bname[0]; + char linkname[0]; +} __attribute__((packed)) gf_fop_symlink_req_t; +typedef struct { + struct gf_stat stat; + struct gf_stat preparent; + struct gf_stat postparent; +}__attribute__((packed)) gf_fop_symlink_rsp_t; + + +typedef struct { + uint64_t oldpar; + uint64_t 
oldgen; + uint64_t newpar; + uint64_t newgen; + char oldpath[0]; + char oldbname[0]; /* NULL terminated */ + char newpath[0]; + char newbname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_rename_req_t; +typedef struct { + struct gf_stat stat; + struct gf_stat preoldparent; + struct gf_stat postoldparent; + struct gf_stat prenewparent; + struct gf_stat postnewparent; +} __attribute__((packed)) gf_fop_rename_rsp_t; + + +typedef struct { + uint64_t oldino; + uint64_t oldgen; + uint64_t newpar; + uint64_t newgen; + char oldpath[0]; + char newpath[0]; + char newbname[0]; +}__attribute__((packed)) gf_fop_link_req_t; +typedef struct { + struct gf_stat stat; + struct gf_stat preparent; + struct gf_stat postparent; +} __attribute__((packed)) gf_fop_link_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t gen; + uint64_t offset; + char path[0]; +} __attribute__((packed)) gf_fop_truncate_req_t; +typedef struct { + struct gf_stat prestat; + struct gf_stat poststat; +} __attribute__((packed)) gf_fop_truncate_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t flags; + uint32_t wbflags; + char path[0]; +} __attribute__((packed)) gf_fop_open_req_t; +typedef struct { + int64_t fd; +} __attribute__((packed)) gf_fop_open_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint64_t offset; + uint32_t size; +} __attribute__((packed)) gf_fop_read_req_t; +typedef struct { + struct gf_stat stat; + char buf[0]; +} __attribute__((packed)) gf_fop_read_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint64_t offset; + uint32_t size; +} __attribute__((packed)) gf_fop_write_req_t; +typedef struct { + struct gf_stat prestat; + struct gf_stat poststat; +} __attribute__((packed)) gf_fop_write_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + char path[0]; +} __attribute__((packed)) gf_fop_statfs_req_t; +typedef struct { + struct gf_statfs statfs; +} __attribute__((packed)) gf_fop_statfs_rsp_t; + 
+ +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; +} __attribute__((packed)) gf_fop_flush_req_t; +typedef struct { } __attribute__((packed)) gf_fop_flush_rsp_t; + + +typedef struct fsync_req { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint32_t data; +} __attribute__((packed)) gf_fop_fsync_req_t; +typedef struct { + struct gf_stat prestat; + struct gf_stat poststat; +} __attribute__((packed)) gf_fop_fsync_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t flags; + uint32_t dict_len; + char dict[0]; + char path[0]; +} __attribute__((packed)) gf_fop_setxattr_req_t; +typedef struct { } __attribute__((packed)) gf_fop_setxattr_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint32_t flags; + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_fsetxattr_req_t; +typedef struct { } __attribute__((packed)) gf_fop_fsetxattr_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t flags; + uint32_t dict_len; + char dict[0]; + char path[0]; +} __attribute__((packed)) gf_fop_xattrop_req_t; + +typedef struct { + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_xattrop_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint32_t flags; + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_fxattrop_req_t; + +typedef struct { + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_fxattrop_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t namelen; + char path[0]; + char name[0]; +} __attribute__((packed)) gf_fop_getxattr_req_t; +typedef struct { + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_getxattr_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint32_t namelen; + char name[0]; +} __attribute__((packed)) gf_fop_fgetxattr_req_t; +typedef struct { + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) 
gf_fop_fgetxattr_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + char path[0]; + char name[0]; +} __attribute__((packed)) gf_fop_removexattr_req_t; +typedef struct { } __attribute__((packed)) gf_fop_removexattr_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + char path[0]; +} __attribute__((packed)) gf_fop_opendir_req_t; +typedef struct { + int64_t fd; +} __attribute__((packed)) gf_fop_opendir_rsp_t; + + +typedef struct fsyncdir_req { + uint64_t ino; + uint64_t gen; + int64_t fd; + int32_t data; +} __attribute__((packed)) gf_fop_fsyncdir_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_fsyncdir_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint64_t offset; + uint32_t size; +} __attribute__((packed)) gf_fop_readdir_req_t; +typedef struct { + uint32_t size; + char buf[0]; +} __attribute__((packed)) gf_fop_readdir_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint64_t offset; + uint32_t size; +} __attribute__((packed)) gf_fop_readdirp_req_t; +typedef struct { + uint32_t size; + char buf[0]; +} __attribute__((packed)) gf_fop_readdirp_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t mask; + char path[0]; +} __attribute__((packed)) gf_fop_access_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_access_rsp_t; + + +typedef struct { + uint64_t par; + uint64_t gen; + uint32_t flags; + uint32_t mode; + char path[0]; + char bname[0]; +} __attribute__((packed)) gf_fop_create_req_t; +typedef struct { + struct gf_stat stat; + uint64_t fd; + struct gf_stat preparent; + struct gf_stat postparent; +} __attribute__((packed)) gf_fop_create_rsp_t; + + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint64_t offset; +} __attribute__((packed)) gf_fop_ftruncate_req_t; +typedef struct { + struct gf_stat prestat; + struct gf_stat poststat; +} __attribute__((packed)) gf_fop_ftruncate_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t 
gen; + int64_t fd; +} __attribute__((packed)) gf_fop_fstat_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_fstat_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint32_t cmd; + uint32_t type; + struct gf_flock flock; +} __attribute__((packed)) gf_fop_lk_req_t; +typedef struct { + struct gf_flock flock; +} __attribute__((packed)) gf_fop_lk_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t cmd; + uint32_t type; + struct gf_flock flock; + char path[0]; + char volume[0]; +} __attribute__((packed)) gf_fop_inodelk_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_inodelk_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint32_t cmd; + uint32_t type; + struct gf_flock flock; + char volume[0]; +} __attribute__((packed)) gf_fop_finodelk_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_finodelk_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t cmd; + uint32_t type; + uint64_t namelen; + char path[0]; + char name[0]; + char volume[0]; +} __attribute__((packed)) gf_fop_entrylk_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_entrylk_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; + uint32_t cmd; + uint32_t type; + uint64_t namelen; + char name[0]; + char volume[0]; +} __attribute__((packed)) gf_fop_fentrylk_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_fentrylk_rsp_t; + +typedef struct { + uint64_t ino; /* NOTE: used only in case of 'root' lookup */ + uint64_t par; + uint64_t gen; + uint32_t flags; + uint32_t dictlen; + char path[0]; + char bname[0]; + char dict[0]; +} __attribute__((packed)) gf_fop_lookup_req_t; +typedef struct { + struct gf_stat stat; + struct gf_stat postparent; + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_lookup_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t gen; + uint32_t flag; + char path[0]; +} __attribute__((packed)) 
gf_fop_checksum_req_t; +typedef struct { + unsigned char fchecksum[0]; + unsigned char dchecksum[0]; +} __attribute__((packed)) gf_fop_checksum_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t gen; + struct gf_stat stbuf; + int32_t valid; + char path[0]; +} __attribute__((packed)) gf_fop_setattr_req_t; +typedef struct { + struct gf_stat statpre; + struct gf_stat statpost; +} __attribute__((packed)) gf_fop_setattr_rsp_t; + +typedef struct { + int64_t fd; + struct gf_stat stbuf; + int32_t valid; +} __attribute__((packed)) gf_fop_fsetattr_req_t; +typedef struct { + struct gf_stat statpre; + struct gf_stat statpost; +} __attribute__((packed)) gf_fop_fsetattr_rsp_t; + +typedef struct { + int64_t fd; + uint64_t offset; + uint32_t len; +} __attribute__((packed)) gf_fop_rchecksum_req_t; +typedef struct { + uint32_t weak_checksum; + unsigned char strong_checksum[0]; +} __attribute__((packed)) gf_fop_rchecksum_rsp_t; + +typedef struct { + uint32_t flags; + uint32_t keylen; + char key[0]; +} __attribute__((packed)) gf_mop_getspec_req_t; +typedef struct { + char spec[0]; +} __attribute__((packed)) gf_mop_getspec_rsp_t; + + +typedef struct { + uint32_t msglen; + char msg[0]; +} __attribute__((packed)) gf_mop_log_req_t; +typedef struct { +} __attribute__((packed)) gf_mop_log_rsp_t; + + +typedef struct { + uint32_t dict_len; + char buf[0]; +} __attribute__((packed)) gf_mop_setvolume_req_t; +typedef struct { + uint32_t dict_len; + char buf[0]; +} __attribute__((packed)) gf_mop_setvolume_rsp_t; + + +typedef struct { +} __attribute__((packed)) gf_mop_ping_req_t; +typedef struct { +} __attribute__((packed)) gf_mop_ping_rsp_t; + +typedef struct { + uint32_t flags; + char buf[0]; +} __attribute__((packed)) gf_mop_notify_req_t; +typedef struct { + uint32_t flags; + char buf[0]; +} __attribute__((packed)) gf_mop_notify_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t gen; + int64_t fd; +} __attribute__((packed)) gf_cbk_releasedir_req_t; +typedef struct { +} 
__attribute__((packed)) gf_cbk_releasedir_rsp_t;
+
+
+typedef struct {
+        uint64_t ino;
+        uint64_t gen;
+        int64_t  fd;
+} __attribute__((packed)) gf_cbk_release_req_t;
+typedef struct {
+} __attribute__((packed)) gf_cbk_release_rsp_t;
+
+
+typedef struct {
+        uint32_t count;
+        uint64_t ino_array[0];
+} __attribute__((packed)) gf_cbk_forget_req_t;
+typedef struct { } __attribute__((packed)) gf_cbk_forget_rsp_t;
+
+
+typedef struct {
+        uint32_t pid;
+        uint32_t uid;
+        uint32_t gid;
+
+        /* Number of groups being sent through the array below. */
+        uint32_t ngrps;
+
+        /* Array of groups to which the uid belongs apart from the primary group
+         * in gid.
+         */
+        uint32_t groups[GF_REQUEST_MAXGROUPS];
+
+        uint64_t lk_owner;
+} __attribute__ ((packed)) gf_hdr_req_t;
+
+
+typedef struct {
+        uint32_t op_ret;
+        uint32_t op_errno;
+} __attribute__ ((packed)) gf_hdr_rsp_t;
+
+
+/* Common header: call id, type, op, payload size, then either the request
+ * or the response sub-header (anonymous union).
+ */
+typedef struct {
+        uint64_t callid;
+        uint32_t type;
+        uint32_t op;
+        uint32_t size;
+        union {
+                gf_hdr_req_t req;
+                gf_hdr_rsp_t rsp;
+        } __attribute__ ((packed));
+} __attribute__ ((packed)) gf_hdr_common_t;
+
+
+/* Allocate a zeroed common header followed by 'size' payload bytes.
+ *
+ * hdr->size is set to hton32 (size).  Returns NULL when the allocation
+ * fails.  Release the result with GF_FREE.
+ */
+static inline gf_hdr_common_t *
+__gf_hdr_new (int size)
+{
+        gf_hdr_common_t *hdr = NULL;
+
+        /* TODO: use mem-pool */
+        /* (count, size) order, matching the other GF_CALLOC calls. */
+        hdr = GF_CALLOC (1, sizeof (gf_hdr_common_t) + size,
+                         gf_common_mt_gf_hdr_common_t);
+
+        if (!hdr) {
+                return NULL;
+        }
+
+        hdr->size = hton32 (size);
+
+        return hdr;
+}
+
+
+/* 'type' is a pointer expression whose pointee is the fop/mop structure;
+ * 'x' is the number of extra trailing bytes.  Arguments are parenthesised
+ * so that compound expressions expand correctly.
+ */
+#define gf_hdr_len(type, x) (sizeof (gf_hdr_common_t) + sizeof (*(type)) + (x))
+#define gf_hdr_new(type, x) __gf_hdr_new (sizeof (*(type)) + (x))
+
+
+/* Return the address of the first byte after the common header. */
+static inline void *
+gf_param (gf_hdr_common_t *hdr)
+{
+        /* char * arithmetic: arithmetic on void * is a GNU extension. */
+        return ((char *)hdr) + sizeof (*hdr);
+}
+
+
+/* Serialized directory entry: fixed members followed by the name string. */
+struct gf_dirent_nb {
+        uint64_t       d_ino;
+        uint64_t       d_off;
+        uint32_t       d_len;
+        uint32_t       d_type;
+        struct gf_stat d_stat;
+        char           d_name[0];
+} __attribute__((packed));
+
+
+/* Bytes one entry occupies when serialized: the fixed part plus the name
+ * and its terminating NUL.  Only 'entries' itself is measured.
+ */
+static inline int
+gf_dirent_nb_size (gf_dirent_t *entries)
+{
+        return (sizeof (struct gf_dirent_nb) + strlen (entries->d_name) + 1);
+}
+
+static inline int
+gf_dirent_serialize (gf_dirent_t 
*entries, char *buf, size_t buf_size)
+{
+        struct gf_dirent_nb *entry_nb = NULL;
+        gf_dirent_t         *entry = NULL;
+        int                  size = 0;
+        int                  entry_size = 0;
+
+
+        /* Walk the list hanging off 'entries'.  An entry is written only
+         * when buf is non-NULL and the entry still fits in buf_size; its
+         * size is added to the total either way, so the return value is
+         * the space needed for the whole list. */
+        list_for_each_entry (entry, &entries->list, list) {
+                entry_size = gf_dirent_nb_size (entry);
+
+                if (buf && (size + entry_size <= buf_size)) {
+                        entry_nb = (void *) (buf + size);
+
+                        entry_nb->d_ino  = hton64 (entry->d_ino);
+                        entry_nb->d_off  = hton64 (entry->d_off);
+                        entry_nb->d_len  = hton32 (entry->d_len);
+                        entry_nb->d_type = hton32 (entry->d_type);
+
+                        gf_stat_from_iatt (&entry_nb->d_stat, &entry->d_stat);
+
+                        strcpy (entry_nb->d_name, entry->d_name);
+                }
+                size += entry_size;
+        }
+
+        return size;
+}
+
+
+/* Parse serialized entries from buf (buf_size bytes) and append a newly
+ * allocated gf_dirent_t for each one to entries->list.
+ *
+ * Parsing stops at the first entry whose name is not NUL-terminated
+ * inside the buffer, when an allocation fails, or when fewer bytes remain
+ * than the smallest possible entry.  Returns the number of entries added.
+ */
+static inline int
+gf_dirent_unserialize (gf_dirent_t *entries, const char *buf, size_t buf_size)
+{
+        struct gf_dirent_nb *entry_nb = NULL;
+        int                  remaining_size = 0;
+        int                  least_dirent_size = 0;
+        int                  count = 0;
+        gf_dirent_t         *entry = NULL;
+        int                  entry_strlen = 0;
+        int                  entry_len = 0;
+        int                  name_max = 0;
+
+
+        remaining_size = buf_size;
+        least_dirent_size = (sizeof (struct gf_dirent_nb) + 2);
+
+        while (remaining_size >= least_dirent_size) {
+                entry_nb = (void *)(buf + (buf_size - remaining_size));
+
+                /* d_name starts sizeof (*entry_nb) bytes into the remaining
+                 * region, so only the bytes after the fixed part may be
+                 * scanned.  Bounding by remaining_size would let strnlen
+                 * (and the strcpy below) read past the end of buf.  The
+                 * loop condition guarantees name_max >= 2. */
+                name_max = remaining_size - sizeof (*entry_nb);
+
+                entry_strlen = strnlen (entry_nb->d_name, name_max);
+                if (entry_strlen == name_max) {
+                        /* no terminating NUL inside the buffer */
+                        break;
+                }
+
+                entry_len = sizeof (gf_dirent_t) + entry_strlen + 1;
+                entry = GF_CALLOC (1, entry_len, gf_common_mt_gf_dirent_t);
+                if (!entry) {
+                        break;
+                }
+
+                entry->d_ino  = ntoh64 (entry_nb->d_ino);
+                entry->d_off  = ntoh64 (entry_nb->d_off);
+                entry->d_len  = ntoh32 (entry_nb->d_len);
+                entry->d_type = ntoh32 (entry_nb->d_type);
+
+                gf_stat_to_iatt (&entry_nb->d_stat, &entry->d_stat);
+
+                strcpy (entry->d_name, entry_nb->d_name);
+
+                list_add_tail (&entry->list, &entries->list);
+
+                remaining_size -= (sizeof (*entry_nb) + entry_strlen + 1);
+                count++;
+        }
+
+        return count;
+}
+
+#endif
diff --git a/xlators/protocol/lib/src/transport.c b/xlators/protocol/lib/src/transport.c
new file mode 100644
index 00000000..d460d020
--- 
/dev/null +++ b/xlators/protocol/lib/src/transport.c @@ -0,0 +1,422 @@ +/* + Copyright (c) 2006-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include +#include +#include +#include +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "logging.h" +#include "transport.h" +#include "glusterfs.h" +#include "xlator.h" +#include "list.h" + + +transport_t * +transport_load (dict_t *options, + xlator_t *xl) +{ + struct transport *trans = NULL, *return_trans = NULL; + char *name = NULL; + void *handle = NULL; + char *type = NULL; + char str[] = "ERROR"; + int32_t ret = -1; + int8_t is_tcp = 0, is_unix = 0, is_ibsdp = 0; + volume_opt_list_t *vol_opt = NULL; + + GF_VALIDATE_OR_GOTO("transport", options, fail); + GF_VALIDATE_OR_GOTO("transport", xl, fail); + + trans = GF_CALLOC (1, sizeof (struct transport), + gf_common_mt_transport); + GF_VALIDATE_OR_GOTO("transport", trans, fail); + + trans->xl = xl; + type = str; + + /* Backward compatibility */ + ret = dict_get_str (options, "transport-type", &type); + if (ret < 0) { + ret = dict_set_str (options, "transport-type", "socket"); + if (ret < 0) + gf_log ("dict", GF_LOG_DEBUG, + "setting transport-type failed"); + gf_log ("transport", GF_LOG_WARNING, + "missing 'option transport-type'. 
defaulting to "
+                        "\"socket\"");
+        } else {
+                {
+                        /* Backward compatibility: strip a trailing
+                         * * /client or * /server from the type by cutting
+                         * the string at the first '/'.
+                         */
+                        char *tmp = strchr (type, '/');
+                        if (tmp)
+                                *tmp = '\0';
+                }
+
+                /* tcp, unix and ib-sdp are all served by the "socket"
+                 * transport; unix and ib-sdp additionally select the
+                 * address family. */
+                is_tcp = strcmp (type, "tcp");
+                is_unix = strcmp (type, "unix");
+                is_ibsdp = strcmp (type, "ib-sdp");
+                if ((is_tcp == 0) ||
+                    (is_unix == 0) ||
+                    (is_ibsdp == 0)) {
+                        if (is_unix == 0)
+                                ret = dict_set_str (options,
+                                                    "transport.address-family",
+                                                    "unix");
+                        if (is_ibsdp == 0)
+                                ret = dict_set_str (options,
+                                                    "transport.address-family",
+                                                    "inet-sdp");
+
+                        if (ret < 0)
+                                gf_log ("dict", GF_LOG_DEBUG,
+                                        "setting address-family failed");
+
+                        ret = dict_set_str (options,
+                                            "transport-type", "socket");
+                        if (ret < 0)
+                                gf_log ("dict", GF_LOG_DEBUG,
+                                        "setting transport-type failed");
+                }
+        }
+
+        /* Re-read the type: it may have been rewritten above. */
+        ret = dict_get_str (options, "transport-type", &type);
+        if (ret < 0) {
+                GF_FREE (trans);
+                gf_log ("transport", GF_LOG_ERROR,
+                        "'option transport-type ' missing in volume '%s'",
+                        xl->name);
+                goto fail;
+        }
+
+        ret = gf_asprintf (&name, "%s/%s.so", TRANSPORTDIR, type);
+        if (-1 == ret) {
+                gf_log ("transport", GF_LOG_ERROR, "asprintf failed");
+                /* every other failure path releases trans; without this
+                 * the freshly allocated transport leaked here */
+                GF_FREE (trans);
+                goto fail;
+        }
+        gf_log ("transport", GF_LOG_DEBUG,
+                "attempt to load file %s", name);
+
+        handle = dlopen (name, RTLD_NOW|RTLD_GLOBAL);
+        if (handle == NULL) {
+                gf_log ("transport", GF_LOG_ERROR, "%s", dlerror ());
+                gf_log ("transport", GF_LOG_ERROR,
+                        "volume '%s': transport-type '%s' is not valid or "
+                        "not found on this machine",
+                        xl->name, type);
+                GF_FREE (name);
+                GF_FREE (trans);
+                goto fail;
+        }
+        GF_FREE (name);
+
+        /* The module must export "tops", "init" and "fini". */
+        trans->ops = dlsym (handle, "tops");
+        if (trans->ops == NULL) {
+                gf_log ("transport", GF_LOG_ERROR,
+                        "dlsym (transport_ops) on %s", dlerror ());
+                GF_FREE (trans);
+                goto fail;
+        }
+
+        trans->init = dlsym (handle, "init");
+        if (trans->init == NULL) {
+                gf_log ("transport", GF_LOG_ERROR,
+                        "dlsym (gf_transport_init) on %s", dlerror ());
+                GF_FREE (trans);
+                goto fail;
+        }
+
+        trans->fini = dlsym (handle, "fini");
+        if 
(trans->fini == NULL) {
+                gf_log ("transport", GF_LOG_ERROR,
+                        "dlsym (gf_transport_fini) on %s", dlerror ());
+                GF_FREE (trans);
+                goto fail;
+        }
+
+        vol_opt = GF_CALLOC (1, sizeof (volume_opt_list_t),
+                             gf_common_mt_volume_opt_list_t);
+        if (vol_opt == NULL) {
+                /* previously dereferenced unconditionally on the next line */
+                gf_log ("transport", GF_LOG_ERROR, "out of memory");
+                GF_FREE (trans);
+                goto fail;
+        }
+        vol_opt->given_opt = dlsym (handle, "options");
+        if (vol_opt->given_opt == NULL) {
+                gf_log ("transport", GF_LOG_DEBUG,
+                        "volume option validation not specified");
+                /* never linked into xl->volume_options, so nothing else
+                 * would ever release it */
+                GF_FREE (vol_opt);
+                vol_opt = NULL;
+        } else {
+                list_add_tail (&vol_opt->list, &xl->volume_options);
+                if (-1 ==
+                    validate_xlator_volume_options (xl,
+                                                    vol_opt->given_opt)) {
+                        gf_log ("transport", GF_LOG_ERROR,
+                                "volume option validation failed");
+                        GF_FREE (trans);
+                        goto fail;
+                }
+        }
+
+        ret = trans->init (trans);
+        if (ret != 0) {
+                gf_log ("transport", GF_LOG_ERROR,
+                        "'%s' initialization failed", type);
+                GF_FREE (trans);
+                goto fail;
+        }
+
+        pthread_mutex_init (&trans->lock, NULL);
+        return_trans = trans;
+fail:
+        return return_trans;
+}
+
+
+/* Send a header (buf/len) and an optional iovec payload.
+ *
+ * When a peer transport is attached, the message is queued on the peer's
+ * handover list and its worker is woken; the result is then 0, or -ENOMEM
+ * if the message or its iobuf cannot be allocated.  Otherwise the call is
+ * forwarded to this->ops->submit.  Returns -1 when 'this' or its ops table
+ * is NULL.
+ */
+int32_t
+transport_submit (transport_t *this, char *buf, int32_t len,
+                  struct iovec *vector, int count,
+                  struct iobref *iobref)
+{
+        int32_t               ret = -1;
+        transport_t          *peer_trans = NULL;
+        struct iobuf         *iobuf = NULL;
+        struct transport_msg *msg = NULL;
+
+        /* must precede the first dereference of 'this' below */
+        GF_VALIDATE_OR_GOTO("transport", this, fail);
+
+        if (this->peer_trans) {
+                peer_trans = this->peer_trans;
+
+                msg = GF_CALLOC (1, sizeof (*msg),
+                                 gf_common_mt_transport_msg);
+                if (!msg) {
+                        return -ENOMEM;
+                }
+
+                msg->hdr = buf;
+                msg->hdrlen = len;
+
+                if (vector) {
+                        iobuf = iobuf_get (this->xl->ctx->iobuf_pool);
+                        if (!iobuf) {
+                                /* NOTE(review): this frees the caller's
+                                 * buf -- confirm ownership with callers */
+                                GF_FREE (msg->hdr);
+                                GF_FREE (msg);
+                                return -ENOMEM;
+                        }
+
+                        iov_unload (iobuf->ptr, vector, count);
+                        msg->iobuf = iobuf;
+                }
+
+                pthread_mutex_lock (&peer_trans->handover.mutex);
+                {
+                        list_add_tail (&msg->list, &peer_trans->handover.msgs);
+                        pthread_cond_broadcast (&peer_trans->handover.cond);
+                }
+                pthread_mutex_unlock (&peer_trans->handover.mutex);
+
+                return 0;
+        }
+
+        GF_VALIDATE_OR_GOTO("transport", this->ops, fail);
+
+        ret = this->ops->submit (this, buf, len, 
vector, count, iobref);
+fail:
+        return ret;
+}
+
+
+/* Forward to ops->connect; -1 when 'this' or its ops table is NULL. */
+int32_t
+transport_connect (transport_t *this)
+{
+        int ret = -1;
+
+        GF_VALIDATE_OR_GOTO("transport", this, fail);
+        GF_VALIDATE_OR_GOTO("transport", this->ops, fail);
+
+        ret = this->ops->connect (this);
+fail:
+        return ret;
+}
+
+
+/* Forward to ops->listen; -1 when 'this' or its ops table is NULL. */
+int32_t
+transport_listen (transport_t *this)
+{
+        int ret = -1;
+
+        GF_VALIDATE_OR_GOTO("transport", this, fail);
+        GF_VALIDATE_OR_GOTO("transport", this->ops, fail);
+
+        ret = this->ops->listen (this);
+fail:
+        return ret;
+}
+
+
+/* Forward to ops->disconnect; -1 when 'this' or its ops table is NULL. */
+int32_t
+transport_disconnect (transport_t *this)
+{
+        int32_t ret = -1;
+
+        GF_VALIDATE_OR_GOTO("transport", this, fail);
+        GF_VALIDATE_OR_GOTO("transport", this->ops, fail);
+
+        ret = this->ops->disconnect (this);
+fail:
+        return ret;
+}
+
+
+/* Run the module's fini hook (if set), destroy the lock and free the
+ * transport.  Returns 0 on success and -1 when 'this' is NULL.
+ */
+int32_t
+transport_destroy (transport_t *this)
+{
+        int32_t ret = -1;
+
+        GF_VALIDATE_OR_GOTO("transport", this, fail);
+
+        if (this->fini)
+                this->fini (this);
+
+        pthread_mutex_destroy (&this->lock);
+        GF_FREE (this);
+
+        /* previously ret was never updated, so success also returned -1 */
+        ret = 0;
+fail:
+        return ret;
+}
+
+
+/* Take one reference under this->lock; returns 'this', or NULL when
+ * 'this' is NULL.
+ */
+transport_t *
+transport_ref (transport_t *this)
+{
+        transport_t *return_this = NULL;
+
+        GF_VALIDATE_OR_GOTO("transport", this, fail);
+
+        pthread_mutex_lock (&this->lock);
+        {
+                this->refcount ++;
+        }
+        pthread_mutex_unlock (&this->lock);
+
+        return_this = this;
+fail:
+        return return_this;
+}
+
+
+/* Fetch one message.  With a peer transport attached, the header, header
+ * length and iobuf are taken from handover.msg and 0 is returned;
+ * otherwise the call is forwarded to ops->receive.
+ */
+int32_t
+transport_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p,
+                   struct iobuf **iobuf_p)
+{
+        int32_t ret = -1;
+
+        GF_VALIDATE_OR_GOTO("transport", this, fail);
+
+        if (this->peer_trans) {
+                *hdr_p = this->handover.msg->hdr;
+                *hdrlen_p = this->handover.msg->hdrlen;
+                *iobuf_p = this->handover.msg->iobuf;
+
+                return 0;
+        }
+
+        GF_VALIDATE_OR_GOTO("transport", this->ops, fail);
+
+        ret = this->ops->receive (this, hdr_p, hdrlen_p, iobuf_p);
+fail:
+        return ret;
+}
+
+
+/* Drop one reference under this->lock.  When the count reaches zero the
+ * owning xlator is notified with GF_EVENT_TRANSPORT_CLEANUP and the
+ * transport is destroyed.  Returns 0, or -1 when 'this' is NULL.
+ */
+int32_t
+transport_unref (transport_t *this)
+{
+        int32_t refcount = 0;
+        int32_t ret = -1;
+
+        GF_VALIDATE_OR_GOTO("transport", this, fail);
+
+        pthread_mutex_lock (&this->lock);
+        {
+                refcount = --this->refcount;
+        }
+        pthread_mutex_unlock (&this->lock);
+
+        if (refcount == 0) {
+                xlator_notify (this->xl, GF_EVENT_TRANSPORT_CLEANUP, this);
+                transport_destroy (this);
+        }
+
+        ret = 0;
+fail:
+        return 
ret;
+}
+
+
+/* Thread body for one side of a peer pair (started by
+ * transport_setpeer()).  trans_data is the transport_t this thread serves.
+ *
+ * Loops forever: waits on handover.cond until handover.msgs is non-empty,
+ * unlinks the first message under handover.mutex, publishes it through
+ * trans->handover.msg, notifies the xlator with GF_EVENT_POLLIN and then
+ * frees the message structure.  The loop has no exit, so the function
+ * never returns.
+ */
+void *
+transport_peerproc (void *trans_data)
+{
+        transport_t          *trans = NULL;
+        struct transport_msg *msg = NULL;
+
+        trans = trans_data;
+
+        while (1) {
+                pthread_mutex_lock (&trans->handover.mutex);
+                {
+                        /* re-check after every wake-up */
+                        while (list_empty (&trans->handover.msgs))
+                                pthread_cond_wait (&trans->handover.cond,
+                                                   &trans->handover.mutex);
+
+                        msg = list_entry (trans->handover.msgs.next,
+                                          struct transport_msg, list);
+
+                        list_del_init (&msg->list);
+                }
+                pthread_mutex_unlock (&trans->handover.mutex);
+
+                /* transport_receive() reads the message from here */
+                trans->handover.msg = msg;
+
+                xlator_notify (trans->xl, GF_EVENT_POLLIN, trans);
+
+                /* only the message struct is freed here; msg->hdr and
+                 * msg->iobuf are not released in this function */
+                GF_FREE (msg);
+        }
+}
+
+
+/* Couple two transports so that each delivers to the other in-process.
+ *
+ * Each side takes a reference on the other, initialises its handover
+ * list, condition variable and mutex, and starts a transport_peerproc
+ * thread.  The pthread_* return values are not checked; the function
+ * always returns 0.
+ * NOTE(review): the first thread is created before the second side's
+ * handover state is initialised -- confirm nothing submits that early.
+ */
+int
+transport_setpeer (transport_t *trans, transport_t *peer_trans)
+{
+        trans->peer_trans = transport_ref (peer_trans);
+
+        INIT_LIST_HEAD (&trans->handover.msgs);
+        pthread_cond_init (&trans->handover.cond, NULL);
+        pthread_mutex_init (&trans->handover.mutex, NULL);
+        pthread_create (&trans->handover.thread, NULL,
+                        transport_peerproc, trans);
+
+        peer_trans->peer_trans = transport_ref (trans);
+
+        INIT_LIST_HEAD (&peer_trans->handover.msgs);
+        pthread_cond_init (&peer_trans->handover.cond, NULL);
+        pthread_mutex_init (&peer_trans->handover.mutex, NULL);
+        pthread_create (&peer_trans->handover.thread, NULL,
+                        transport_peerproc, peer_trans);
+
+        return 0;
+}
diff --git a/xlators/protocol/lib/src/transport.h b/xlators/protocol/lib/src/transport.h
new file mode 100644
index 00000000..f0623d5b
--- /dev/null
+++ b/xlators/protocol/lib/src/transport.h
@@ -0,0 +1,106 @@
+/*
+   Copyright (c) 2006-2009 Gluster, Inc.
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __TRANSPORT_H__ +#define __TRANSPORT_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include + +struct transport_ops; +typedef struct transport transport_t; + +#include "xlator.h" +#include "dict.h" +#include "compat.h" + +typedef struct peer_info { + struct sockaddr_storage sockaddr; + socklen_t sockaddr_len; + char identifier[UNIX_PATH_MAX]; +}peer_info_t; + +struct transport_msg { + struct list_head list; + char *hdr; + int hdrlen; + struct iobuf *iobuf; +}; + +struct transport { + struct transport_ops *ops; + void *private; + void *xl_private; + pthread_mutex_t lock; + int32_t refcount; + + xlator_t *xl; + void *dnscache; + data_t *buf; + int32_t (*init) (transport_t *this); + void (*fini) (transport_t *this); + /* int (*notify) (transport_t *this, int event, void *data); */ + peer_info_t peerinfo; + peer_info_t myinfo; + + transport_t *peer_trans; + struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + pthread_t thread; + struct list_head msgs; + struct transport_msg *msg; + } handover; + +}; + +struct transport_ops { + int32_t (*receive) (transport_t *this, char **hdr_p, size_t *hdrlen_p, + struct iobuf **iobuf_p); + int32_t (*submit) (transport_t *this, char *buf, int len, + struct iovec *vector, int count, + struct iobref *iobref); + int32_t (*connect) (transport_t *this); + int32_t (*listen) (transport_t *this); + int32_t (*disconnect) (transport_t *this); +}; + + +int32_t transport_listen (transport_t *this); +int32_t transport_connect (transport_t *this); +int32_t transport_disconnect (transport_t *this); +int32_t transport_notify (transport_t *this, int 
event); +int32_t transport_submit (transport_t *this, char *buf, int len, + struct iovec *vector, int count, + struct iobref *iobref); +int32_t transport_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, + struct iobuf **iobuf_p); +int32_t transport_destroy (transport_t *this); + +transport_t *transport_load (dict_t *options, xlator_t *xl); +transport_t *transport_ref (transport_t *trans); +int32_t transport_unref (transport_t *trans); + +int transport_setpeer (transport_t *trans, transport_t *trans_peer); + +#endif /* __TRANSPORT_H__ */ diff --git a/xlators/protocol/server/src/Makefile.am b/xlators/protocol/server/src/Makefile.am index ae93912f..faf82ee2 100644 --- a/xlators/protocol/server/src/Makefile.am +++ b/xlators/protocol/server/src/Makefile.am @@ -5,14 +5,15 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/protocol server_la_LDFLAGS = -module -avoidversion server_la_SOURCES = server-protocol.c server-resolve.c server-helpers.c -server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + $(top_builddir)/xlators/protocol/lib/src/libgfproto.la noinst_HEADERS = server-protocol.h server-helpers.h AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ -DDATADIR=\"$(localstatedir)\" -DCONFDIR=\"$(sysconfdir)/glusterfs\" \ - $(GF_CFLAGS) + $(GF_CFLAGS) -I$(top_srcdir)/xlators/protocol/lib/src CLEANFILES = diff --git a/xlators/protocol/transport/Makefile.am b/xlators/protocol/transport/Makefile.am new file mode 100644 index 00000000..e2f97437 --- /dev/null +++ b/xlators/protocol/transport/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = socket $(IBVERBS_SUBDIR) + +CLEANFILES = diff --git a/xlators/protocol/transport/ib-verbs/Makefile.am b/xlators/protocol/transport/ib-verbs/Makefile.am new file mode 100644 index 00000000..f963effe --- /dev/null +++ 
b/xlators/protocol/transport/ib-verbs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src \ No newline at end of file diff --git a/xlators/protocol/transport/ib-verbs/src/Makefile.am b/xlators/protocol/transport/ib-verbs/src/Makefile.am new file mode 100644 index 00000000..8f6e6a35 --- /dev/null +++ b/xlators/protocol/transport/ib-verbs/src/Makefile.am @@ -0,0 +1,19 @@ +# TODO : need to change transportdir + +transport_LTLIBRARIES = ib-verbs.la +transportdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/transport + +ib_verbs_la_LDFLAGS = -module -avoidversion + +ib_verbs_la_SOURCES = ib-verbs.c name.c +ib_verbs_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + -libverbs $(top_builddir)/xlators/protocol/lib/src/libgfproto.la + +noinst_HEADERS = ib-verbs.h name.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ + -I$(top_srcdir)/xlators/protocol/transport/ib-verbs \ + -I$(top_srcdir)/xlators/protocol/lib/src + +CLEANFILES = *~ diff --git a/xlators/protocol/transport/ib-verbs/src/ib-verbs-mem-types.h b/xlators/protocol/transport/ib-verbs/src/ib-verbs-mem-types.h new file mode 100644 index 00000000..bac55964 --- /dev/null +++ b/xlators/protocol/transport/ib-verbs/src/ib-verbs-mem-types.h @@ -0,0 +1,39 @@ + +/* + Copyright (c) 2008-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. 
If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef __IB_VERBS_MEM_TYPES_H__ +#define __IB_VERBS_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_ib_verbs_mem_types_ { + gf_ibv_mt_ib_verbs_private_t = gf_common_mt_end + 1, + gf_ibv_mt_ib_verbs_ioq_t, + gf_ibv_mt_transport_t, + gf_ibv_mt_ib_verbs_local_t, + gf_ibv_mt_ib_verbs_post_t, + gf_ibv_mt_char, + gf_ibv_mt_qpent, + gf_ibv_mt_ib_verbs_device_t, + gf_ibv_mt_end +}; +#endif + diff --git a/xlators/protocol/transport/ib-verbs/src/ib-verbs.c b/xlators/protocol/transport/ib-verbs/src/ib-verbs.c new file mode 100644 index 00000000..a252a13d --- /dev/null +++ b/xlators/protocol/transport/ib-verbs/src/ib-verbs.c @@ -0,0 +1,2613 @@ +/* + Copyright (c) 2006-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dict.h" +#include "glusterfs.h" +#include "transport.h" +#include "protocol.h" +#include "logging.h" +#include "xlator.h" +#include "name.h" +#include "ib-verbs.h" +#include + +int32_t +gf_resolve_ip6 (const char *hostname, + uint16_t port, + int family, + void **dnscache, + struct addrinfo **addr_info); + +static uint16_t +ib_verbs_get_local_lid (struct ibv_context *context, + int32_t port) +{ + struct ibv_port_attr attr; + + if (ibv_query_port (context, port, &attr)) + return 0; + + return attr.lid; +} + +static const char * +get_port_state_str(enum ibv_port_state pstate) +{ + switch (pstate) { + case IBV_PORT_DOWN: return "PORT_DOWN"; + case IBV_PORT_INIT: return "PORT_INIT"; + case IBV_PORT_ARMED: return "PORT_ARMED"; + case IBV_PORT_ACTIVE: return "PORT_ACTIVE"; + case IBV_PORT_ACTIVE_DEFER: return "PORT_ACTIVE_DEFER"; + default: return "invalid state"; + } +} + +static int32_t +ib_check_active_port (struct ibv_context *ctx, uint8_t port) +{ + struct ibv_port_attr port_attr; + + int32_t ret = 0; + const char *state_str = NULL; + + if (!ctx) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "Error in supplied context"); + return -1; + } + + ret = ibv_query_port (ctx, port, &port_attr); + + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "Failed to query port %u properties", port); + return -1; + } + + state_str = get_port_state_str (port_attr.state); + gf_log ("transport/ib-verbs", GF_LOG_TRACE, + "Infiniband PORT: (%u) STATE: (%s)", + port, state_str); + + if (port_attr.state == IBV_PORT_ACTIVE) + return 0; + + return -1; +} + +static int32_t +ib_get_active_port (struct ibv_context *ib_ctx) +{ + struct ibv_device_attr ib_device_attr; + + int32_t ret = -1; + uint8_t ib_port = 0; + + if (!ib_ctx) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "Error in supplied context"); + return -1; + } + if (ibv_query_device (ib_ctx, &ib_device_attr)) { + gf_log 
("transport/ib-verbs", GF_LOG_ERROR, + "Failed to query device properties"); + return -1; + } + + for (ib_port = 1; ib_port <= ib_device_attr.phys_port_cnt; ++ib_port) { + ret = ib_check_active_port (ib_ctx, ib_port); + if (ret == 0) + return ib_port; + + gf_log ("transport/ib-verbs", GF_LOG_TRACE, + "Port:(%u) not active", ib_port); + continue; + } + return ret; +} + + + +static void +ib_verbs_put_post (ib_verbs_queue_t *queue, + ib_verbs_post_t *post) +{ + pthread_mutex_lock (&queue->lock); + if (post->prev) { + queue->active_count--; + post->prev->next = post->next; + } + if (post->next) + post->next->prev = post->prev; + post->prev = &queue->passive_posts; + post->next = post->prev->next; + post->prev->next = post; + post->next->prev = post; + queue->passive_count++; + pthread_mutex_unlock (&queue->lock); +} + + +static ib_verbs_post_t * +ib_verbs_new_post (ib_verbs_device_t *device, int32_t len) +{ + ib_verbs_post_t *post; + + post = (ib_verbs_post_t *) GF_CALLOC (1, sizeof (*post), + gf_ibv_mt_ib_verbs_post_t); + if (!post) + return NULL; + + post->buf_size = len; + + post->buf = valloc (len); + if (!post->buf) { + GF_FREE (post); + return NULL; + } + + post->mr = ibv_reg_mr (device->pd, + post->buf, + post->buf_size, + IBV_ACCESS_LOCAL_WRITE); + if (!post->mr) { + free (post->buf); + GF_FREE (post); + return NULL; + } + + return post; +} + + +static ib_verbs_post_t * +ib_verbs_get_post (ib_verbs_queue_t *queue) +{ + ib_verbs_post_t *post; + + pthread_mutex_lock (&queue->lock); + { + post = queue->passive_posts.next; + if (post == &queue->passive_posts) + post = NULL; + + if (post) { + if (post->prev) + post->prev->next = post->next; + if (post->next) + post->next->prev = post->prev; + post->prev = &queue->active_posts; + post->next = post->prev->next; + post->prev->next = post; + post->next->prev = post; + post->reused++; + queue->active_count++; + } + } + pthread_mutex_unlock (&queue->lock); + + return post; +} + +void +ib_verbs_destroy_post 
(ib_verbs_post_t *post) +{ + ibv_dereg_mr (post->mr); + free (post->buf); + GF_FREE (post); +} + + +static int32_t +__ib_verbs_quota_get (ib_verbs_peer_t *peer) +{ + int32_t ret = -1; + ib_verbs_private_t *priv = peer->trans->private; + + if (priv->connected && peer->quota > 0) { + ret = peer->quota--; + } + + return ret; +} + +/* + static int32_t + ib_verbs_quota_get (ib_verbs_peer_t *peer) + { + int32_t ret = -1; + ib_verbs_private_t *priv = peer->trans->private; + + pthread_mutex_lock (&priv->write_mutex); + { + ret = __ib_verbs_quota_get (peer); + } + pthread_mutex_unlock (&priv->write_mutex); + + return ret; + } +*/ + +static void +__ib_verbs_ioq_entry_free (ib_verbs_ioq_t *entry) +{ + list_del_init (&entry->list); + if (entry->iobref) + iobref_unref (entry->iobref); + + /* TODO: use mem-pool */ + GF_FREE (entry->buf); + + /* TODO: use mem-pool */ + GF_FREE (entry); +} + + +static void +__ib_verbs_ioq_flush (ib_verbs_peer_t *peer) +{ + ib_verbs_ioq_t *entry = NULL, *dummy = NULL; + + list_for_each_entry_safe (entry, dummy, &peer->ioq, list) { + __ib_verbs_ioq_entry_free (entry); + } +} + + +static int32_t +__ib_verbs_disconnect (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + int32_t ret = 0; + + if (priv->connected || priv->tcp_connected) { + fcntl (priv->sock, F_SETFL, O_NONBLOCK); + if (shutdown (priv->sock, SHUT_RDWR) != 0) { + gf_log ("transport/ib-verbs", + GF_LOG_DEBUG, + "shutdown () - error: %s", + strerror (errno)); + ret = -errno; + priv->tcp_connected = 0; + } + } + + return ret; +} + + +static int32_t +ib_verbs_post_send (struct ibv_qp *qp, + ib_verbs_post_t *post, + int32_t len) +{ + struct ibv_sge list = { + .addr = (unsigned long) post->buf, + .length = len, + .lkey = post->mr->lkey + }; + + struct ibv_send_wr wr = { + .wr_id = (unsigned long) post, + .sg_list = &list, + .num_sge = 1, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, + }, *bad_wr; + + if (!qp) + return -1; + + return ibv_post_send (qp, &wr, 
&bad_wr); +} + + +static int32_t +__ib_verbs_ioq_churn_entry (ib_verbs_peer_t *peer, ib_verbs_ioq_t *entry) +{ + int32_t ret = 0, quota = 0; + ib_verbs_private_t *priv = peer->trans->private; + ib_verbs_device_t *device = priv->device; + ib_verbs_options_t *options = &priv->options; + ib_verbs_post_t *post = NULL; + int32_t len = 0; + + quota = __ib_verbs_quota_get (peer); + if (quota > 0) { + post = ib_verbs_get_post (&device->sendq); + if (!post) + post = ib_verbs_new_post (device, + (options->send_size + 2048)); + + len = iov_length ((const struct iovec *)&entry->vector, + entry->count); + if (len >= (options->send_size + 2048)) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "increase value of option 'transport.ib-verbs." + "work-request-send-size' (given=> %"PRId64") " + "to send bigger (%d) messages", + (options->send_size + 2048), len); + return -1; + } + + iov_unload (post->buf, + (const struct iovec *)&entry->vector, + entry->count); + + ret = ib_verbs_post_send (peer->qp, post, len); + if (!ret) { + __ib_verbs_ioq_entry_free (entry); + ret = len; + } else { + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "ibv_post_send failed with ret = %d", ret); + ib_verbs_put_post (&device->sendq, post); + __ib_verbs_disconnect (peer->trans); + ret = -1; + } + } + + return ret; +} + + +static int32_t +__ib_verbs_ioq_churn (ib_verbs_peer_t *peer) +{ + ib_verbs_ioq_t *entry = NULL; + int32_t ret = 0; + + while (!list_empty (&peer->ioq)) + { + /* pick next entry */ + entry = peer->ioq_next; + + ret = __ib_verbs_ioq_churn_entry (peer, entry); + + if (ret <= 0) + break; + } + + /* + list_for_each_entry_safe (entry, dummy, &peer->ioq, list) { + ret = __ib_verbs_ioq_churn_entry (peer, entry); + if (ret <= 0) { + break; + } + } + */ + + return ret; +} + +static int32_t +__ib_verbs_quota_put (ib_verbs_peer_t *peer) +{ + int32_t ret; + + peer->quota++; + ret = peer->quota; + + if (!list_empty (&peer->ioq)) { + ret = __ib_verbs_ioq_churn (peer); + } + + return ret; +} + + 
+static int32_t +ib_verbs_quota_put (ib_verbs_peer_t *peer) +{ + int32_t ret; + ib_verbs_private_t *priv = peer->trans->private; + + pthread_mutex_lock (&priv->write_mutex); + { + ret = __ib_verbs_quota_put (peer); + } + pthread_mutex_unlock (&priv->write_mutex); + + return ret; +} + + +static int32_t +ib_verbs_post_recv (struct ibv_srq *srq, + ib_verbs_post_t *post) +{ + struct ibv_sge list = { + .addr = (unsigned long) post->buf, + .length = post->buf_size, + .lkey = post->mr->lkey + }; + + struct ibv_recv_wr wr = { + .wr_id = (unsigned long) post, + .sg_list = &list, + .num_sge = 1, + }, *bad_wr; + + return ibv_post_srq_recv (srq, &wr, &bad_wr); +} + + +static int32_t +ib_verbs_writev (transport_t *this, + ib_verbs_ioq_t *entry) +{ + int32_t ret = 0, need_append = 1; + ib_verbs_private_t *priv = this->private; + ib_verbs_peer_t *peer = NULL; + + pthread_mutex_lock (&priv->write_mutex); + { + if (!priv->connected) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "ib-verbs is not connected to post a " + "send request"); + ret = -1; + goto unlock; + } + + peer = &priv->peer; + if (list_empty (&peer->ioq)) { + ret = __ib_verbs_ioq_churn_entry (peer, entry); + if (ret != 0) { + need_append = 0; + } + } + + if (need_append) { + list_add_tail (&entry->list, &peer->ioq); + } + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + return ret; +} + + +static ib_verbs_ioq_t * +ib_verbs_ioq_new (char *buf, int len, struct iovec *vector, + int count, struct iobref *iobref) +{ + ib_verbs_ioq_t *entry = NULL; + + /* TODO: use mem-pool */ + entry = GF_CALLOC (1, sizeof (*entry), gf_ibv_mt_ib_verbs_ioq_t); + + assert (count <= (MAX_IOVEC-2)); + + entry->header.colonO[0] = ':'; + entry->header.colonO[1] = 'O'; + entry->header.colonO[2] = '\0'; + entry->header.version = 42; + entry->header.size1 = hton32 (len); + entry->header.size2 = hton32 (iov_length (vector, count)); + + entry->vector[0].iov_base = &entry->header; + entry->vector[0].iov_len = sizeof (entry->header); + 
entry->count++; + + entry->vector[1].iov_base = buf; + entry->vector[1].iov_len = len; + entry->count++; + + if (vector && count) + { + memcpy (&entry->vector[2], vector, sizeof (*vector) * count); + entry->count += count; + } + + if (iobref) + entry->iobref = iobref_ref (iobref); + + entry->buf = buf; + + INIT_LIST_HEAD (&entry->list); + + return entry; +} + + +static int32_t +ib_verbs_submit (transport_t *this, char *buf, int32_t len, + struct iovec *vector, int count, struct iobref *iobref) +{ + int32_t ret = 0; + ib_verbs_ioq_t *entry = NULL; + + entry = ib_verbs_ioq_new (buf, len, vector, count, iobref); + ret = ib_verbs_writev (this, entry); + + if (ret > 0) { + ret = 0; + } + + return ret; +} + +static int +ib_verbs_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, + struct iobuf **iobuf_p) +{ + ib_verbs_private_t *priv = this->private; + /* TODO: return error if !priv->connected, check with locks */ + /* TODO: boundry checks for data_ptr/offset */ + char *copy_from = NULL; + ib_verbs_header_t *header = NULL; + uint32_t size1, size2, data_len = 0; + char *hdr = NULL; + struct iobuf *iobuf = NULL; + int32_t ret = 0; + + pthread_mutex_lock (&priv->recv_mutex); + { +/* + while (!priv->data_ptr) + pthread_cond_wait (&priv->recv_cond, &priv->recv_mutex); +*/ + + copy_from = priv->data_ptr + priv->data_offset; + + priv->data_ptr = NULL; + data_len = priv->data_len; + pthread_cond_broadcast (&priv->recv_cond); + } + pthread_mutex_unlock (&priv->recv_mutex); + + header = (ib_verbs_header_t *)copy_from; + if (strcmp (header->colonO, ":O")) { + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "%s: corrupt header received", this->xl->name); + ret = -1; + goto err; + } + + size1 = ntoh32 (header->size1); + size2 = ntoh32 (header->size2); + + if (data_len != (size1 + size2 + sizeof (*header))) { + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "%s: sizeof data read from transport is not equal " + "to the size specified in the header", + this->xl->name); + ret = -1; 
+ goto err; + } + + copy_from += sizeof (*header); + + if (size1) { + hdr = GF_CALLOC (1, size1, gf_ibv_mt_char); + if (!hdr) { + gf_log (this->xl->name, GF_LOG_ERROR, + "unable to allocate header for peer %s", + this->peerinfo.identifier); + ret = -ENOMEM; + goto err; + } + memcpy (hdr, copy_from, size1); + copy_from += size1; + *hdr_p = hdr; + } + *hdrlen_p = size1; + + if (size2) { + iobuf = iobuf_get (this->xl->ctx->iobuf_pool); + if (!iobuf) { + gf_log (this->xl->name, GF_LOG_ERROR, + "unable to allocate IO buffer for peer %s", + this->peerinfo.identifier); + ret = -ENOMEM; + goto err; + } + memcpy (iobuf->ptr, copy_from, size2); + *iobuf_p = iobuf; + } + +err: + return ret; +} + + +static void +ib_verbs_destroy_cq (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_device_t *device = priv->device; + + if (device->recv_cq) + ibv_destroy_cq (device->recv_cq); + device->recv_cq = NULL; + + if (device->send_cq) + ibv_destroy_cq (device->send_cq); + device->send_cq = NULL; + + return; +} + + +static int32_t +ib_verbs_create_cq (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + ib_verbs_device_t *device = priv->device; + int32_t ret = 0; + + device->recv_cq = ibv_create_cq (priv->device->context, + options->recv_count * 2, + device, + device->recv_chan, + 0); + if (!device->recv_cq) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: creation of CQ failed", + this->xl->name); + ret = -1; + } else if (ibv_req_notify_cq (device->recv_cq, 0)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: ibv_req_notify_cq on CQ failed", + this->xl->name); + ret = -1; + } + + do { + /* TODO: make send_cq size dynamically adaptive */ + device->send_cq = ibv_create_cq (priv->device->context, + options->send_count * 1024, + device, + device->send_chan, + 0); + if (!device->send_cq) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: creation of send_cq failed", + 
this->xl->name); + ret = -1; + break; + } + + if (ibv_req_notify_cq (device->send_cq, 0)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: ibv_req_notify_cq on send_cq failed", + this->xl->name); + ret = -1; + break; + } + } while (0); + + if (ret != 0) + ib_verbs_destroy_cq (this); + + return ret; +} + + +static void +ib_verbs_register_peer (ib_verbs_device_t *device, + int32_t qp_num, + ib_verbs_peer_t *peer) +{ + struct _qpent *ent; + ib_verbs_qpreg_t *qpreg = &device->qpreg; + int32_t hash = qp_num % 42; + + pthread_mutex_lock (&qpreg->lock); + ent = qpreg->ents[hash].next; + while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) + ent = ent->next; + if (ent->qp_num == qp_num) { + pthread_mutex_unlock (&qpreg->lock); + return; + } + ent = (struct _qpent *) GF_CALLOC (1, sizeof (*ent), gf_ibv_mt_qpent); + ERR_ABORT (ent); + /* TODO: ref reg->peer */ + ent->peer = peer; + ent->next = &qpreg->ents[hash]; + ent->prev = ent->next->prev; + ent->next->prev = ent; + ent->prev->next = ent; + ent->qp_num = qp_num; + qpreg->count++; + pthread_mutex_unlock (&qpreg->lock); +} + + +static void +ib_verbs_unregister_peer (ib_verbs_device_t *device, + int32_t qp_num) +{ + struct _qpent *ent; + ib_verbs_qpreg_t *qpreg = &device->qpreg; + int32_t hash = qp_num % 42; + + pthread_mutex_lock (&qpreg->lock); + ent = qpreg->ents[hash].next; + while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) + ent = ent->next; + if (ent->qp_num != qp_num) { + pthread_mutex_unlock (&qpreg->lock); + return; + } + ent->prev->next = ent->next; + ent->next->prev = ent->prev; + /* TODO: unref reg->peer */ + GF_FREE (ent); + qpreg->count--; + pthread_mutex_unlock (&qpreg->lock); +} + + +static ib_verbs_peer_t * +__ib_verbs_lookup_peer (ib_verbs_device_t *device, int32_t qp_num) +{ + struct _qpent *ent = NULL; + ib_verbs_peer_t *peer = NULL; + ib_verbs_qpreg_t *qpreg = NULL; + int32_t hash = 0; + + qpreg = &device->qpreg; + hash = qp_num % 42; + ent = qpreg->ents[hash].next; + 
while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) + ent = ent->next; + + if (ent != &qpreg->ents[hash]) { + peer = ent->peer; + } + + return peer; +} + +/* +static ib_verbs_peer_t * +ib_verbs_lookup_peer (ib_verbs_device_t *device, + int32_t qp_num) +{ + ib_verbs_qpreg_t *qpreg = NULL; + ib_verbs_peer_t *peer = NULL; + + qpreg = &device->qpreg; + pthread_mutex_lock (&qpreg->lock); + { + peer = __ib_verbs_lookup_peer (device, qp_num); + } + pthread_mutex_unlock (&qpreg->lock); + + return peer; +} +*/ + + +static void +__ib_verbs_destroy_qp (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + + if (priv->peer.qp) { + ib_verbs_unregister_peer (priv->device, priv->peer.qp->qp_num); + ibv_destroy_qp (priv->peer.qp); + } + priv->peer.qp = NULL; + + return; +} + + +static int32_t +ib_verbs_create_qp (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + ib_verbs_device_t *device = priv->device; + int32_t ret = 0; + ib_verbs_peer_t *peer; + + peer = &priv->peer; + struct ibv_qp_init_attr init_attr = { + .send_cq = device->send_cq, + .recv_cq = device->recv_cq, + .srq = device->srq, + .cap = { + .max_send_wr = peer->send_count, + .max_recv_wr = peer->recv_count, + .max_send_sge = 1, + .max_recv_sge = 1 + }, + .qp_type = IBV_QPT_RC + }; + + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = options->port, + .qp_access_flags = 0 + }; + + peer->qp = ibv_create_qp (device->pd, &init_attr); + if (!peer->qp) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "%s: could not create QP", + this->xl->name); + ret = -1; + goto out; + } else if (ibv_modify_qp (peer->qp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: failed to modify QP to INIT state", + this->xl->name); + ret = -1; + goto out; + } + + peer->local_lid = ib_verbs_get_local_lid (device->context, + 
options->port); + peer->local_qpn = peer->qp->qp_num; + peer->local_psn = lrand48 () & 0xffffff; + + ib_verbs_register_peer (device, peer->qp->qp_num, peer); + +out: + if (ret == -1) + __ib_verbs_destroy_qp (this); + + return ret; +} + + +static void +ib_verbs_destroy_posts (transport_t *this) +{ + +} + + +static int32_t +__ib_verbs_create_posts (transport_t *this, + int32_t count, + int32_t size, + ib_verbs_queue_t *q) +{ + int32_t i; + int32_t ret = 0; + ib_verbs_private_t *priv = this->private; + ib_verbs_device_t *device = priv->device; + + for (i=0 ; ixl->name); + ret = -1; + break; + } + + ib_verbs_put_post (q, post); + } + return ret; +} + + +static int32_t +ib_verbs_create_posts (transport_t *this) +{ + int32_t i, ret; + ib_verbs_post_t *post = NULL; + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + ib_verbs_device_t *device = priv->device; + + ret = __ib_verbs_create_posts (this, options->send_count, + options->send_size, + &device->sendq); + if (!ret) + ret = __ib_verbs_create_posts (this, options->recv_count, + options->recv_size, + &device->recvq); + + if (!ret) { + for (i=0 ; irecv_count ; i++) { + post = ib_verbs_get_post (&device->recvq); + if (ib_verbs_post_recv (device->srq, post) != 0) { + ret = -1; + break; + } + } + } + + if (ret) + ib_verbs_destroy_posts (this); + + return ret; +} + + +static int32_t +ib_verbs_connect_qp (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = options->mtu, + .dest_qp_num = priv->peer.remote_qpn, + .rq_psn = priv->peer.remote_psn, + .max_dest_rd_atomic = 1, + .min_rnr_timer = 12, + .ah_attr = { + .is_global = 0, + .dlid = priv->peer.remote_lid, + .sl = 0, + .src_path_bits = 0, + .port_num = options->port + } + }; + if (ibv_modify_qp (priv->peer.qp, &attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN 
| + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "Failed to modify QP to RTR\n"); + return -1; + } + + /* TODO: make timeout and retry_cnt configurable from options */ + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = priv->peer.local_psn; + attr.max_rd_atomic = 1; + if (ibv_modify_qp (priv->peer.qp, &attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "Failed to modify QP to RTS\n"); + return -1; + } + + return 0; +} + +static int32_t +__ib_verbs_teardown (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + + __ib_verbs_destroy_qp (this); + + if (!list_empty (&priv->peer.ioq)) { + __ib_verbs_ioq_flush (&priv->peer); + } + + /* TODO: decrement cq size */ + return 0; +} + +/* + * return value: + * 0 = success (completed) + * -1 = error + * > 0 = incomplete + */ + +static int +__tcp_rwv (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count, + int write) +{ + ib_verbs_private_t *priv = NULL; + int sock = -1; + int ret = -1; + struct iovec *opvector = vector; + int opcount = count; + int moved = 0; + + priv = this->private; + sock = priv->sock; + + while (opcount) + { + if (write) + { + ret = writev (sock, opvector, opcount); + + if (ret == 0 || (ret == -1 && errno == EAGAIN)) + { + /* done for now */ + break; + } + } + else + { + ret = readv (sock, opvector, opcount); + + if (ret == -1 && errno == EAGAIN) + { + /* done for now */ + break; + } + } + + if (ret == 0) + { + gf_log (this->xl->name, GF_LOG_DEBUG, + "EOF from peer %s", this->peerinfo.identifier); + opcount = -1; + errno = ENOTCONN; + break; + } + + if (ret == -1) + { + if (errno == EINTR) + continue; + + gf_log (this->xl->name, GF_LOG_DEBUG, + "%s failed (%s)", write ? 
"writev" : "readv", + strerror (errno)); + if (write && !priv->connected && + (errno == ECONNREFUSED)) + gf_log (this->xl->name, GF_LOG_ERROR, + "possible mismatch of 'transport-type'" + " in protocol server and client. " + "check volume file"); + opcount = -1; + break; + } + + moved = 0; + + while (moved < ret) + { + if ((ret - moved) >= opvector[0].iov_len) + { + moved += opvector[0].iov_len; + opvector++; + opcount--; + } + else + { + opvector[0].iov_len -= (ret - moved); + opvector[0].iov_base += (ret - moved); + moved += (ret - moved); + } + while (opcount && !opvector[0].iov_len) + { + opvector++; + opcount--; + } + } + } + + if (pending_vector) + *pending_vector = opvector; + + if (pending_count) + *pending_count = opcount; + + return opcount; +} + + +static int +__tcp_readv (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count) +{ + int ret = -1; + + ret = __tcp_rwv (this, vector, count, + pending_vector, pending_count, 0); + + return ret; +} + + +static int +__tcp_writev (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count) +{ + int ret = -1; + ib_verbs_private_t *priv = this->private; + + ret = __tcp_rwv (this, vector, count, pending_vector, + pending_count, 1); + + if (ret > 0) { + /* TODO: Avoid multiple calls when socket is already + registered for POLLOUT */ + priv->idx = event_select_on (this->xl->ctx->event_pool, + priv->sock, priv->idx, -1, 1); + } else if (ret == 0) { + priv->idx = event_select_on (this->xl->ctx->event_pool, + priv->sock, + priv->idx, -1, 0); + } + + return ret; +} + + +static void * +ib_verbs_recv_completion_proc (void *data) +{ + struct ibv_comp_channel *chan = data; + ib_verbs_private_t *priv = NULL; + ib_verbs_device_t *device; + ib_verbs_post_t *post; + ib_verbs_peer_t *peer; + struct ibv_cq *event_cq; + struct ibv_wc wc; + void *event_ctx; + int32_t ret = 0; + + + while (1) { + ret = ibv_get_cq_event (chan, &event_cq, 
&event_ctx); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_get_cq_event failed, terminating recv " + "thread %d (%d)", ret, errno); + continue; + } + + device = event_ctx; + + ret = ibv_req_notify_cq (event_cq, 0); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_req_notify_cq on %s failed, terminating " + "recv thread: %d (%d)", + device->device_name, ret, errno); + continue; + } + + device = (ib_verbs_device_t *) event_ctx; + + while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) { + post = (ib_verbs_post_t *) (long) wc.wr_id; + + pthread_mutex_lock (&device->qpreg.lock); + { + peer = __ib_verbs_lookup_peer (device, + wc.qp_num); + + /* + * keep a refcount on transport so that it + * doesnot get freed because of some error + * indicated by wc.status till we are done + * with usage of peer and thereby that of trans. + */ + if (peer != NULL) { + transport_ref (peer->trans); + } + } + pthread_mutex_unlock (&device->qpreg.lock); + + if (wc.status != IBV_WC_SUCCESS) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "recv work request on `%s' returned " + "error (%d)", + device->device_name, + wc.status); + if (peer) { + transport_unref (peer->trans); + transport_disconnect (peer->trans); + } + + if (post) { + ib_verbs_post_recv (device->srq, post); + } + continue; + } + + if (peer) { + priv = peer->trans->private; + + pthread_mutex_lock (&priv->recv_mutex); + { + while (priv->data_ptr) + pthread_cond_wait (&priv->recv_cond, + &priv->recv_mutex); + + priv->data_ptr = post->buf; + priv->data_offset = 0; + priv->data_len = wc.byte_len; + + /*pthread_cond_broadcast (&priv->recv_cond);*/ + } + pthread_mutex_unlock (&priv->recv_mutex); + + if ((ret = xlator_notify (peer->trans->xl, GF_EVENT_POLLIN, + peer->trans, NULL)) == -1) { + gf_log ("transport/ib-verbs", + GF_LOG_DEBUG, + "pollin notification to %s " + "failed, disconnecting " + "transport", + peer->trans->xl->name); + transport_disconnect (peer->trans); + } + + transport_unref 
(peer->trans); + } else { + gf_log ("transport/ib-verbs", + GF_LOG_DEBUG, + "could not lookup peer for qp_num: %d", + wc.qp_num); + } + ib_verbs_post_recv (device->srq, post); + } + + if (ret < 0) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "ibv_poll_cq on `%s' returned error " + "(ret = %d, errno = %d)", + device->device_name, ret, errno); + continue; + } + ibv_ack_cq_events (event_cq, 1); + } + return NULL; +} + + +static void * +ib_verbs_send_completion_proc (void *data) +{ + struct ibv_comp_channel *chan = data; + ib_verbs_post_t *post; + ib_verbs_peer_t *peer; + struct ibv_cq *event_cq; + void *event_ctx; + ib_verbs_device_t *device; + struct ibv_wc wc; + int32_t ret; + + while (1) { + ret = ibv_get_cq_event (chan, &event_cq, &event_ctx); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_get_cq_event on failed, terminating " + "send thread: %d (%d)", ret, errno); + continue; + } + + device = event_ctx; + + ret = ibv_req_notify_cq (event_cq, 0); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_req_notify_cq on %s failed, terminating " + "send thread: %d (%d)", + device->device_name, ret, errno); + continue; + } + + while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) { + post = (ib_verbs_post_t *) (long) wc.wr_id; + + pthread_mutex_lock (&device->qpreg.lock); + { + peer = __ib_verbs_lookup_peer (device, + wc.qp_num); + + /* + * keep a refcount on transport so that it + * doesnot get freed because of some error + * indicated by wc.status till we are done + * with usage of peer and thereby that of trans. 
+ */ + if (peer != NULL) { + transport_ref (peer->trans); + } + } + pthread_mutex_unlock (&device->qpreg.lock); + + if (wc.status != IBV_WC_SUCCESS) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "send work request on `%s' returned " + "error wc.status = %d, wc.vendor_err " + "= %d, post->buf = %p, wc.byte_len = " + "%d, post->reused = %d", + device->device_name, wc.status, + wc.vendor_err, + post->buf, wc.byte_len, post->reused); + if (wc.status == IBV_WC_RETRY_EXC_ERR) + gf_log ("ib-verbs", GF_LOG_ERROR, + "connection between client and" + " server not working. check by" + " running 'ibv_srq_pingpong'. " + "also make sure subnet manager" + " is running (eg: 'opensm'), " + "or check if ib-verbs port is " + "valid (or active) by running " + " 'ibv_devinfo'. contact " + "Gluster Support Team if " + "the problem persists."); + if (peer) + transport_disconnect (peer->trans); + } + + if (post) { + ib_verbs_put_post (&device->sendq, post); + } + + if (peer) { + int quota_ret = ib_verbs_quota_put (peer); + if (quota_ret < 0) { + gf_log ("ib-verbs", GF_LOG_DEBUG, + "failed to send message"); + + } + + transport_unref (peer->trans); + } else { + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "could not lookup peer for qp_num: %d", + wc.qp_num); + } + } + + if (ret < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_poll_cq on `%s' returned error (ret = %d," + " errno = %d)", + device->device_name, ret, errno); + continue; + } + ibv_ack_cq_events (event_cq, 1); + } + + return NULL; +} + +static void +ib_verbs_options_init (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + int32_t mtu; + data_t *temp; + + /* TODO: validate arguments from options below */ + + options->send_size = this->xl->ctx->page_size * 4; /* 512 KB */ + options->recv_size = this->xl->ctx->page_size * 4; /* 512 KB */ + options->send_count = 32; + options->recv_count = 32; + + temp = dict_get (this->xl->options, + 
"transport.ib-verbs.work-request-send-count"); + if (temp) + options->send_count = data_to_int32 (temp); + + temp = dict_get (this->xl->options, + "transport.ib-verbs.work-request-recv-count"); + if (temp) + options->recv_count = data_to_int32 (temp); + + options->port = 0; + temp = dict_get (this->xl->options, + "transport.ib-verbs.port"); + if (temp) + options->port = data_to_uint64 (temp); + + options->mtu = mtu = IBV_MTU_2048; + temp = dict_get (this->xl->options, + "transport.ib-verbs.mtu"); + if (temp) + mtu = data_to_int32 (temp); + switch (mtu) { + case 256: options->mtu = IBV_MTU_256; + break; + case 512: options->mtu = IBV_MTU_512; + break; + case 1024: options->mtu = IBV_MTU_1024; + break; + case 2048: options->mtu = IBV_MTU_2048; + break; + case 4096: options->mtu = IBV_MTU_4096; + break; + default: + if (temp) + gf_log ("transport/ib-verbs", GF_LOG_WARNING, + "%s: unrecognized MTU value '%s', defaulting " + "to '2048'", this->xl->name, + data_to_str (temp)); + else + gf_log ("transport/ib-verbs", GF_LOG_TRACE, + "%s: defaulting MTU to '2048'", + this->xl->name); + options->mtu = IBV_MTU_2048; + break; + } + + temp = dict_get (this->xl->options, + "transport.ib-verbs.device-name"); + if (temp) + options->device_name = gf_strdup (temp->data); + + return; +} + +static void +ib_verbs_queue_init (ib_verbs_queue_t *queue) +{ + pthread_mutex_init (&queue->lock, NULL); + + queue->active_posts.next = &queue->active_posts; + queue->active_posts.prev = &queue->active_posts; + queue->passive_posts.next = &queue->passive_posts; + queue->passive_posts.prev = &queue->passive_posts; +} + + +static ib_verbs_device_t * +ib_verbs_get_device (transport_t *this, + struct ibv_context *ibctx) +{ + glusterfs_ctx_t *ctx = this->xl->ctx; + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + char *device_name = priv->options.device_name; + uint32_t port = priv->options.port; + + uint8_t active_port = 0; + int32_t ret = 0; + int32_t i = 0; 
+ + ib_verbs_device_t *trav; + + trav = ctx->ib; + while (trav) { + if ((!strcmp (trav->device_name, device_name)) && + (trav->port == port)) + break; + trav = trav->next; + } + + if (!trav) { + + trav = GF_CALLOC (1, sizeof (*trav), + gf_ibv_mt_ib_verbs_device_t); + ERR_ABORT (trav); + priv->device = trav; + + trav->context = ibctx; + + ret = ib_get_active_port (trav->context); + + if (ret < 0) { + if (!port) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "Failed to find any active ports and " + "none specified in volume file," + " exiting"); + return NULL; + } + } + + active_port = ret; + + if (port) { + ret = ib_check_active_port (trav->context, port); + if (ret < 0) { + gf_log ("transport/ib-verbs", GF_LOG_WARNING, + "On device %s: provided port:%u is " + "found to be offline, continuing to " + "use the same port", device_name, port); + } + } else { + priv->options.port = active_port; + port = active_port; + gf_log ("transport/ib-verbs", GF_LOG_TRACE, + "Port unspecified in volume file using active " + "port: %u", port); + } + + trav->device_name = gf_strdup (device_name); + trav->port = port; + + trav->next = ctx->ib; + ctx->ib = trav; + + trav->send_chan = ibv_create_comp_channel (trav->context); + if (!trav->send_chan) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not create send completion channel", + device_name); + /* TODO: cleanup current mess */ + return NULL; + } + + trav->recv_chan = ibv_create_comp_channel (trav->context); + if (!trav->recv_chan) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "could not create recv completion channel"); + /* TODO: cleanup current mess */ + return NULL; + } + + if (ib_verbs_create_cq (this) < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not create CQ", + this->xl->name); + return NULL; + } + + /* protection domain */ + trav->pd = ibv_alloc_pd (trav->context); + + if (!trav->pd) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not allocate protection domain", + 
this->xl->name); + return NULL; + } + + struct ibv_srq_init_attr attr = { + .attr = { + .max_wr = options->recv_count, + .max_sge = 1 + } + }; + trav->srq = ibv_create_srq (trav->pd, &attr); + + if (!trav->srq) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not create SRQ", + this->xl->name); + return NULL; + } + + /* queue init */ + ib_verbs_queue_init (&trav->sendq); + ib_verbs_queue_init (&trav->recvq); + + if (ib_verbs_create_posts (this) < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not allocate posts", + this->xl->name); + return NULL; + } + + /* completion threads */ + ret = pthread_create (&trav->send_thread, + NULL, + ib_verbs_send_completion_proc, + trav->send_chan); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "could not create send completion thread"); + return NULL; + } + ret = pthread_create (&trav->recv_thread, + NULL, + ib_verbs_recv_completion_proc, + trav->recv_chan); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "could not create recv completion thread"); + return NULL; + } + + /* qpreg */ + pthread_mutex_init (&trav->qpreg.lock, NULL); + for (i=0; i<42; i++) { + trav->qpreg.ents[i].next = &trav->qpreg.ents[i]; + trav->qpreg.ents[i].prev = &trav->qpreg.ents[i]; + } + } + return trav; +} + +static int32_t +ib_verbs_init (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + struct ibv_device **dev_list; + struct ibv_context *ib_ctx = NULL; + int32_t ret = 0; + + ib_verbs_options_init (this); + + { + dev_list = ibv_get_device_list (NULL); + + if (!dev_list) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "Failed to get IB devices"); + ret = -1; + goto cleanup; + } + + if (!*dev_list) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "No IB devices found"); + ret = -1; + goto cleanup; + } + + if (!options->device_name) { + if (*dev_list) { + options->device_name = + gf_strdup (ibv_get_device_name (*dev_list)); + 
} else { + gf_log ("transport/ib-verbs", GF_LOG_CRITICAL, + "IB device list is empty. Check for " + "'ib_uverbs' module"); + return -1; + goto cleanup; + } + } + + while (*dev_list) { + if (!strcmp (ibv_get_device_name (*dev_list), + options->device_name)) { + ib_ctx = ibv_open_device (*dev_list); + + if (!ib_ctx) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "Failed to get infiniband" + "device context"); + ret = -1; + goto cleanup; + } + break; + } + ++dev_list; + } + + priv->device = ib_verbs_get_device (this, ib_ctx); + + if (!priv->device) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "could not create ib_verbs device for %s", + priv->device->device_name); + ret = -1; + goto cleanup; + } + } + + priv->peer.trans = this; + INIT_LIST_HEAD (&priv->peer.ioq); + + pthread_mutex_init (&priv->read_mutex, NULL); + pthread_mutex_init (&priv->write_mutex, NULL); + pthread_mutex_init (&priv->recv_mutex, NULL); + pthread_cond_init (&priv->recv_cond, NULL); + +cleanup: + if (-1 == ret) { + if (ib_ctx) + ibv_close_device (ib_ctx); + } + + if (dev_list) + ibv_free_device_list (dev_list); + + return ret; +} + + +static int32_t +ib_verbs_disconnect (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + int32_t ret = 0; + + pthread_mutex_lock (&priv->write_mutex); + { + ret = __ib_verbs_disconnect (this); + } + pthread_mutex_unlock (&priv->write_mutex); + + return ret; +} + + +static int32_t +__tcp_connect_finish (int fd) +{ + int ret = -1; + int optval = 0; + socklen_t optlen = sizeof (int); + + ret = getsockopt (fd, SOL_SOCKET, SO_ERROR, + (void *)&optval, &optlen); + + if (ret == 0 && optval) + { + errno = optval; + ret = -1; + } + + return ret; +} + +static inline void +ib_verbs_fill_handshake_data (char *buf, struct ib_verbs_nbio *nbio, + ib_verbs_private_t *priv) +{ + sprintf (buf, + "QP1:RECV_BLKSIZE=%08x:SEND_BLKSIZE=%08x\n" + "QP1:LID=%04x:QPN=%06x:PSN=%06x\n", + priv->peer.recv_size, + priv->peer.send_size, + priv->peer.local_lid, + 
priv->peer.local_qpn, + priv->peer.local_psn); + + nbio->vector.iov_base = buf; + nbio->vector.iov_len = strlen (buf) + 1; + nbio->count = 1; + return; +} + +static inline void +ib_verbs_fill_handshake_ack (char *buf, struct ib_verbs_nbio *nbio) +{ + sprintf (buf, "DONE\n"); + nbio->vector.iov_base = buf; + nbio->vector.iov_len = strlen (buf) + 1; + nbio->count = 1; + return; +} + +static int +ib_verbs_handshake_pollin (transport_t *this) +{ + int ret = 0; + ib_verbs_private_t *priv = this->private; + char *buf = priv->handshake.incoming.buf; + int32_t recv_buf_size, send_buf_size; + socklen_t sock_len; + + if (priv->handshake.incoming.state == IB_VERBS_HANDSHAKE_COMPLETE) { + return -1; + } + + pthread_mutex_lock (&priv->write_mutex); + { + while (priv->handshake.incoming.state != IB_VERBS_HANDSHAKE_COMPLETE) + { + switch (priv->handshake.incoming.state) + { + case IB_VERBS_HANDSHAKE_START: + buf = priv->handshake.incoming.buf = GF_CALLOC (1, 256, gf_ibv_mt_char); + ib_verbs_fill_handshake_data (buf, &priv->handshake.incoming, priv); + buf[0] = 0; + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVING_DATA; + break; + + case IB_VERBS_HANDSHAKE_RECEIVING_DATA: + ret = __tcp_readv (this, + &priv->handshake.incoming.vector, + priv->handshake.incoming.count, + &priv->handshake.incoming.pending_vector, + &priv->handshake.incoming.pending_count); + if (ret == -1) { + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_TRACE, + "partial header read on NB socket. 
continue later"); + goto unlock; + } + + if (!ret) { + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVED_DATA; + } + break; + + case IB_VERBS_HANDSHAKE_RECEIVED_DATA: + ret = sscanf (buf, + "QP1:RECV_BLKSIZE=%08x:SEND_BLKSIZE=%08x\n" + "QP1:LID=%04x:QPN=%06x:PSN=%06x\n", + &recv_buf_size, + &send_buf_size, + &priv->peer.remote_lid, + &priv->peer.remote_qpn, + &priv->peer.remote_psn); + + if ((ret != 5) && (strncmp (buf, "QP1:", 4))) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "%s: remote-host(%s)'s " + "transport type is different", + this->xl->name, + this->peerinfo.identifier); + ret = -1; + goto unlock; + } + + if (recv_buf_size < priv->peer.recv_size) + priv->peer.recv_size = recv_buf_size; + if (send_buf_size < priv->peer.send_size) + priv->peer.send_size = send_buf_size; + + gf_log ("transport/ib-verbs", GF_LOG_TRACE, + "%s: transacted recv_size=%d " + "send_size=%d", + this->xl->name, priv->peer.recv_size, + priv->peer.send_size); + + priv->peer.quota = priv->peer.send_count; + + if (ib_verbs_connect_qp (this)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: failed to connect with " + "remote QP", this->xl->name); + ret = -1; + goto unlock; + } + ib_verbs_fill_handshake_ack (buf, &priv->handshake.incoming); + buf[0] = 0; + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVING_ACK; + break; + + case IB_VERBS_HANDSHAKE_RECEIVING_ACK: + ret = __tcp_readv (this, + &priv->handshake.incoming.vector, + priv->handshake.incoming.count, + &priv->handshake.incoming.pending_vector, + &priv->handshake.incoming.pending_count); + if (ret == -1) { + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_TRACE, + "partial header read on NB " + "socket. 
continue later"); + goto unlock; + } + + if (!ret) { + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVED_ACK; + } + break; + + case IB_VERBS_HANDSHAKE_RECEIVED_ACK: + if (strncmp (buf, "DONE", 4)) { + gf_log ("transport/ib-verbs", + GF_LOG_DEBUG, + "%s: handshake-3 did not " + "return 'DONE' (%s)", + this->xl->name, buf); + ret = -1; + goto unlock; + } + ret = 0; + priv->connected = 1; + sock_len = sizeof (struct sockaddr_storage); + getpeername (priv->sock, + (struct sockaddr *) &this->peerinfo.sockaddr, + &sock_len); + + GF_FREE (priv->handshake.incoming.buf); + priv->handshake.incoming.buf = NULL; + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_COMPLETE; + } + } + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + + if (ret == -1) { + transport_disconnect (this); + } else { + ret = 0; + } + + if (!ret && priv->connected) { + ret = xlator_notify (this->xl, GF_EVENT_CHILD_UP, this); + } + + return ret; +} + +static int +ib_verbs_handshake_pollout (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + char *buf = priv->handshake.outgoing.buf; + int32_t ret = 0; + + if (priv->handshake.outgoing.state == IB_VERBS_HANDSHAKE_COMPLETE) { + return 0; + } + + pthread_mutex_unlock (&priv->write_mutex); + { + while (priv->handshake.outgoing.state != IB_VERBS_HANDSHAKE_COMPLETE) + { + switch (priv->handshake.outgoing.state) + { + case IB_VERBS_HANDSHAKE_START: + buf = priv->handshake.outgoing.buf = GF_CALLOC (1, 256, gf_ibv_mt_char); + ib_verbs_fill_handshake_data (buf, &priv->handshake.outgoing, priv); + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENDING_DATA; + break; + + case IB_VERBS_HANDSHAKE_SENDING_DATA: + ret = __tcp_writev (this, + &priv->handshake.outgoing.vector, + priv->handshake.outgoing.count, + &priv->handshake.outgoing.pending_vector, + &priv->handshake.outgoing.pending_count); + if (ret == -1) { + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_TRACE, + "partial header read on NB socket. 
continue later"); + goto unlock; + } + + if (!ret) { + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENT_DATA; + } + break; + + case IB_VERBS_HANDSHAKE_SENT_DATA: + ib_verbs_fill_handshake_ack (buf, &priv->handshake.outgoing); + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENDING_ACK; + break; + + case IB_VERBS_HANDSHAKE_SENDING_ACK: + ret = __tcp_writev (this, + &priv->handshake.outgoing.vector, + priv->handshake.outgoing.count, + &priv->handshake.outgoing.pending_vector, + &priv->handshake.outgoing.pending_count); + + if (ret == -1) { + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_TRACE, + "partial header read on NB " + "socket. continue later"); + goto unlock; + } + + if (!ret) { + GF_FREE (priv->handshake.outgoing.buf); + priv->handshake.outgoing.buf = NULL; + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_COMPLETE; + } + break; + } + } + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + + if (ret == -1) { + transport_disconnect (this); + } else { + ret = 0; + } + + return ret; +} + +static int +ib_verbs_handshake_pollerr (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + int32_t ret = 0; + char need_unref = 0; + + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "%s: peer disconnected, cleaning up", + this->xl->name); + + pthread_mutex_lock (&priv->write_mutex); + { + __ib_verbs_teardown (this); + + if (priv->sock != -1) { + event_unregister (this->xl->ctx->event_pool, + priv->sock, priv->idx); + need_unref = 1; + + if (close (priv->sock) != 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "close () - error: %s", + strerror (errno)); + ret = -errno; + } + priv->tcp_connected = priv->connected = 0; + priv->sock = -1; + } + + if (priv->handshake.incoming.buf) { + GF_FREE (priv->handshake.incoming.buf); + priv->handshake.incoming.buf = NULL; + } + + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; + + if (priv->handshake.outgoing.buf) { + GF_FREE (priv->handshake.outgoing.buf); + 
priv->handshake.outgoing.buf = NULL; + } + + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; + } + pthread_mutex_unlock (&priv->write_mutex); + + xlator_notify (this->xl, GF_EVENT_POLLERR, this, NULL); + + if (need_unref) + transport_unref (this); + + return 0; +} + + +static int +tcp_connect_finish (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + int error = 0, ret = 0; + + pthread_mutex_lock (&priv->write_mutex); + { + ret = __tcp_connect_finish (priv->sock); + + if (!ret) { + this->myinfo.sockaddr_len = + sizeof (this->myinfo.sockaddr); + ret = getsockname (priv->sock, + (struct sockaddr *)&this->myinfo.sockaddr, + &this->myinfo.sockaddr_len); + if (ret == -1) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "getsockname on new client-socket %d " + "failed (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + error = 1; + goto unlock; + } + + get_transport_identifiers (this); + priv->tcp_connected = 1; + } + + if (ret == -1 && errno != EINPROGRESS) { + gf_log (this->xl->name, GF_LOG_ERROR, + "tcp connect to %s failed (%s)", + this->peerinfo.identifier, strerror (errno)); + error = 1; + } + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + + if (error) { + transport_disconnect (this); + } + + return ret; +} + +static int +ib_verbs_event_handler (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err) +{ + transport_t *this = data; + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = NULL; + int ret = 0; + + if (!priv->tcp_connected) { + ret = tcp_connect_finish (this); + if (priv->tcp_connected) { + options = &priv->options; + + priv->peer.send_count = options->send_count; + priv->peer.recv_count = options->recv_count; + priv->peer.send_size = options->send_size; + priv->peer.recv_size = options->recv_size; + + if ((ret = ib_verbs_create_qp (this)) < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not create QP", + this->xl->name); + transport_disconnect (this); + 
/*
 * Switch descriptor `fd' to non-blocking mode, preserving its other file
 * status flags.
 *
 * Returns the result of fcntl (F_SETFL), or -1 when the current flags
 * could not be read.
 */
static int
__tcp_nonblock (int fd)
{
        int current = fcntl (fd, F_GETFL);

        if (current == -1)
                return -1;

        return fcntl (fd, F_SETFL, current | O_NONBLOCK);
}
unlock; + } + + gf_log (this->xl->name, GF_LOG_TRACE, + "socket fd = %d", priv->sock); + + memcpy (&this->peerinfo.sockaddr, &sockaddr, sockaddr_len); + this->peerinfo.sockaddr_len = sockaddr_len; + + ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = + ((struct sockaddr *)&this->peerinfo.sockaddr)->sa_family; + + if (non_blocking) + { + ret = __tcp_nonblock (priv->sock); + + if (ret == -1) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "could not set socket %d to non " + "blocking mode (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + } + + ret = client_bind (this, + (struct sockaddr *)&this->myinfo.sockaddr, + &this->myinfo.sockaddr_len, priv->sock); + if (ret == -1) + { + gf_log (this->xl->name, GF_LOG_WARNING, + "client bind failed: %s", strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + ret = connect (priv->sock, + (struct sockaddr *)&this->peerinfo.sockaddr, + this->peerinfo.sockaddr_len); + if (ret == -1 && errno != EINPROGRESS) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "connection attempt failed (%s)", + strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + priv->tcp_connected = priv->connected = 0; + + transport_ref (this); + + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; + + priv->idx = event_register (this->xl->ctx->event_pool, + priv->sock, ib_verbs_event_handler, + this, 1, 1); + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + + return ret; +} + +static int +ib_verbs_server_event_handler (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err) +{ + int32_t main_sock = -1; + transport_t *this, *trans = data; + ib_verbs_private_t *priv = NULL; + ib_verbs_private_t *trans_priv = (ib_verbs_private_t *) trans->private; + ib_verbs_options_t *options = NULL; + + if (!poll_in) + return 0; + + this = GF_CALLOC (1, sizeof (transport_t), + 
gf_ibv_mt_transport_t); + ERR_ABORT (this); + priv = GF_CALLOC (1, sizeof (ib_verbs_private_t), + gf_ibv_mt_ib_verbs_private_t); + ERR_ABORT (priv); + this->private = priv; + /* Copy all the ib_verbs related values in priv, from trans_priv + as other than QP, all the values remain same */ + priv->device = trans_priv->device; + priv->options = trans_priv->options; + options = &priv->options; + + this->ops = trans->ops; + this->xl = trans->xl; + this->init = trans->init; + this->fini = trans->fini; + + memcpy (&this->myinfo.sockaddr, &trans->myinfo.sockaddr, + trans->myinfo.sockaddr_len); + this->myinfo.sockaddr_len = trans->myinfo.sockaddr_len; + + main_sock = (trans_priv)->sock; + this->peerinfo.sockaddr_len = sizeof (this->peerinfo.sockaddr); + priv->sock = accept (main_sock, + (struct sockaddr *)&this->peerinfo.sockaddr, + &this->peerinfo.sockaddr_len); + if (priv->sock == -1) { + gf_log ("ib-verbs/server", GF_LOG_ERROR, + "accept() failed: %s", + strerror (errno)); + GF_FREE (this->private); + GF_FREE (this); + return -1; + } + + priv->peer.trans = this; + transport_ref (this); + + get_transport_identifiers (this); + + priv->tcp_connected = 1; + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; + + priv->peer.send_count = options->send_count; + priv->peer.recv_count = options->recv_count; + priv->peer.send_size = options->send_size; + priv->peer.recv_size = options->recv_size; + INIT_LIST_HEAD (&priv->peer.ioq); + + if (ib_verbs_create_qp (this) < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not create QP", + this->xl->name); + transport_disconnect (this); + return -1; + } + + priv->idx = event_register (this->xl->ctx->event_pool, priv->sock, + ib_verbs_event_handler, this, 1, 1); + + pthread_mutex_init (&priv->read_mutex, NULL); + pthread_mutex_init (&priv->write_mutex, NULL); + pthread_mutex_init (&priv->recv_mutex, NULL); + /* pthread_cond_init (&priv->recv_cond, NULL); 
*/ + + return 0; +} + +static int32_t +ib_verbs_listen (transport_t *this) +{ + struct sockaddr_storage sockaddr; + socklen_t sockaddr_len; + ib_verbs_private_t *priv = this->private; + int opt = 1, ret = 0; + char service[NI_MAXSERV], host[NI_MAXHOST]; + + memset (&sockaddr, 0, sizeof (sockaddr)); + ret = ibverbs_server_get_local_sockaddr (this, + (struct sockaddr *)&sockaddr, + &sockaddr_len); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "cannot find network address of server to bind to"); + goto err; + } + + priv->sock = socket (((struct sockaddr *)&sockaddr)->sa_family, + SOCK_STREAM, 0); + if (priv->sock == -1) { + gf_log ("ib-verbs/server", GF_LOG_CRITICAL, + "init: failed to create socket, error: %s", + strerror (errno)); + GF_FREE (this->private); + ret = -1; + goto err; + } + + memcpy (&this->myinfo.sockaddr, &sockaddr, sockaddr_len); + this->myinfo.sockaddr_len = sockaddr_len; + + ret = getnameinfo ((struct sockaddr *)&this->myinfo.sockaddr, + this->myinfo.sockaddr_len, + host, sizeof (host), + service, sizeof (service), + NI_NUMERICHOST); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "getnameinfo failed (%s)", gai_strerror (ret)); + goto err; + } + sprintf (this->myinfo.identifier, "%s:%s", host, service); + + setsockopt (priv->sock, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof (opt)); + if (bind (priv->sock, + (struct sockaddr *)&sockaddr, + sockaddr_len) != 0) { + ret = -1; + gf_log ("ib-verbs/server", GF_LOG_ERROR, + "init: failed to bind to socket for %s (%s)", + this->myinfo.identifier, strerror (errno)); + goto err; + } + + if (listen (priv->sock, 10) != 0) { + gf_log ("ib-verbs/server", GF_LOG_ERROR, + "init: listen () failed on socket for %s (%s)", + this->myinfo.identifier, strerror (errno)); + ret = -1; + goto err; + } + + /* Register the main socket */ + priv->idx = event_register (this->xl->ctx->event_pool, priv->sock, + ib_verbs_server_event_handler, + transport_ref (this), 1, 0); + +err: + return ret; +} + +struct 
transport_ops tops = { + .receive = ib_verbs_receive, + .submit = ib_verbs_submit, + .connect = ib_verbs_connect, + .disconnect = ib_verbs_disconnect, + .listen = ib_verbs_listen, +}; + +int32_t +init (transport_t *this) +{ + ib_verbs_private_t *priv = GF_CALLOC (1, sizeof (*priv), + gf_ibv_mt_ib_verbs_private_t); + this->private = priv; + priv->sock = -1; + + if (ib_verbs_init (this)) { + gf_log (this->xl->name, GF_LOG_ERROR, + "Failed to initialize IB Device"); + return -1; + } + + return 0; +} + +void +fini (struct transport *this) +{ + /* TODO: verify this function does graceful finish */ + ib_verbs_private_t *priv = this->private; + this->private = NULL; + + pthread_mutex_destroy (&priv->recv_mutex); + pthread_mutex_destroy (&priv->write_mutex); + pthread_mutex_destroy (&priv->read_mutex); + /* pthread_cond_destroy (&priv->recv_cond); */ + + gf_log (this->xl->name, GF_LOG_TRACE, + "called fini on transport: %p", + this); + GF_FREE (priv); + return; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_common_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + +/* TODO: expand each option */ +struct volume_options options[] = { + { .key = {"transport.ib-verbs.port", + "ib-verbs-port"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 4, + .description = "check the option by 'ibv_devinfo'" + }, + { .key = {"transport.ib-verbs.mtu", + "ib-verbs-mtu"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"transport.ib-verbs.device-name", + "ib-verbs-device-name"}, + .type = GF_OPTION_TYPE_ANY, + .description = "check by 'ibv_devinfo'" + }, + { .key = {"transport.ib-verbs.work-request-send-count", + "ib-verbs-work-request-send-count"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"transport.ib-verbs.work-request-recv-count", + "ib-verbs-work-request-recv-count"}, + .type = GF_OPTION_TYPE_INT, + 
}, + { .key = {"remote-port", + "transport.remote-port", + "transport.ib-verbs.remote-port"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {"transport.ib-verbs.listen-port", "listen-port"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {"transport.ib-verbs.connect-path", "connect-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.ib-verbs.bind-path", "bind-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.ib-verbs.listen-path", "listen-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.address-family", + "address-family"}, + .value = {"inet", "inet6", "inet/inet6", "inet6/inet", + "unix", "inet-sdp" }, + .type = GF_OPTION_TYPE_STR + }, + { .key = {NULL} } +}; diff --git a/xlators/protocol/transport/ib-verbs/src/ib-verbs.h b/xlators/protocol/transport/ib-verbs/src/ib-verbs.h new file mode 100644 index 00000000..c385b62e --- /dev/null +++ b/xlators/protocol/transport/ib-verbs/src/ib-verbs.h @@ -0,0 +1,220 @@ +/* + Copyright (c) 2006-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _XPORT_IB_VERBS_H +#define _XPORT_IB_VERBS_H + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef MAX_IOVEC +#define MAX_IOVEC 16 +#endif /* MAX_IOVEC */ + +#include "xlator.h" +#include "event.h" +#include "ib-verbs-mem-types.h" + +#include +#include +#include +#include + +#define GF_DEFAULT_IBVERBS_LISTEN_PORT 6997 + +/* options per transport end point */ +struct _ib_verbs_options { + int32_t port; + char *device_name; + enum ibv_mtu mtu; + int32_t send_count; + int32_t recv_count; + uint64_t recv_size; + uint64_t send_size; +}; +typedef struct _ib_verbs_options ib_verbs_options_t; + + +struct _ib_verbs_header { + char colonO[3]; + uint32_t size1; + uint32_t size2; + char version; +} __attribute__((packed)); +typedef struct _ib_verbs_header ib_verbs_header_t; + +struct _ib_verbs_ioq { + union { + struct list_head list; + struct { + struct _ib_verbs_ioq *next; + struct _ib_verbs_ioq *prev; + }; + }; + ib_verbs_header_t header; + struct iovec vector[MAX_IOVEC]; + int count; + char *buf; + struct iobref *iobref; +}; +typedef struct _ib_verbs_ioq ib_verbs_ioq_t; + +/* represents one communication peer, two per transport_t */ +struct _ib_verbs_peer { + transport_t *trans; + struct ibv_qp *qp; + + int32_t recv_count; + int32_t send_count; + int32_t recv_size; + int32_t send_size; + + int32_t quota; + union { + struct list_head ioq; + struct { + ib_verbs_ioq_t *ioq_next; + ib_verbs_ioq_t *ioq_prev; + }; + }; + + /* QP attributes, needed to connect with remote QP */ + int32_t local_lid; + int32_t local_psn; + int32_t local_qpn; + int32_t remote_lid; + int32_t remote_psn; + int32_t remote_qpn; +}; +typedef struct _ib_verbs_peer ib_verbs_peer_t; + + +struct _ib_verbs_post { + struct _ib_verbs_post *next, *prev; + struct ibv_mr *mr; + char *buf; + int32_t buf_size; + char aux; + int32_t reused; + pthread_barrier_t wait; +}; +typedef struct _ib_verbs_post ib_verbs_post_t; + + +struct _ib_verbs_queue { + ib_verbs_post_t 
active_posts, passive_posts; + int32_t active_count, passive_count; + pthread_mutex_t lock; +}; +typedef struct _ib_verbs_queue ib_verbs_queue_t; + + +struct _ib_verbs_qpreg { + pthread_mutex_t lock; + int32_t count; + struct _qpent { + struct _qpent *next, *prev; + int32_t qp_num; + ib_verbs_peer_t *peer; + } ents[42]; +}; +typedef struct _ib_verbs_qpreg ib_verbs_qpreg_t; + +/* context per device, stored in global glusterfs_ctx_t->ib */ +struct _ib_verbs_device { + struct _ib_verbs_device *next; + const char *device_name; + struct ibv_context *context; + int32_t port; + struct ibv_pd *pd; + struct ibv_srq *srq; + ib_verbs_qpreg_t qpreg; + struct ibv_comp_channel *send_chan, *recv_chan; + struct ibv_cq *send_cq, *recv_cq; + ib_verbs_queue_t sendq, recvq; + pthread_t send_thread, recv_thread; +}; +typedef struct _ib_verbs_device ib_verbs_device_t; + +typedef enum { + IB_VERBS_HANDSHAKE_START = 0, + IB_VERBS_HANDSHAKE_SENDING_DATA, + IB_VERBS_HANDSHAKE_RECEIVING_DATA, + IB_VERBS_HANDSHAKE_SENT_DATA, + IB_VERBS_HANDSHAKE_RECEIVED_DATA, + IB_VERBS_HANDSHAKE_SENDING_ACK, + IB_VERBS_HANDSHAKE_RECEIVING_ACK, + IB_VERBS_HANDSHAKE_RECEIVED_ACK, + IB_VERBS_HANDSHAKE_COMPLETE, +} ib_verbs_handshake_state_t; + +struct ib_verbs_nbio { + int state; + char *buf; + int count; + struct iovec vector; + struct iovec *pending_vector; + int pending_count; +}; + + +struct _ib_verbs_private { + int32_t sock; + int32_t idx; + unsigned char connected; + unsigned char tcp_connected; + unsigned char ib_connected; + in_addr_t addr; + unsigned short port; + + /* IB Verbs Driver specific variables, pointers */ + ib_verbs_peer_t peer; + ib_verbs_device_t *device; + ib_verbs_options_t options; + + /* Used by trans->op->receive */ + char *data_ptr; + int32_t data_offset; + int32_t data_len; + + /* Mutex */ + pthread_mutex_t read_mutex; + pthread_mutex_t write_mutex; + pthread_barrier_t handshake_barrier; + char handshake_ret; + + pthread_mutex_t recv_mutex; + pthread_cond_t recv_cond; + + /* used 
during ib_verbs_handshake */ + struct { + struct ib_verbs_nbio incoming; + struct ib_verbs_nbio outgoing; + int state; + ib_verbs_header_t header; + char *buf; + size_t size; + } handshake; +}; +typedef struct _ib_verbs_private ib_verbs_private_t; + +#endif /* _XPORT_IB_VERBS_H */ diff --git a/xlators/protocol/transport/ib-verbs/src/name.c b/xlators/protocol/transport/ib-verbs/src/name.c new file mode 100644 index 00000000..a3e18481 --- /dev/null +++ b/xlators/protocol/transport/ib-verbs/src/name.c @@ -0,0 +1,712 @@ +/* + Copyright (c) 2008-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#include +#include +#include +#include +#include + +#ifdef CLIENT_PORT_CEILING +#undef CLIENT_PORT_CEILING +#endif + +#define CLIENT_PORT_CEILING 1024 + +#ifndef AF_INET_SDP +#define AF_INET_SDP 27 +#endif + +#include "transport.h" +#include "ib-verbs.h" + +int32_t +gf_resolve_ip6 (const char *hostname, + uint16_t port, + int family, + void **dnscache, + struct addrinfo **addr_info); + +static int32_t +af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr, + socklen_t sockaddr_len, int ceiling) +{ + int32_t ret = -1; + /* struct sockaddr_in sin = {0, }; */ + uint16_t port = ceiling - 1; + + while (port) + { + switch (sockaddr->sa_family) + { + case AF_INET6: + ((struct sockaddr_in6 *)sockaddr)->sin6_port = htons (port); + break; + + case AF_INET_SDP: + case AF_INET: + ((struct sockaddr_in *)sockaddr)->sin_port = htons (port); + break; + } + + ret = bind (fd, sockaddr, sockaddr_len); + + if (ret == 0) + break; + + if (ret == -1 && errno == EACCES) + break; + + port--; + } + + return ret; +} + +static int32_t +af_unix_client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t sockaddr_len, + int sock) +{ + data_t *path_data = NULL; + struct sockaddr_un *addr = NULL; + int32_t ret = -1; + + path_data = dict_get (this->xl->options, + "transport.ib-verbs.bind-path"); + if (path_data) { + char *path = data_to_str (path_data); + if (!path || strlen (path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "transport.ib-verbs.bind-path not specfied " + "for unix socket, letting connect to assign " + "default value"); + goto err; + } + + addr = (struct sockaddr_un *) sockaddr; + strcpy (addr->sun_path, path); + ret = bind (sock, (struct sockaddr *)addr, sockaddr_len); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot bind to unix-domain socket %d (%s)", + sock, strerror (errno)); + goto err; + } + } + +err: + return ret; +} + +static int32_t +client_fill_address_family (transport_t *this, struct sockaddr 
*sockaddr) +{ + data_t *address_family_data = NULL; + + address_family_data = dict_get (this->xl->options, + "transport.address-family"); + if (!address_family_data) { + data_t *remote_host_data = NULL, *connect_path_data = NULL; + remote_host_data = dict_get (this->xl->options, "remote-host"); + connect_path_data = dict_get (this->xl->options, + "transport.ib-verbs.connect-path"); + + if (!(remote_host_data || connect_path_data) || + (remote_host_data && connect_path_data)) { + gf_log (this->xl->name, GF_LOG_ERROR, + "address-family not specified and not able to " + "determine the same from other options " + "(remote-host:%s and connect-path:%s)", + data_to_str (remote_host_data), + data_to_str (connect_path_data)); + return -1; + } + + if (remote_host_data) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "address-family not specified, guessing it " + "to be inet/inet6"); + sockaddr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_DEBUG, + "address-family not specified, guessing it " + "to be unix"); + sockaddr->sa_family = AF_UNIX; + } + + } else { + char *address_family = data_to_str (address_family_data); + if (!strcasecmp (address_family, "unix")) { + sockaddr->sa_family = AF_UNIX; + } else if (!strcasecmp (address_family, "inet")) { + sockaddr->sa_family = AF_INET; + } else if (!strcasecmp (address_family, "inet6")) { + sockaddr->sa_family = AF_INET6; + } else if (!strcasecmp (address_family, "inet-sdp")) { + sockaddr->sa_family = AF_INET_SDP; + } else if (!strcasecmp (address_family, "inet/inet6") + || !strcasecmp (address_family, "inet6/inet")) { + sockaddr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address-family (%s) specified", + address_family); + return -1; + } + } + + return 0; +} + +static int32_t +af_inet_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + dict_t *options = this->xl->options; + data_t *remote_host_data = NULL; + data_t 
*remote_port_data = NULL; + char *remote_host = NULL; + uint16_t remote_port = 0; + struct addrinfo *addr_info = NULL; + int32_t ret = 0; + + remote_host_data = dict_get (options, "remote-host"); + if (remote_host_data == NULL) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-host missing in volume %s", + this->xl->name); + ret = -1; + goto err; + } + + remote_host = data_to_str (remote_host_data); + if (remote_host == NULL) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-host has data NULL in volume %s", + this->xl->name); + ret = -1; + goto err; + } + + remote_port_data = dict_get (options, "remote-port"); + if (remote_port_data == NULL) + { + gf_log (this->xl->name, GF_LOG_DEBUG, + "option remote-port missing in volume %s. " + "Defaulting to %d", + this->xl->name, GF_DEFAULT_IBVERBS_LISTEN_PORT); + + remote_port = GF_DEFAULT_IBVERBS_LISTEN_PORT; + } + else + { + remote_port = data_to_uint16 (remote_port_data); + } + + if (remote_port == (uint16_t)-1) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-port has invalid port in volume %s", + this->xl->name); + ret = -1; + goto err; + } + + /* TODO: gf_resolve is a blocking call. 
kick in some + non blocking dns techniques */ + ret = gf_resolve_ip6 (remote_host, remote_port, + sockaddr->sa_family, + &this->dnscache, &addr_info); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "DNS resolution failed on host %s", remote_host); + goto err; + } + + memcpy (sockaddr, addr_info->ai_addr, addr_info->ai_addrlen); + *sockaddr_len = addr_info->ai_addrlen; + +err: + return ret; +} + +static int32_t +af_unix_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + struct sockaddr_un *sockaddr_un = NULL; + char *connect_path = NULL; + data_t *connect_path_data = NULL; + int32_t ret = 0; + + connect_path_data = dict_get (this->xl->options, + "transport.ib-verbs.connect-path"); + if (!connect_path_data) { + gf_log (this->xl->name, GF_LOG_ERROR, + "option transport.ib-verbs.connect-path not " + "specified for address-family unix"); + ret = -1; + goto err; + } + + connect_path = data_to_str (connect_path_data); + if (!connect_path) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connect-path is null-string"); + ret = -1; + goto err; + } + + if (strlen (connect_path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connect-path value length %"GF_PRI_SIZET" > " + "%d octets", strlen (connect_path), UNIX_PATH_MAX); + ret = -1; + goto err; + } + + gf_log (this->xl->name, + GF_LOG_DEBUG, + "using connect-path %s", connect_path); + sockaddr_un = (struct sockaddr_un *)sockaddr; + strcpy (sockaddr_un->sun_path, connect_path); + *sockaddr_len = sizeof (struct sockaddr_un); + +err: + return ret; +} + +static int32_t +af_unix_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + data_t *listen_path_data = NULL; + char *listen_path = NULL; + int32_t ret = 0; + struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; + + + listen_path_data = dict_get (this->xl->options, + "transport.ib-verbs.listen-path"); + if (!listen_path_data) { + gf_log 
(this->xl->name, GF_LOG_ERROR, + "missing option listen-path"); + ret = -1; + goto err; + } + + listen_path = data_to_str (listen_path_data); + +#ifndef UNIX_PATH_MAX +#define UNIX_PATH_MAX 108 +#endif + + if (strlen (listen_path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_ERROR, + "option listen-path has value length %"GF_PRI_SIZET" > %d", + strlen (listen_path), UNIX_PATH_MAX); + ret = -1; + goto err; + } + + sunaddr->sun_family = AF_UNIX; + strcpy (sunaddr->sun_path, listen_path); + *addr_len = sizeof (struct sockaddr_un); + +err: + return ret; +} + +static int32_t +af_inet_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + struct addrinfo hints, *res = 0; + data_t *listen_port_data = NULL, *listen_host_data = NULL; + uint16_t listen_port = -1; + char service[NI_MAXSERV], *listen_host = NULL; + dict_t *options = NULL; + int32_t ret = 0; + + options = this->xl->options; + + listen_port_data = dict_get (options, "transport.ib-verbs.listen-port"); + listen_host_data = dict_get (options, "transport.ib-verbs.bind-address"); + + if (listen_port_data) + { + listen_port = data_to_uint16 (listen_port_data); + } else { + if (addr->sa_family == AF_INET6) { + struct sockaddr_in6 *in = (struct sockaddr_in6 *) addr; + in->sin6_addr = in6addr_any; + in->sin6_port = htons(listen_port); + *addr_len = sizeof(struct sockaddr_in6); + goto out; + } else if (addr->sa_family == AF_INET) { + struct sockaddr_in *in = (struct sockaddr_in *) addr; + in->sin_addr.s_addr = htonl(INADDR_ANY); + in->sin_port = htons(listen_port); + *addr_len = sizeof(struct sockaddr_in); + goto out; + } + } + + if (listen_port == (uint16_t) -1) + listen_port = GF_DEFAULT_IBVERBS_LISTEN_PORT; + + + if (listen_host_data) + { + listen_host = data_to_str (listen_host_data); + } + + memset (service, 0, sizeof (service)); + sprintf (service, "%d", listen_port); + + memset (&hints, 0, sizeof (hints)); + hints.ai_family = addr->sa_family; + hints.ai_socktype = 
SOCK_STREAM; + hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE; + + ret = getaddrinfo(listen_host, service, &hints, &res); + if (ret != 0) { + gf_log (this->xl->name, + GF_LOG_ERROR, + "getaddrinfo failed for host %s, service %s (%s)", + listen_host, service, gai_strerror (ret)); + ret = -1; + goto out; + } + + memcpy (addr, res->ai_addr, res->ai_addrlen); + *addr_len = res->ai_addrlen; + + freeaddrinfo (res); + +out: + return ret; +} + +int32_t +client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + int sock) +{ + int ret = 0; + + *sockaddr_len = sizeof (struct sockaddr_in6); + switch (sockaddr->sa_family) + { + case AF_INET_SDP: + case AF_INET: + *sockaddr_len = sizeof (struct sockaddr_in); + + case AF_INET6: + ret = af_inet_bind_to_port_lt_ceiling (sock, sockaddr, + *sockaddr_len, + CLIENT_PORT_CEILING); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_WARNING, + "cannot bind inet socket (%d) to port " + "less than %d (%s)", + sock, CLIENT_PORT_CEILING, strerror (errno)); + ret = 0; + } + break; + + case AF_UNIX: + *sockaddr_len = sizeof (struct sockaddr_un); + ret = af_unix_client_bind (this, (struct sockaddr *)sockaddr, + *sockaddr_len, sock); + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family %d", sockaddr->sa_family); + ret = -1; + break; + } + + return ret; +} + +int32_t +ibverbs_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + int32_t ret = 0; + char is_inet_sdp = 0; + + ret = client_fill_address_family (this, sockaddr); + if (ret) { + ret = -1; + goto err; + } + + switch (sockaddr->sa_family) + { + case AF_INET_SDP: + sockaddr->sa_family = AF_INET; + is_inet_sdp = 1; + + case AF_INET: + case AF_INET6: + case AF_UNSPEC: + ret = af_inet_client_get_remote_sockaddr (this, + sockaddr, + sockaddr_len); + + if (is_inet_sdp) { + sockaddr->sa_family = AF_INET_SDP; + } + + break; + + case AF_UNIX: + ret = 
af_unix_client_get_remote_sockaddr (this, + sockaddr, + sockaddr_len); + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address-family %d", sockaddr->sa_family); + ret = -1; + } + +err: + return ret; +} + +int32_t +ibverbs_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + data_t *address_family_data = NULL; + int32_t ret = 0; + char is_inet_sdp = 0; + + address_family_data = dict_get (this->xl->options, + "transport.address-family"); + if (address_family_data) { + char *address_family = NULL; + address_family = data_to_str (address_family_data); + + if (!strcasecmp (address_family, "inet")) { + addr->sa_family = AF_INET; + } else if (!strcasecmp (address_family, "inet6")) { + addr->sa_family = AF_INET6; + } else if (!strcasecmp (address_family, "inet-sdp")) { + addr->sa_family = AF_INET_SDP; + } else if (!strcasecmp (address_family, "unix")) { + addr->sa_family = AF_UNIX; + } else if (!strcasecmp (address_family, "inet/inet6") + || !strcasecmp (address_family, "inet6/inet")) { + addr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family (%s) specified", + address_family); + ret = -1; + goto err; + } + } else { + gf_log (this->xl->name, GF_LOG_DEBUG, + "option address-family not specified, defaulting " + "to inet/inet6"); + addr->sa_family = AF_UNSPEC; + } + + switch (addr->sa_family) + { + case AF_INET_SDP: + is_inet_sdp = 1; + addr->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + case AF_UNSPEC: + ret = af_inet_server_get_local_sockaddr (this, addr, addr_len); + if (is_inet_sdp && !ret) { + addr->sa_family = AF_INET_SDP; + } + break; + + case AF_UNIX: + ret = af_unix_server_get_local_sockaddr (this, addr, addr_len); + break; + } + +err: + return ret; +} + +int32_t +fill_inet6_inet_identifiers (transport_t *this, struct sockaddr_storage *addr, + int32_t addr_len, char *identifier) +{ + int32_t ret = 0, tmpaddr_len = 0; + char 
service[NI_MAXSERV], host[NI_MAXHOST]; + struct sockaddr_storage tmpaddr; + + memset (&tmpaddr, 0, sizeof (tmpaddr)); + tmpaddr = *addr; + tmpaddr_len = addr_len; + + if (((struct sockaddr *) &tmpaddr)->sa_family == AF_INET6) { + int32_t one_to_four, four_to_eight, twelve_to_sixteen; + int16_t eight_to_ten, ten_to_twelve; + + one_to_four = four_to_eight = twelve_to_sixteen = 0; + eight_to_ten = ten_to_twelve = 0; + + one_to_four = ((struct sockaddr_in6 *) + &tmpaddr)->sin6_addr.s6_addr32[0]; + four_to_eight = ((struct sockaddr_in6 *) + &tmpaddr)->sin6_addr.s6_addr32[1]; +#ifdef GF_SOLARIS_HOST_OS + eight_to_ten = S6_ADDR16(((struct sockaddr_in6 *) + &tmpaddr)->sin6_addr)[4]; +#else + eight_to_ten = ((struct sockaddr_in6 *) + &tmpaddr)->sin6_addr.s6_addr16[4]; +#endif + +#ifdef GF_SOLARIS_HOST_OS + ten_to_twelve = S6_ADDR16(((struct sockaddr_in6 *) + &tmpaddr)->sin6_addr)[5]; +#else + ten_to_twelve = ((struct sockaddr_in6 *) + &tmpaddr)->sin6_addr.s6_addr16[5]; +#endif + twelve_to_sixteen = ((struct sockaddr_in6 *) + &tmpaddr)->sin6_addr.s6_addr32[3]; + + /* ipv4 mapped ipv6 address has + bits 0-80: 0 + bits 80-96: 0xffff + bits 96-128: ipv4 address + */ + + if (one_to_four == 0 && + four_to_eight == 0 && + eight_to_ten == 0 && + ten_to_twelve == -1) { + struct sockaddr_in *in_ptr = (struct sockaddr_in *)&tmpaddr; + memset (&tmpaddr, 0, sizeof (tmpaddr)); + + in_ptr->sin_family = AF_INET; + in_ptr->sin_port = ((struct sockaddr_in6 *)addr)->sin6_port; + in_ptr->sin_addr.s_addr = twelve_to_sixteen; + tmpaddr_len = sizeof (*in_ptr); + } + } + + ret = getnameinfo ((struct sockaddr *) &tmpaddr, + tmpaddr_len, + host, sizeof (host), + service, sizeof (service), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret != 0) { + gf_log (this->xl->name, + GF_LOG_ERROR, + "getnameinfo failed (%s)", gai_strerror (ret)); + } + + sprintf (identifier, "%s:%s", host, service); + + return ret; +} + +int32_t +get_transport_identifiers (transport_t *this) +{ + int32_t ret = 0; + char 
is_inet_sdp = 0; + + switch (((struct sockaddr *) &this->myinfo.sockaddr)->sa_family) + { + case AF_INET_SDP: + is_inet_sdp = 1; + ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + { + ret = fill_inet6_inet_identifiers (this, + &this->myinfo.sockaddr, + this->myinfo.sockaddr_len, + this->myinfo.identifier); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "can't fill inet/inet6 identifier for server"); + goto err; + } + + ret = fill_inet6_inet_identifiers (this, + &this->peerinfo.sockaddr, + this->peerinfo.sockaddr_len, + this->peerinfo.identifier); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "can't fill inet/inet6 identifier for client"); + goto err; + } + + if (is_inet_sdp) { + ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET_SDP; + } + } + break; + + case AF_UNIX: + { + struct sockaddr_un *sunaddr = NULL; + + sunaddr = (struct sockaddr_un *) &this->myinfo.sockaddr; + strcpy (this->myinfo.identifier, sunaddr->sun_path); + + sunaddr = (struct sockaddr_un *) &this->peerinfo.sockaddr; + strcpy (this->peerinfo.identifier, sunaddr->sun_path); + } + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family (%d)", + ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family); + ret = -1; + break; + } + +err: + return ret; +} diff --git a/xlators/protocol/transport/ib-verbs/src/name.h b/xlators/protocol/transport/ib-verbs/src/name.h new file mode 100644 index 00000000..4f0f4771 --- /dev/null +++ b/xlators/protocol/transport/ib-verbs/src/name.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2008-2009 Gluster, Inc. + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _IB_VERBS_NAME_H +#define _IB_VERBS_NAME_H + +#include +#include + +#include "compat.h" + +int32_t +client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + int sock); + +int32_t +ibverbs_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len); + +int32_t +ibverbs_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len); + +int32_t +get_transport_identifiers (transport_t *this); + +#endif /* _IB_VERBS_NAME_H */ diff --git a/xlators/protocol/transport/socket/Makefile.am b/xlators/protocol/transport/socket/Makefile.am new file mode 100644 index 00000000..f963effe --- /dev/null +++ b/xlators/protocol/transport/socket/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src \ No newline at end of file diff --git a/xlators/protocol/transport/socket/src/Makefile.am b/xlators/protocol/transport/socket/src/Makefile.am new file mode 100644 index 00000000..f5c46f1a --- /dev/null +++ b/xlators/protocol/transport/socket/src/Makefile.am @@ -0,0 +1,19 @@ +# TODO : change to proper transport dir + +transport_LTLIBRARIES = socket.la +transportdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/transport + +socket_la_LDFLAGS = -module -avoidversion + +socket_la_SOURCES = socket.c name.c +socket_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ + 
$(top_builddir)/xlators/protocol/lib/src/libgfproto.la + +noinst_HEADERS = socket.h name.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) \ + -I$(top_srcdir)/xlators/protocol/transport/socket/src \ + -I$(top_srcdir)/xlators/protocol/lib/src + +CLEANFILES = *~ diff --git a/xlators/protocol/transport/socket/src/name.c b/xlators/protocol/transport/socket/src/name.c new file mode 100644 index 00000000..120a669c --- /dev/null +++ b/xlators/protocol/transport/socket/src/name.c @@ -0,0 +1,737 @@ +/* + Copyright (c) 2008-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#include +#include +#include +#include +#include +#include + +#ifdef CLIENT_PORT_CEILING +#undef CLIENT_PORT_CEILING +#endif + +#define CLIENT_PORT_CEILING 1024 + +#ifndef AF_INET_SDP +#define AF_INET_SDP 27 +#endif + +#include "transport.h" +#include "socket.h" + +int32_t +gf_resolve_ip6 (const char *hostname, + uint16_t port, + int family, + void **dnscache, + struct addrinfo **addr_info); + +static int32_t +af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr, + socklen_t sockaddr_len, int ceiling) +{ + int32_t ret = -1; + /* struct sockaddr_in sin = {0, }; */ + uint16_t port = ceiling - 1; + + while (port) + { + switch (sockaddr->sa_family) + { + case AF_INET6: + ((struct sockaddr_in6 *)sockaddr)->sin6_port = htons (port); + break; + + case AF_INET_SDP: + case AF_INET: + ((struct sockaddr_in *)sockaddr)->sin_port = htons (port); + break; + } + + ret = bind (fd, sockaddr, sockaddr_len); + + if (ret == 0) + break; + + if (ret == -1 && errno == EACCES) + break; + + port--; + } + + return ret; +} + +static int32_t +af_unix_client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t sockaddr_len, + int sock) +{ + data_t *path_data = NULL; + struct sockaddr_un *addr = NULL; + int32_t ret = 0; + + path_data = dict_get (this->xl->options, "transport.socket.bind-path"); + if (path_data) { + char *path = data_to_str (path_data); + if (!path || strlen (path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_TRACE, + "bind-path not specfied for unix socket, " + "letting connect to assign default value"); + goto err; + } + + addr = (struct sockaddr_un *) sockaddr; + strcpy (addr->sun_path, path); + ret = bind (sock, (struct sockaddr *)addr, sockaddr_len); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot bind to unix-domain socket %d (%s)", + sock, strerror (errno)); + goto err; + } + } else { + gf_log (this->xl->name, GF_LOG_TRACE, + "bind-path not specfied for unix socket, " + "letting connect to assign default 
value"); + } + +err: + return ret; +} + +int32_t +client_fill_address_family (transport_t *this, sa_family_t *sa_family) +{ + data_t *address_family_data = NULL; + int32_t ret = -1; + + if (sa_family == NULL) { + goto out; + } + + address_family_data = dict_get (this->xl->options, + "transport.address-family"); + if (!address_family_data) { + data_t *remote_host_data = NULL, *connect_path_data = NULL; + remote_host_data = dict_get (this->xl->options, "remote-host"); + connect_path_data = dict_get (this->xl->options, + "transport.socket.connect-path"); + + if (!(remote_host_data || connect_path_data) || + (remote_host_data && connect_path_data)) { + gf_log (this->xl->name, GF_LOG_ERROR, + "transport.address-family not specified and " + "not able to determine the " + "same from other options (remote-host:%s and " + "transport.unix.connect-path:%s)", + data_to_str (remote_host_data), + data_to_str (connect_path_data)); + goto out; + } + + if (remote_host_data) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "address-family not specified, guessing it " + "to be inet/inet6"); + *sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_DEBUG, + "address-family not specified, guessing it " + "to be unix"); + *sa_family = AF_UNIX; + } + + } else { + char *address_family = data_to_str (address_family_data); + if (!strcasecmp (address_family, "unix")) { + *sa_family = AF_UNIX; + } else if (!strcasecmp (address_family, "inet")) { + *sa_family = AF_INET; + } else if (!strcasecmp (address_family, "inet6")) { + *sa_family = AF_INET6; + } else if (!strcasecmp (address_family, "inet-sdp")) { + *sa_family = AF_INET_SDP; + } else if (!strcasecmp (address_family, "inet/inet6") + || !strcasecmp (address_family, "inet6/inet")) { + *sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address-family (%s) specified", + address_family); + goto out; + } + } + + ret = 0; + +out: + return ret; +} + +static int32_t +af_inet_client_get_remote_sockaddr 
(transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + dict_t *options = this->xl->options; + data_t *remote_host_data = NULL; + data_t *remote_port_data = NULL; + char *remote_host = NULL; + uint16_t remote_port = 0; + struct addrinfo *addr_info = NULL; + int32_t ret = 0; + + remote_host_data = dict_get (options, "remote-host"); + if (remote_host_data == NULL) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-host missing in volume %s", this->xl->name); + ret = -1; + goto err; + } + + remote_host = data_to_str (remote_host_data); + if (remote_host == NULL) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-host has data NULL in volume %s", this->xl->name); + ret = -1; + goto err; + } + + remote_port_data = dict_get (options, "remote-port"); + if (remote_port_data == NULL) + { + gf_log (this->xl->name, GF_LOG_TRACE, + "option remote-port missing in volume %s. Defaulting to %d", + this->xl->name, GF_DEFAULT_SOCKET_LISTEN_PORT); + + remote_port = GF_DEFAULT_SOCKET_LISTEN_PORT; + } + else + { + remote_port = data_to_uint16 (remote_port_data); + } + + if (remote_port == (uint16_t)-1) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-port has invalid port in volume %s", + this->xl->name); + ret = -1; + goto err; + } + + /* TODO: gf_resolve is a blocking call. 
kick in some + non blocking dns techniques */ + ret = gf_resolve_ip6 (remote_host, remote_port, + sockaddr->sa_family, &this->dnscache, &addr_info); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "DNS resolution failed on host %s", remote_host); + goto err; + } + + memcpy (sockaddr, addr_info->ai_addr, addr_info->ai_addrlen); + *sockaddr_len = addr_info->ai_addrlen; + +err: + return ret; +} + +static int32_t +af_unix_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + struct sockaddr_un *sockaddr_un = NULL; + char *connect_path = NULL; + data_t *connect_path_data = NULL; + int32_t ret = 0; + + connect_path_data = dict_get (this->xl->options, + "transport.socket.connect-path"); + if (!connect_path_data) { + gf_log (this->xl->name, GF_LOG_ERROR, + "option transport.unix.connect-path not specified for " + "address-family unix"); + ret = -1; + goto err; + } + + connect_path = data_to_str (connect_path_data); + if (!connect_path) { + gf_log (this->xl->name, GF_LOG_ERROR, + "transport.unix.connect-path is null-string"); + ret = -1; + goto err; + } + + if (strlen (connect_path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connect-path value length %"GF_PRI_SIZET" > %d octets", + strlen (connect_path), UNIX_PATH_MAX); + ret = -1; + goto err; + } + + gf_log (this->xl->name, GF_LOG_TRACE, + "using connect-path %s", connect_path); + sockaddr_un = (struct sockaddr_un *)sockaddr; + strcpy (sockaddr_un->sun_path, connect_path); + *sockaddr_len = sizeof (struct sockaddr_un); + +err: + return ret; +} + +static int32_t +af_unix_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + data_t *listen_path_data = NULL; + char *listen_path = NULL; + int32_t ret = 0; + struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; + + + listen_path_data = dict_get (this->xl->options, + "transport.socket.listen-path"); + if (!listen_path_data) { + gf_log 
(this->xl->name, GF_LOG_ERROR, + "missing option transport.socket.listen-path"); + ret = -1; + goto err; + } + + listen_path = data_to_str (listen_path_data); + +#ifndef UNIX_PATH_MAX +#define UNIX_PATH_MAX 108 +#endif + + if (strlen (listen_path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_ERROR, + "option transport.unix.listen-path has value length " + "%"GF_PRI_SIZET" > %d", + strlen (listen_path), UNIX_PATH_MAX); + ret = -1; + goto err; + } + + sunaddr->sun_family = AF_UNIX; + strcpy (sunaddr->sun_path, listen_path); + *addr_len = sizeof (struct sockaddr_un); + +err: + return ret; +} + +static int32_t +af_inet_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + struct addrinfo hints, *res = 0; + data_t *listen_port_data = NULL, *listen_host_data = NULL; + uint16_t listen_port = -1; + char service[NI_MAXSERV], *listen_host = NULL; + dict_t *options = NULL; + int32_t ret = 0; + + options = this->xl->options; + + listen_port_data = dict_get (options, "transport.socket.listen-port"); + listen_host_data = dict_get (options, "transport.socket.bind-address"); + + if (listen_port_data) + { + listen_port = data_to_uint16 (listen_port_data); + } + + if (listen_port == (uint16_t) -1) + listen_port = GF_DEFAULT_SOCKET_LISTEN_PORT; + + + if (listen_host_data) + { + listen_host = data_to_str (listen_host_data); + } else { + if (addr->sa_family == AF_INET6) { + struct sockaddr_in6 *in = (struct sockaddr_in6 *) addr; + in->sin6_addr = in6addr_any; + in->sin6_port = htons(listen_port); + *addr_len = sizeof(struct sockaddr_in6); + goto out; + } else if (addr->sa_family == AF_INET) { + struct sockaddr_in *in = (struct sockaddr_in *) addr; + in->sin_addr.s_addr = htonl(INADDR_ANY); + in->sin_port = htons(listen_port); + *addr_len = sizeof(struct sockaddr_in); + goto out; + } + } + + memset (service, 0, sizeof (service)); + sprintf (service, "%d", listen_port); + + memset (&hints, 0, sizeof (hints)); + hints.ai_family = 
addr->sa_family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE; + + ret = getaddrinfo(listen_host, service, &hints, &res); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "getaddrinfo failed for host %s, service %s (%s)", + listen_host, service, gai_strerror (ret)); + ret = -1; + goto out; + } + + memcpy (addr, res->ai_addr, res->ai_addrlen); + *addr_len = res->ai_addrlen; + + freeaddrinfo (res); + +out: + return ret; +} + +int32_t +client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + int sock) +{ + int ret = 0; + + *sockaddr_len = sizeof (struct sockaddr_in6); + switch (sockaddr->sa_family) + { + case AF_INET_SDP: + case AF_INET: + *sockaddr_len = sizeof (struct sockaddr_in); + + case AF_INET6: + ret = af_inet_bind_to_port_lt_ceiling (sock, sockaddr, + *sockaddr_len, CLIENT_PORT_CEILING); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_WARNING, + "cannot bind inet socket (%d) to port less than %d (%s)", + sock, CLIENT_PORT_CEILING, strerror (errno)); + ret = 0; + } + break; + + case AF_UNIX: + *sockaddr_len = sizeof (struct sockaddr_un); + ret = af_unix_client_bind (this, (struct sockaddr *)sockaddr, + *sockaddr_len, sock); + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family %d", sockaddr->sa_family); + ret = -1; + break; + } + + return ret; +} + +int32_t +socket_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + sa_family_t *sa_family) +{ + int32_t ret = 0; + + if ((sockaddr == NULL) || (sockaddr_len == NULL) + || (sa_family == NULL)) { + ret = -1; + goto err; + } + + + ret = client_fill_address_family (this, &sockaddr->sa_family); + if (ret) { + ret = -1; + goto err; + } + + *sa_family = sockaddr->sa_family; + + switch (sockaddr->sa_family) + { + case AF_INET_SDP: + sockaddr->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + case AF_UNSPEC: + ret = 
af_inet_client_get_remote_sockaddr (this, sockaddr, + sockaddr_len); + break; + + case AF_UNIX: + ret = af_unix_client_get_remote_sockaddr (this, sockaddr, + sockaddr_len); + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address-family %d", sockaddr->sa_family); + ret = -1; + } + + if (*sa_family == AF_UNSPEC) { + *sa_family = sockaddr->sa_family; + } + +err: + return ret; +} + + +int32_t +server_fill_address_family (transport_t *this, sa_family_t *sa_family) +{ + data_t *address_family_data = NULL; + int32_t ret = -1; + + if (sa_family == NULL) { + goto out; + } + + address_family_data = dict_get (this->xl->options, + "transport.address-family"); + if (address_family_data) { + char *address_family = NULL; + address_family = data_to_str (address_family_data); + + if (!strcasecmp (address_family, "inet")) { + *sa_family = AF_INET; + } else if (!strcasecmp (address_family, "inet6")) { + *sa_family = AF_INET6; + } else if (!strcasecmp (address_family, "inet-sdp")) { + *sa_family = AF_INET_SDP; + } else if (!strcasecmp (address_family, "unix")) { + *sa_family = AF_UNIX; + } else if (!strcasecmp (address_family, "inet/inet6") + || !strcasecmp (address_family, "inet6/inet")) { + *sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family (%s) specified", address_family); + goto out; + } + } else { + gf_log (this->xl->name, GF_LOG_DEBUG, + "option address-family not specified, defaulting to inet/inet6"); + *sa_family = AF_UNSPEC; + } + + ret = 0; +out: + return ret; +} + + +int32_t +socket_server_get_local_sockaddr (transport_t *this, struct sockaddr *addr, + socklen_t *addr_len, sa_family_t *sa_family) +{ + int32_t ret = -1; + + if ((addr == NULL) || (addr_len == NULL) || (sa_family == NULL)) { + goto err; + } + + ret = server_fill_address_family (this, &addr->sa_family); + if (ret == -1) { + goto err; + } + + *sa_family = addr->sa_family; + + switch (addr->sa_family) + { + case AF_INET_SDP: + 
addr->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + case AF_UNSPEC: + ret = af_inet_server_get_local_sockaddr (this, addr, addr_len); + break; + + case AF_UNIX: + ret = af_unix_server_get_local_sockaddr (this, addr, addr_len); + break; + } + + if (*sa_family == AF_UNSPEC) { + *sa_family = addr->sa_family; + } + +err: + return ret; +} + +int32_t +fill_inet6_inet_identifiers (transport_t *this, struct sockaddr_storage *addr, + int32_t addr_len, char *identifier) +{ + int32_t ret = 0, tmpaddr_len = 0; + char service[NI_MAXSERV], host[NI_MAXHOST]; + struct sockaddr_storage tmpaddr; + + memset (&tmpaddr, 0, sizeof (tmpaddr)); + tmpaddr = *addr; + tmpaddr_len = addr_len; + + if (((struct sockaddr *) &tmpaddr)->sa_family == AF_INET6) { + int32_t one_to_four, four_to_eight, twelve_to_sixteen; + int16_t eight_to_ten, ten_to_twelve; + + one_to_four = four_to_eight = twelve_to_sixteen = 0; + eight_to_ten = ten_to_twelve = 0; + + one_to_four = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[0]; + four_to_eight = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[1]; +#ifdef GF_SOLARIS_HOST_OS + eight_to_ten = S6_ADDR16(((struct sockaddr_in6 *) &tmpaddr)->sin6_addr)[4]; +#else + eight_to_ten = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr16[4]; +#endif + +#ifdef GF_SOLARIS_HOST_OS + ten_to_twelve = S6_ADDR16(((struct sockaddr_in6 *) &tmpaddr)->sin6_addr)[5]; +#else + ten_to_twelve = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr16[5]; +#endif + + twelve_to_sixteen = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[3]; + + /* ipv4 mapped ipv6 address has + bits 0-80: 0 + bits 80-96: 0xffff + bits 96-128: ipv4 address + */ + + if (one_to_four == 0 && + four_to_eight == 0 && + eight_to_ten == 0 && + ten_to_twelve == -1) { + struct sockaddr_in *in_ptr = (struct sockaddr_in *)&tmpaddr; + memset (&tmpaddr, 0, sizeof (tmpaddr)); + + in_ptr->sin_family = AF_INET; + in_ptr->sin_port = ((struct sockaddr_in6 *)addr)->sin6_port; + 
in_ptr->sin_addr.s_addr = twelve_to_sixteen; + tmpaddr_len = sizeof (*in_ptr); + } + } + + ret = getnameinfo ((struct sockaddr *) &tmpaddr, + tmpaddr_len, + host, sizeof (host), + service, sizeof (service), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "getnameinfo failed (%s)", gai_strerror (ret)); + } + + sprintf (identifier, "%s:%s", host, service); + + return ret; +} + +int32_t +get_transport_identifiers (transport_t *this) +{ + int32_t ret = 0; + char is_inet_sdp = 0; + + switch (((struct sockaddr *) &this->myinfo.sockaddr)->sa_family) + { + case AF_INET_SDP: + is_inet_sdp = 1; + ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + { + ret = fill_inet6_inet_identifiers (this, + &this->myinfo.sockaddr, + this->myinfo.sockaddr_len, + this->myinfo.identifier); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot fill inet/inet6 identifier for server"); + goto err; + } + + ret = fill_inet6_inet_identifiers (this, + &this->peerinfo.sockaddr, + this->peerinfo.sockaddr_len, + this->peerinfo.identifier); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot fill inet/inet6 identifier for client"); + goto err; + } + + if (is_inet_sdp) { + ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET_SDP; + } + } + break; + + case AF_UNIX: + { + struct sockaddr_un *sunaddr = NULL; + + sunaddr = (struct sockaddr_un *) &this->myinfo.sockaddr; + strcpy (this->myinfo.identifier, sunaddr->sun_path); + + sunaddr = (struct sockaddr_un *) &this->peerinfo.sockaddr; + strcpy (this->peerinfo.identifier, sunaddr->sun_path); + } + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family (%d)", + ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family); + ret = -1; + break; + } + +err: + return ret; +} 
diff --git a/xlators/protocol/transport/socket/src/name.h b/xlators/protocol/transport/socket/src/name.h new file mode 100644 index 00000000..f50a7b7f --- /dev/null +++ b/xlators/protocol/transport/socket/src/name.h @@ -0,0 +1,44 @@ +/* + Copyright (c) 2008-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _SOCKET_NAME_H +#define _SOCKET_NAME_H + +#include "compat.h" + +int32_t +client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + int sock); + +int32_t +socket_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + sa_family_t *sa_family); + +int32_t +socket_server_get_local_sockaddr (transport_t *this, struct sockaddr *addr, + socklen_t *addr_len, sa_family_t *sa_family); + +int32_t +get_transport_identifiers (transport_t *this); + +#endif /* _SOCKET_NAME_H */ diff --git a/xlators/protocol/transport/socket/src/socket-mem-types.h b/xlators/protocol/transport/socket/src/socket-mem-types.h new file mode 100644 index 00000000..f50f4a75 --- /dev/null +++ b/xlators/protocol/transport/socket/src/socket-mem-types.h @@ -0,0 +1,36 @@ + +/* + Copyright (c) 2008-2009 Gluster, Inc. + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + + +#ifndef __SOCKET_MEM_TYPES_H__ +#define __SOCKET_MEM_TYPES_H__ + +#include "mem-types.h" + +enum gf_socket_mem_types_ { + gf_socket_mt_socket_private_t = gf_common_mt_end + 1, + gf_socket_mt_ioq, + gf_socket_mt_transport_t, + gf_socket_mt_socket_local_t, + gf_socket_mt_char, + gf_socket_mt_end +}; +#endif + diff --git a/xlators/protocol/transport/socket/src/socket.c b/xlators/protocol/transport/socket/src/socket.c new file mode 100644 index 00000000..7f7f8093 --- /dev/null +++ b/xlators/protocol/transport/socket/src/socket.c @@ -0,0 +1,1552 @@ +/* + Copyright (c) 2008-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "socket.h" +#include "name.h" +#include "dict.h" +#include "transport.h" +#include "logging.h" +#include "xlator.h" +#include "byte-order.h" +#include "common-utils.h" +#include "compat-errno.h" + +#include +#include +#include + + +#define GF_LOG_ERRNO(errno) ((errno == ENOTCONN) ? GF_LOG_DEBUG : GF_LOG_ERROR) +#define SA(ptr) ((struct sockaddr *)ptr) + +int socket_init (transport_t *this); + +/* + * return value: + * 0 = success (completed) + * -1 = error + * > 0 = incomplete + */ + +int +__socket_rwv (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count, + int write) +{ + socket_private_t *priv = NULL; + int sock = -1; + int ret = -1; + struct iovec *opvector = NULL; + int opcount = 0; + int moved = 0; + + priv = this->private; + sock = priv->sock; + + opvector = vector; + opcount = count; + + while (opcount) { + if (write) { + ret = writev (sock, opvector, opcount); + + if (ret == 0 || (ret == -1 && errno == EAGAIN)) { + /* done for now */ + break; + } + } else { + ret = readv (sock, opvector, opcount); + + if (ret == -1 && errno == EAGAIN) { + /* done for now */ + break; + } + } + + if (ret == 0) { + /* Mostly due to 'umount' in client */ + gf_log (this->xl->name, GF_LOG_TRACE, + "EOF from peer %s", this->peerinfo.identifier); + opcount = -1; + errno = ENOTCONN; + break; + } + + if (ret == -1) { + if (errno == EINTR) + continue; + + gf_log (this->xl->name, GF_LOG_TRACE, + "%s failed (%s)", write ? 
"writev" : "readv", + strerror (errno)); + opcount = -1; + break; + } + + moved = 0; + + while (moved < ret) { + if ((ret - moved) >= opvector[0].iov_len) { + moved += opvector[0].iov_len; + opvector++; + opcount--; + } else { + opvector[0].iov_len -= (ret - moved); + opvector[0].iov_base += (ret - moved); + moved += (ret - moved); + } + while (opcount && !opvector[0].iov_len) { + opvector++; + opcount--; + } + } + } + + if (pending_vector) + *pending_vector = opvector; + + if (pending_count) + *pending_count = opcount; + + return opcount; +} + + +int +__socket_readv (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count) +{ + int ret = -1; + + ret = __socket_rwv (this, vector, count, + pending_vector, pending_count, 0); + + return ret; +} + + +int +__socket_writev (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count) +{ + int ret = -1; + + ret = __socket_rwv (this, vector, count, + pending_vector, pending_count, 1); + + return ret; +} + + +int +__socket_disconnect (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + if (priv->sock != -1) { + ret = shutdown (priv->sock, SHUT_RDWR); + priv->connected = -1; + gf_log (this->xl->name, GF_LOG_TRACE, + "shutdown() returned %d. 
set connection state to -1", + ret); + } + + return ret; +} + + +int +__socket_server_bind (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + int opt = 1; + + priv = this->private; + + ret = setsockopt (priv->sock, SOL_SOCKET, SO_REUSEADDR, + &opt, sizeof (opt)); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setsockopt() for SO_REUSEADDR failed (%s)", + strerror (errno)); + } + + ret = bind (priv->sock, (struct sockaddr *)&this->myinfo.sockaddr, + this->myinfo.sockaddr_len); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "binding to %s failed: %s", + this->myinfo.identifier, strerror (errno)); + if (errno == EADDRINUSE) { + gf_log (this->xl->name, GF_LOG_ERROR, + "Port is already in use"); + } + } + + return ret; +} + + +int +__socket_nonblock (int fd) +{ + int flags = 0; + int ret = -1; + + flags = fcntl (fd, F_GETFL); + + if (flags != -1) + ret = fcntl (fd, F_SETFL, flags | O_NONBLOCK); + + return ret; +} + + +int +__socket_nodelay (int fd) +{ + int on = 1; + int ret = -1; + + ret = setsockopt (fd, IPPROTO_TCP, TCP_NODELAY, + &on, sizeof (on)); + if (!ret) + gf_log ("", GF_LOG_TRACE, + "NODELAY enabled for socket %d", fd); + + return ret; +} + +int +__socket_connect_finish (int fd) +{ + int ret = -1; + int optval = 0; + socklen_t optlen = sizeof (int); + + ret = getsockopt (fd, SOL_SOCKET, SO_ERROR, (void *)&optval, &optlen); + + if (ret == 0 && optval) { + errno = optval; + ret = -1; + } + + return ret; +} + + +void +__socket_reset (transport_t *this) +{ + socket_private_t *priv = NULL; + + priv = this->private; + + /* TODO: use mem-pool on incoming data */ + + if (priv->incoming.hdr_p) + GF_FREE (priv->incoming.hdr_p); + + if (priv->incoming.iobuf) + iobuf_unref (priv->incoming.iobuf); + + memset (&priv->incoming, 0, sizeof (priv->incoming)); + + event_unregister (this->xl->ctx->event_pool, priv->sock, priv->idx); + close (priv->sock); + priv->sock = -1; + priv->idx = -1; + priv->connected = -1; +} + + 
+struct ioq * +__socket_ioq_new (transport_t *this, char *buf, int len, + struct iovec *vector, int count, struct iobref *iobref) +{ + socket_private_t *priv = NULL; + struct ioq *entry = NULL; + + priv = this->private; + + /* TODO: use mem-pool */ + entry = GF_CALLOC (1, sizeof (*entry), + gf_common_mt_ioq); + if (!entry) + return NULL; + + assert (count <= (MAX_IOVEC-2)); + + entry->header.colonO[0] = ':'; + entry->header.colonO[1] = 'O'; + entry->header.colonO[2] = '\0'; + entry->header.version = 42; + entry->header.size1 = hton32 (len); + entry->header.size2 = hton32 (iov_length (vector, count)); + + entry->vector[0].iov_base = &entry->header; + entry->vector[0].iov_len = sizeof (entry->header); + entry->count++; + + entry->vector[1].iov_base = buf; + entry->vector[1].iov_len = len; + entry->count++; + + if (vector && count) { + memcpy (&entry->vector[2], vector, sizeof (*vector) * count); + entry->count += count; + } + + entry->pending_vector = entry->vector; + entry->pending_count = entry->count; + + if (iobref) + entry->iobref = iobref_ref (iobref); + + entry->buf = buf; + + INIT_LIST_HEAD (&entry->list); + + return entry; +} + + +void +__socket_ioq_entry_free (struct ioq *entry) +{ + list_del_init (&entry->list); + if (entry->iobref) + iobref_unref (entry->iobref); + + /* TODO: use mem-pool */ + GF_FREE (entry->buf); + + /* TODO: use mem-pool */ + GF_FREE (entry); +} + + +void +__socket_ioq_flush (transport_t *this) +{ + socket_private_t *priv = NULL; + struct ioq *entry = NULL; + + priv = this->private; + + while (!list_empty (&priv->ioq)) { + entry = priv->ioq_next; + __socket_ioq_entry_free (entry); + } + + return; +} + + +int +__socket_ioq_churn_entry (transport_t *this, struct ioq *entry) +{ + int ret = -1; + + ret = __socket_writev (this, entry->pending_vector, + entry->pending_count, + &entry->pending_vector, + &entry->pending_count); + + if (ret == 0) { + /* current entry was completely written */ + assert (entry->pending_count == 0); + 
__socket_ioq_entry_free (entry); + } + + return ret; +} + + +int +__socket_ioq_churn (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = 0; + struct ioq *entry = NULL; + + priv = this->private; + + while (!list_empty (&priv->ioq)) { + /* pick next entry */ + entry = priv->ioq_next; + + ret = __socket_ioq_churn_entry (this, entry); + + if (ret != 0) + break; + } + + if (list_empty (&priv->ioq)) { + /* all pending writes done, not interested in POLLOUT */ + priv->idx = event_select_on (this->xl->ctx->event_pool, + priv->sock, priv->idx, -1, 0); + } + + return ret; +} + + +int +socket_event_poll_err (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + __socket_ioq_flush (this); + __socket_reset (this); + } + pthread_mutex_unlock (&priv->lock); + + xlator_notify (this->xl, GF_EVENT_POLLERR, this); + + return ret; +} + + +int +socket_event_poll_out (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + if (priv->connected == 1) { + ret = __socket_ioq_churn (this); + + if (ret == -1) { + __socket_disconnect (this); + } + } + } + pthread_mutex_unlock (&priv->lock); + + xlator_notify (this->xl, GF_EVENT_POLLOUT, this); + + return ret; +} + + +int +__socket_proto_validate_header (transport_t *this, + struct socket_header *header, + size_t *size1_p, size_t *size2_p) +{ + size_t size1 = 0; + size_t size2 = 0; + + if (strcmp (header->colonO, ":O")) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "socket header signature does not match :O (%x.%x.%x)", + header->colonO[0], header->colonO[1], + header->colonO[2]); + return -1; + } + + if (header->version != 42) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "socket header version does not match 42 != %d", + header->version); + return -1; + } + + size1 = ntoh32 (header->size1); + size2 = ntoh32 (header->size2); + + if (size1 <= 0 || size1 > 1048576) 
{ + gf_log (this->xl->name, GF_LOG_DEBUG, + "socket header has incorrect size1=%"GF_PRI_SIZET, + size1); + return -1; + } + + if (size2 > (131072)) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "socket header has incorrect size2=%"GF_PRI_SIZET, + size2); + return -1; + } + + if (size1_p) + *size1_p = size1; + + if (size2_p) + *size2_p = size2; + + return 0; +} + + + +/* socket protocol state machine */ + +int +__socket_proto_state_machine (transport_t *this) +{ + int ret = -1; + socket_private_t *priv = NULL; + size_t size1 = 0; + size_t size2 = 0; + int previous_state = -1; + struct socket_header *hdr = NULL; + struct iobuf *iobuf = NULL; + + + priv = this->private; + + while (priv->incoming.state != SOCKET_PROTO_STATE_COMPLETE) { + /* debug check against infinite loops */ + if (previous_state == priv->incoming.state) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "state did not change! (%d) breaking", + previous_state); + ret = -1; + goto unlock; + } + previous_state = priv->incoming.state; + + switch (priv->incoming.state) { + + case SOCKET_PROTO_STATE_NADA: + priv->incoming.pending_vector = + priv->incoming.vector; + + priv->incoming.pending_vector->iov_base = + &priv->incoming.header; + + priv->incoming.pending_vector->iov_len = + sizeof (struct socket_header); + + priv->incoming.state = + SOCKET_PROTO_STATE_HEADER_COMING; + break; + + case SOCKET_PROTO_STATE_HEADER_COMING: + + ret = __socket_readv (this, + priv->incoming.pending_vector, 1, + &priv->incoming.pending_vector, + NULL); + if (ret == 0) { + priv->incoming.state = + SOCKET_PROTO_STATE_HEADER_CAME; + break; + } + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_TRACE, + "read (%s) in state %d (%s)", + strerror (errno), + SOCKET_PROTO_STATE_HEADER_COMING, + this->peerinfo.identifier); + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_TRACE, + "partial header read on NB socket."); + goto unlock; + } + break; + + case SOCKET_PROTO_STATE_HEADER_CAME: + hdr = &priv->incoming.header; + 
ret = __socket_proto_validate_header (this, hdr, + &size1, &size2); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket header validate failed (%s). " + "possible mismatch of transport-type " + "between server and client volumes, " + "or version mismatch", + this->peerinfo.identifier); + goto unlock; + } + + priv->incoming.hdrlen = size1; + priv->incoming.buflen = size2; + + /* TODO: use mem-pool */ + priv->incoming.hdr_p = GF_MALLOC (size1, + gf_common_mt_char); + if (size2) { + /* TODO: sanity check size2 < page size + */ + iobuf = iobuf_get (this->xl->ctx->iobuf_pool); + if (!iobuf) { + gf_log (this->xl->name, GF_LOG_ERROR, + "unable to allocate IO buffer " + "for peer %s", + this->peerinfo.identifier); + ret = -ENOMEM; + goto unlock; + } + priv->incoming.iobuf = iobuf; + priv->incoming.buf_p = iobuf->ptr; + } + + priv->incoming.vector[0].iov_base = + priv->incoming.hdr_p; + + priv->incoming.vector[0].iov_len = size1; + + priv->incoming.vector[1].iov_base = + priv->incoming.buf_p; + + priv->incoming.vector[1].iov_len = size2; + priv->incoming.count = size2 ? 
2 : 1; + + priv->incoming.pending_vector = + priv->incoming.vector; + + priv->incoming.pending_count = + priv->incoming.count; + + priv->incoming.state = + SOCKET_PROTO_STATE_DATA_COMING; + break; + + case SOCKET_PROTO_STATE_DATA_COMING: + + ret = __socket_readv (this, + priv->incoming.pending_vector, + priv->incoming.pending_count, + &priv->incoming.pending_vector, + &priv->incoming.pending_count); + if (ret == 0) { + priv->incoming.state = + SOCKET_PROTO_STATE_DATA_CAME; + break; + } + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "read (%s) in state %d (%s)", + strerror (errno), + SOCKET_PROTO_STATE_DATA_COMING, + this->peerinfo.identifier); + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_TRACE, + "partial data read on NB socket"); + goto unlock; + } + break; + + case SOCKET_PROTO_STATE_DATA_CAME: + memset (&priv->incoming.vector, 0, + sizeof (priv->incoming.vector)); + priv->incoming.pending_vector = NULL; + priv->incoming.pending_count = 0; + priv->incoming.state = SOCKET_PROTO_STATE_COMPLETE; + break; + + case SOCKET_PROTO_STATE_COMPLETE: + /* not reached */ + break; + + default: + gf_log (this->xl->name, GF_LOG_DEBUG, + "undefined state reached: %d", + priv->incoming.state); + goto unlock; + } + } +unlock: + + return ret; +} + + +int +socket_proto_state_machine (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = 0; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + ret = __socket_proto_state_machine (this); + } + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +int +socket_event_poll_in (transport_t *this) +{ + int ret = -1; + + ret = socket_proto_state_machine (this); + + /* call POLLIN on xlator even if complete block is not received, + just to keep the last_received timestamp ticking */ + + if (ret == 0) + ret = xlator_notify (this->xl, GF_EVENT_POLLIN, this); + + return ret; +} + + +int +socket_connect_finish (transport_t *this) +{ + int ret = -1; + socket_private_t *priv 
= NULL; + int event = -1; + char notify_xlator = 0; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + if (priv->connected) + goto unlock; + + ret = __socket_connect_finish (priv->sock); + + if (ret == -1 && errno == EINPROGRESS) + ret = 1; + + if (ret == -1 && errno != EINPROGRESS) { + if (!priv->connect_finish_log) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connection to %s failed (%s)", + this->peerinfo.identifier, + strerror (errno)); + priv->connect_finish_log = 1; + } + __socket_disconnect (this); + notify_xlator = 1; + event = GF_EVENT_POLLERR; + goto unlock; + } + + if (ret == 0) { + notify_xlator = 1; + + this->myinfo.sockaddr_len = + sizeof (this->myinfo.sockaddr); + + ret = getsockname (priv->sock, + SA (&this->myinfo.sockaddr), + &this->myinfo.sockaddr_len); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "getsockname on (%d) failed (%s)", + priv->sock, strerror (errno)); + __socket_disconnect (this); + event = GF_EVENT_POLLERR; + goto unlock; + } + + priv->connected = 1; + priv->connect_finish_log = 0; + event = GF_EVENT_CHILD_UP; + get_transport_identifiers (this); + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + if (notify_xlator) + xlator_notify (this->xl, event, this); + + return 0; +} + + +int +socket_event_handler (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err) +{ + transport_t *this = NULL; + socket_private_t *priv = NULL; + int ret = 0; + + this = data; + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + priv->idx = idx; + } + pthread_mutex_unlock (&priv->lock); + + if (!priv->connected) { + ret = socket_connect_finish (this); + } + + if (!ret && poll_out) { + ret = socket_event_poll_out (this); + } + + if (!ret && poll_in) { + ret = socket_event_poll_in (this); + } + + if (ret < 0 || poll_err) { + socket_event_poll_err (this); + transport_unref (this); + } + + return 0; +} + + +int +socket_server_event_handler (int fd, int idx, void *data, + int poll_in, int 
poll_out, int poll_err) +{ + transport_t *this = NULL; + socket_private_t *priv = NULL; + int ret = 0; + int new_sock = -1; + transport_t *new_trans = NULL; + struct sockaddr_storage new_sockaddr = {0, }; + socklen_t addrlen = sizeof (new_sockaddr); + socket_private_t *new_priv = NULL; + glusterfs_ctx_t *ctx = NULL; + + this = data; + priv = this->private; + ctx = this->xl->ctx; + + pthread_mutex_lock (&priv->lock); + { + priv->idx = idx; + + if (poll_in) { + new_sock = accept (priv->sock, SA (&new_sockaddr), + &addrlen); + + if (new_sock == -1) + goto unlock; + + if (!priv->bio) { + ret = __socket_nonblock (new_sock); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "NBIO on %d failed (%s)", + new_sock, strerror (errno)); + close (new_sock); + goto unlock; + } + } + + if (priv->nodelay) { + ret = __socket_nodelay (new_sock); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setsockopt() failed for " + "NODELAY (%s)", + strerror (errno)); + } + } + + new_trans = GF_CALLOC (1, sizeof (*new_trans), + gf_common_mt_transport_t); + new_trans->xl = this->xl; + new_trans->fini = this->fini; + + memcpy (&new_trans->peerinfo.sockaddr, &new_sockaddr, + addrlen); + new_trans->peerinfo.sockaddr_len = addrlen; + + new_trans->myinfo.sockaddr_len = + sizeof (new_trans->myinfo.sockaddr); + + ret = getsockname (new_sock, + SA (&new_trans->myinfo.sockaddr), + &new_trans->myinfo.sockaddr_len); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "getsockname on %d failed (%s)", + new_sock, strerror (errno)); + close (new_sock); + goto unlock; + } + + get_transport_identifiers (new_trans); + socket_init (new_trans); + new_trans->ops = this->ops; + new_trans->init = this->init; + new_trans->fini = this->fini; + + new_priv = new_trans->private; + + pthread_mutex_lock (&new_priv->lock); + { + new_priv->sock = new_sock; + new_priv->connected = 1; + + transport_ref (new_trans); + new_priv->idx = + event_register (ctx->event_pool, + new_sock, + 
socket_event_handler, + new_trans, 1, 0); + + if (new_priv->idx == -1) + ret = -1; + } + pthread_mutex_unlock (&new_priv->lock); + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +int +socket_disconnect (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + ret = __socket_disconnect (this); + } + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +int +socket_connect (transport_t *this) +{ + int ret = -1; + int sock = -1; + socket_private_t *priv = NULL; + struct sockaddr_storage sockaddr = {0, }; + socklen_t sockaddr_len = 0; + glusterfs_ctx_t *ctx = NULL; + sa_family_t sa_family = {0, }; + + priv = this->private; + ctx = this->xl->ctx; + + if (!priv) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "connect() called on uninitialized transport"); + goto err; + } + + pthread_mutex_lock (&priv->lock); + { + sock = priv->sock; + } + pthread_mutex_unlock (&priv->lock); + + if (sock != -1) { + gf_log (this->xl->name, GF_LOG_TRACE, + "connect () called on transport already connected"); + ret = 0; + goto err; + } + + ret = socket_client_get_remote_sockaddr (this, SA (&sockaddr), + &sockaddr_len, &sa_family); + if (ret == -1) { + /* logged inside client_get_remote_sockaddr */ + goto err; + } + + pthread_mutex_lock (&priv->lock); + { + if (priv->sock != -1) { + gf_log (this->xl->name, GF_LOG_TRACE, + "connect() -- already connected"); + goto unlock; + } + + memcpy (&this->peerinfo.sockaddr, &sockaddr, sockaddr_len); + this->peerinfo.sockaddr_len = sockaddr_len; + + priv->sock = socket (sa_family, SOCK_STREAM, 0); + if (priv->sock == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket creation failed (%s)", + strerror (errno)); + goto unlock; + } + + /* Cant help if setting socket options fails. We can continue + * working nonetheless. 
+ */ + if (setsockopt (priv->sock, SOL_SOCKET, SO_RCVBUF, + &priv->windowsize, + sizeof (priv->windowsize)) < 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setting receive window size failed: %d: %d: " + "%s", priv->sock, priv->windowsize, + strerror (errno)); + } + + if (setsockopt (priv->sock, SOL_SOCKET, SO_SNDBUF, + &priv->windowsize, + sizeof (priv->windowsize)) < 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setting send window size failed: %d: %d: " + "%s", priv->sock, priv->windowsize, + strerror (errno)); + } + + + if (priv->nodelay && priv->lowlat) { + ret = __socket_nodelay (priv->sock); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setsockopt() failed for NODELAY (%s)", + strerror (errno)); + } + } + + if (!priv->bio) { + ret = __socket_nonblock (priv->sock); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "NBIO on %d failed (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + } + + SA (&this->myinfo.sockaddr)->sa_family = + SA (&this->peerinfo.sockaddr)->sa_family; + + ret = client_bind (this, SA (&this->myinfo.sockaddr), + &this->myinfo.sockaddr_len, priv->sock); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_WARNING, + "client bind failed: %s", strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + ret = connect (priv->sock, SA (&this->peerinfo.sockaddr), + this->peerinfo.sockaddr_len); + + if (ret == -1 && errno != EINPROGRESS) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connection attempt failed (%s)", + strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + priv->connected = 0; + + transport_ref (this); + + priv->idx = event_register (ctx->event_pool, priv->sock, + socket_event_handler, this, 1, 1); + if (priv->idx == -1) + ret = -1; + } +unlock: + pthread_mutex_unlock (&priv->lock); + +err: + return ret; +} + + +int +socket_listen (transport_t *this) +{ + socket_private_t * priv = NULL; + int ret = -1; 
+ int sock = -1; + struct sockaddr_storage sockaddr; + socklen_t sockaddr_len; + peer_info_t *myinfo = NULL; + glusterfs_ctx_t *ctx = NULL; + sa_family_t sa_family = {0, }; + + priv = this->private; + myinfo = &this->myinfo; + ctx = this->xl->ctx; + + pthread_mutex_lock (&priv->lock); + { + sock = priv->sock; + } + pthread_mutex_unlock (&priv->lock); + + if (sock != -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "alreading listening"); + return ret; + } + + ret = socket_server_get_local_sockaddr (this, SA (&sockaddr), + &sockaddr_len, &sa_family); + if (ret == -1) { + return ret; + } + + pthread_mutex_lock (&priv->lock); + { + if (priv->sock != -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "already listening"); + goto unlock; + } + + memcpy (&myinfo->sockaddr, &sockaddr, sockaddr_len); + myinfo->sockaddr_len = sockaddr_len; + + priv->sock = socket (sa_family, SOCK_STREAM, 0); + + if (priv->sock == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket creation failed (%s)", + strerror (errno)); + goto unlock; + } + + /* Cant help if setting socket options fails. We can continue + * working nonetheless. 
+ */ + if (setsockopt (priv->sock, SOL_SOCKET, SO_RCVBUF, + &priv->windowsize, + sizeof (priv->windowsize)) < 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setting receive window size failed: %d: %d: " + "%s", priv->sock, priv->windowsize, + strerror (errno)); + } + + if (setsockopt (priv->sock, SOL_SOCKET, SO_SNDBUF, + &priv->windowsize, + sizeof (priv->windowsize)) < 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setting send window size failed: %d: %d: " + "%s", priv->sock, priv->windowsize, + strerror (errno)); + } + + if (priv->nodelay) { + ret = __socket_nodelay (priv->sock); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setsockopt() failed for NODELAY (%s)", + strerror (errno)); + } + } + + if (!priv->bio) { + ret = __socket_nonblock (priv->sock); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "NBIO on %d failed (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + } + + ret = __socket_server_bind (this); + + if (ret == -1) { + /* logged inside __socket_server_bind() */ + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + ret = listen (priv->sock, 10); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "could not set socket %d to listen mode (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + transport_ref (this); + + priv->idx = event_register (ctx->event_pool, priv->sock, + socket_server_event_handler, + this, 1, 0); + + if (priv->idx == -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "could not register socket %d with events", + priv->sock); + ret = -1; + close (priv->sock); + priv->sock = -1; + goto unlock; + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +int +socket_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, + struct iobuf **iobuf_p) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { 
+ if (priv->connected != 1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "socket not connected to receive"); + goto unlock; + } + + if (!hdr_p || !hdrlen_p || !iobuf_p) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "bad parameters %p %p %p", + hdr_p, hdrlen_p, iobuf_p); + goto unlock; + } + + if (priv->incoming.state == SOCKET_PROTO_STATE_COMPLETE) { + *hdr_p = priv->incoming.hdr_p; + *hdrlen_p = priv->incoming.hdrlen; + *iobuf_p = priv->incoming.iobuf; + + memset (&priv->incoming, 0, sizeof (priv->incoming)); + priv->incoming.state = SOCKET_PROTO_STATE_NADA; + + ret = 0; + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +/* TODO: implement per transfer limit */ +int +socket_submit (transport_t *this, char *buf, int len, + struct iovec *vector, int count, + struct iobref *iobref) +{ + socket_private_t *priv = NULL; + int ret = -1; + char need_poll_out = 0; + char need_append = 1; + struct ioq *entry = NULL; + glusterfs_ctx_t *ctx = NULL; + + priv = this->private; + ctx = this->xl->ctx; + + pthread_mutex_lock (&priv->lock); + { + if (priv->connected != 1) { + if (!priv->submit_log && !priv->connect_finish_log) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "not connected (priv->connected = %d)", + priv->connected); + priv->submit_log = 1; + } + goto unlock; + } + + priv->submit_log = 0; + entry = __socket_ioq_new (this, buf, len, vector, count, iobref); + if (!entry) + goto unlock; + + if (list_empty (&priv->ioq)) { + ret = __socket_ioq_churn_entry (this, entry); + + if (ret == 0) + need_append = 0; + + if (ret > 0) + need_poll_out = 1; + } + + if (need_append) { + list_add_tail (&entry->list, &priv->ioq); + ret = 0; + } + + if (need_poll_out) { + /* first entry to wait. 
continue writing on POLLOUT */ + priv->idx = event_select_on (ctx->event_pool, + priv->sock, + priv->idx, -1, 1); + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +struct transport_ops tops = { + .listen = socket_listen, + .connect = socket_connect, + .disconnect = socket_disconnect, + .submit = socket_submit, + .receive = socket_receive +}; + + +int +socket_init (transport_t *this) +{ + socket_private_t *priv = NULL; + gf_boolean_t tmp_bool = 0; + uint64_t windowsize = GF_DEFAULT_SOCKET_WINDOW_SIZE; + char *optstr = NULL; + + if (this->private) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "double init attempted"); + return -1; + } + + priv = GF_CALLOC (1, sizeof (*priv), + gf_common_mt_socket_private_t); + if (!priv) { + gf_log (this->xl->name, GF_LOG_ERROR, + "calloc (1, %"GF_PRI_SIZET") returned NULL", + sizeof (*priv)); + return -1; + } + + pthread_mutex_init (&priv->lock, NULL); + + priv->sock = -1; + priv->idx = -1; + priv->connected = -1; + + INIT_LIST_HEAD (&priv->ioq); + + if (dict_get (this->xl->options, "non-blocking-io")) { + optstr = data_to_str (dict_get (this->xl->options, + "non-blocking-io")); + + if (gf_string2boolean (optstr, &tmp_bool) == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "'non-blocking-io' takes only boolean options," + " not taking any action"); + tmp_bool = 1; + } + priv->bio = 0; + if (!tmp_bool) { + priv->bio = 1; + gf_log (this->xl->name, GF_LOG_WARNING, + "disabling non-blocking IO"); + } + } + + optstr = NULL; + + // By default, we enable NODELAY + priv->nodelay = 1; + if (dict_get (this->xl->options, "transport.socket.nodelay")) { + optstr = data_to_str (dict_get (this->xl->options, + "transport.socket.nodelay")); + + if (gf_string2boolean (optstr, &tmp_bool) == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "'transport.socket.nodelay' takes only " + "boolean options, not taking any action"); + tmp_bool = 1; + } + if (!tmp_bool) { + priv->nodelay = 0; + gf_log (this->xl->name, GF_LOG_DEBUG, + 
"disabling nodelay"); + } + } + + + optstr = NULL; + if (dict_get_str (this->xl->options, "transport.window-size", + &optstr) == 0) { + if (gf_string2bytesize (optstr, &windowsize) != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "invalid number format: %s", optstr); + return -1; + } + } + + optstr = NULL; + + if (dict_get_str (this->xl->options, "transport.socket.lowlat", + &optstr) == 0) { + priv->lowlat = 1; + } + + priv->windowsize = (int)windowsize; + this->private = priv; + + return 0; +} + + +void +fini (transport_t *this) +{ + socket_private_t *priv = this->private; + + gf_log (this->xl->name, GF_LOG_TRACE, + "transport %p destroyed", this); + + pthread_mutex_destroy (&priv->lock); + GF_FREE (priv); +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_common_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } + + return ret; +} + +int32_t +init (transport_t *this) +{ + int ret = -1; + + ret = socket_init (this); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, "socket_init() failed"); + } + + return ret; +} + +struct volume_options options[] = { + { .key = {"remote-port", + "transport.remote-port", + "transport.socket.remote-port"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {"transport.socket.listen-port", "listen-port"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {"transport.socket.bind-address", "bind-address" }, + .type = GF_OPTION_TYPE_INTERNET_ADDRESS + }, + { .key = {"transport.socket.connect-path", "connect-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.socket.bind-path", "bind-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.socket.listen-path", "listen-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "transport.address-family", + "address-family" }, + .value = {"inet", "inet6", "inet/inet6", "inet6/inet", + "unix", "inet-sdp" }, + .type = 
GF_OPTION_TYPE_STR + }, + + { .key = {"non-blocking-io"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"transport.window-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = GF_MIN_SOCKET_WINDOW_SIZE, + .max = GF_MAX_SOCKET_WINDOW_SIZE, + }, + { .key = {"transport.socket.nodelay"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"transport.socket.lowlat"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {NULL} } +}; + diff --git a/xlators/protocol/transport/socket/src/socket.h b/xlators/protocol/transport/socket/src/socket.h new file mode 100644 index 00000000..bc6d3b27 --- /dev/null +++ b/xlators/protocol/transport/socket/src/socket.h @@ -0,0 +1,125 @@ +/* + Copyright (c) 2006-2009 Gluster, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _SOCKET_H +#define _SOCKET_H + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "event.h" +#include "transport.h" +#include "logging.h" +#include "dict.h" +#include "mem-pool.h" +#include "socket-mem-types.h" + +#ifndef MAX_IOVEC +#define MAX_IOVEC 16 +#endif /* MAX_IOVEC */ + +#define GF_DEFAULT_SOCKET_LISTEN_PORT 6996 + +/* This is the size set through setsockopt for + * both the TCP receive window size and the + * send buffer size. + * Till the time iobuf size becomes configurable, this size is set to include + * two iobufs + the GlusterFS protocol headers. 
+ * Linux allows us to over-ride the max values for the system. + * Should we over-ride them? Because if we set a value larger than the default + * setsockopt will fail. Having larger values might be beneficial for + * IB links. + */ +#define GF_DEFAULT_SOCKET_WINDOW_SIZE (512 * GF_UNIT_KB) +#define GF_MAX_SOCKET_WINDOW_SIZE (1 * GF_UNIT_MB) +#define GF_MIN_SOCKET_WINDOW_SIZE (128 * GF_UNIT_KB) + +typedef enum { + SOCKET_PROTO_STATE_NADA = 0, + SOCKET_PROTO_STATE_HEADER_COMING, + SOCKET_PROTO_STATE_HEADER_CAME, + SOCKET_PROTO_STATE_DATA_COMING, + SOCKET_PROTO_STATE_DATA_CAME, + SOCKET_PROTO_STATE_COMPLETE, +} socket_proto_state_t; + +struct socket_header { + char colonO[3]; + uint32_t size1; + uint32_t size2; + char version; +} __attribute__((packed)); + + +struct ioq { + union { + struct list_head list; + struct { + struct ioq *next; + struct ioq *prev; + }; + }; + struct socket_header header; + struct iovec vector[MAX_IOVEC]; + int count; + struct iovec *pending_vector; + int pending_count; + char *buf; + struct iobref *iobref; +}; + + +typedef struct { + int32_t sock; + int32_t idx; + unsigned char connected; // -1 = not connected. 0 = in progress. 1 = connected + char bio; + char connect_finish_log; + char submit_log; + union { + struct list_head ioq; + struct { + struct ioq *ioq_next; + struct ioq *ioq_prev; + }; + }; + struct { + int state; + struct socket_header header; + char *hdr_p; + size_t hdrlen; + struct iobuf *iobuf; + char *buf_p; + size_t buflen; + struct iovec vector[2]; + int count; + struct iovec *pending_vector; + int pending_count; + } incoming; + pthread_mutex_t lock; + int windowsize; + char lowlat; + char nodelay; +} socket_private_t; + + +#endif -- cgit