summaryrefslogtreecommitdiffstats
path: root/rpc/rpc-transport/rdma/src/rdma.h
diff options
context:
space:
mode:
authorRaghavendra G <raghavendra@gluster.com>2010-08-30 08:03:52 +0000
committerVijay Bellur <vijay@dev.gluster.com>2010-08-30 06:54:12 -0700
commit300b4fefcbb5a5ced4f0554e109679e44cdf44ea (patch)
treec77b64f94272188b8f2ce81e2f300222630ebdbe /rpc/rpc-transport/rdma/src/rdma.h
parent55bbf23b3a608f67b7a05939f4205049e92d081a (diff)
Bring in new transport rdma.
- rdma is new transport and improvement over current ib-verbs. It uses rdma-read and rdma-write for efficiently transferring large buffers. For more details please refer to rfc-5666 and rfc-5667. Signed-off-by: Raghavendra G <raghavendra@gluster.com> Signed-off-by: Vijay Bellur <vijay@dev.gluster.com> BUG: 513 (Introduce 0 copy rdma) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=513
Diffstat (limited to 'rpc/rpc-transport/rdma/src/rdma.h')
-rw-r--r--rpc/rpc-transport/rdma/src/rdma.h395
1 files changed, 395 insertions, 0 deletions
diff --git a/rpc/rpc-transport/rdma/src/rdma.h b/rpc/rpc-transport/rdma/src/rdma.h
new file mode 100644
index 000000000..470ee7206
--- /dev/null
+++ b/rpc/rpc-transport/rdma/src/rdma.h
@@ -0,0 +1,395 @@
+/*
+ Copyright (c) 2006-2009 Gluster, Inc. <http://www.gluster.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _XPORT_RDMA_H
+#define _XPORT_RDMA_H
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef MAX_IOVEC
+#define MAX_IOVEC 16
+#endif /* MAX_IOVEC */
+
+#include "rpc-clnt.h"
+#include "rpc-transport.h"
+#include "xlator.h"
+#include "event.h"
+#include <stdio.h>
+#include <list.h>
+#include <arpa/inet.h>
+#include <infiniband/verbs.h>
+
+/* FIXME: give appropriate values to these macros */
+#define GF_DEFAULT_RDMA_LISTEN_PORT 6997
+#define RDMA_MAX_SEGMENTS 8
+#define RDMA_MAX_HEADER_SIZE (sizeof (rdma_header_t) \
+ + RDMA_MAX_SEGMENTS \
+ * sizeof (rdma_read_chunk_t))
+#define RDMA_INLINE_THRESHOLD (1024 * 128)
+#define RDMA_VERSION 1
+#define RDMA_POOL_SIZE 512
+
+typedef enum rdma_errcode {
+ ERR_VERS = 1,
+ ERR_CHUNK = 2
+}rdma_errcode_t;
+
+struct rdma_err_vers {
+ uint32_t rdma_vers_low; /* Version range supported by peer */
+ uint32_t rdma_vers_high;
+}__attribute__ ((packed));
+typedef struct rdma_err_vers rdma_err_vers_t;
+
+typedef enum rdma_proc {
+ RDMA_MSG = 0, /* An RPC call or reply msg */
+ RDMA_NOMSG = 1, /* An RPC call or reply msg - separate body */
+ RDMA_MSGP = 2, /* An RPC call or reply msg with padding */
+ RDMA_DONE = 3, /* Client signals reply completion */
+ RDMA_ERROR = 4 /* An RPC RDMA encoding error */
+}rdma_proc_t;
+
+typedef enum rdma_chunktype {
+ rdma_noch = 0, /* no chunk */
+ rdma_readch, /* some argument through rdma read */
+ rdma_areadch, /* entire request through rdma read */
+ rdma_writech, /* some result through rdma write */
+ rdma_replych /* entire reply through rdma write */
+}rdma_chunktype_t;
+
+struct __rdma_header {
+ uint32_t rm_xid; /* Mirrors the RPC header xid */
+ uint32_t rm_vers; /* Version of this protocol */
+ uint32_t rm_credit; /* Buffers requested/granted */
+ uint32_t rm_type; /* Type of message (enum rdma_proc) */
+ union {
+ struct { /* no chunks */
+ uint32_t rm_empty[3]; /* 3 empty chunk lists */
+ }__attribute__((packed)) rm_nochunks;
+
+ struct { /* no chunks and padded */
+ uint32_t rm_align; /* Padding alignment */
+ uint32_t rm_thresh; /* Padding threshold */
+ uint32_t rm_pempty[3]; /* 3 empty chunk lists */
+ }__attribute__((packed)) rm_padded;
+
+ struct {
+ uint32_t rm_type;
+ rdma_err_vers_t rm_version;
+ }__attribute__ ((packed)) rm_error;
+
+ uint32_t rm_chunks[0]; /* read, write and reply chunks */
+ }__attribute__ ((packed)) rm_body;
+} __attribute__((packed));
+typedef struct __rdma_header rdma_header_t;
+
+struct __rdma_segment {
+ uint32_t rs_handle; /* Registered memory handle */
+ uint32_t rs_length; /* Length of the chunk in bytes */
+ uint64_t rs_offset; /* Chunk virtual address or offset */
+} __attribute__((packed));
+typedef struct __rdma_segment rdma_segment_t;
+
+/* read chunk(s), encoded as a linked list. */
+struct __rdma_read_chunk {
+ uint32_t rc_discrim; /* 1 indicates presence */
+ uint32_t rc_position; /* Position in XDR stream */
+ rdma_segment_t rc_target;
+} __attribute__((packed));
+typedef struct __rdma_read_chunk rdma_read_chunk_t;
+
+/* write chunk, and reply chunk. */
+struct __rdma_write_chunk {
+ rdma_segment_t wc_target;
+} __attribute__((packed));
+typedef struct __rdma_write_chunk rdma_write_chunk_t;
+
+/* write chunk(s), encoded as a counted array. */
+struct __rdma_write_array {
+ uint32_t wc_discrim; /* 1 indicates presence */
+ uint32_t wc_nchunks; /* Array count */
+ struct __rdma_write_chunk wc_array[0];
+} __attribute__((packed));
+typedef struct __rdma_write_array rdma_write_array_t;
+
+/* options per transport end point */
+struct __rdma_options {
+ int32_t port;
+ char *device_name;
+ enum ibv_mtu mtu;
+ int32_t send_count;
+ int32_t recv_count;
+ uint64_t recv_size;
+ uint64_t send_size;
+};
+typedef struct __rdma_options rdma_options_t;
+
+struct __rdma_reply_info {
+ uint32_t rm_xid; /* xid in network endian */
+ rdma_chunktype_t type; /*
+ * can be either rdma_replych
+ * or rdma_writech.
+ */
+ rdma_write_array_t *wc_array;
+ struct mem_pool *pool;
+};
+typedef struct __rdma_reply_info rdma_reply_info_t;
+
+struct __rdma_ioq {
+ union {
+ struct list_head list;
+ struct {
+ struct __rdma_ioq *next;
+ struct __rdma_ioq *prev;
+ };
+ };
+
+ char is_request;
+ struct iovec rpchdr[MAX_IOVEC];
+ int rpchdr_count;
+ struct iovec proghdr[MAX_IOVEC];
+ int proghdr_count;
+ struct iovec prog_payload[MAX_IOVEC];
+ int prog_payload_count;
+
+ struct iobref *iobref;
+
+ union {
+ struct __rdma_ioq_request {
+ /* used to build reply_chunk for RDMA_NOMSG type msgs */
+ struct iovec rsphdr_vec[MAX_IOVEC];
+ int rsphdr_count;
+
+ /*
+ * used to build write_array during operations like
+ * read.
+ */
+ struct iovec rsp_payload[MAX_IOVEC];
+ int rsp_payload_count;
+
+ struct rpc_req *rpc_req; /* FIXME: hack! hack! should be
+ * cleaned up later
+ */
+ struct iobref *rsp_iobref;
+ }request;
+
+ rdma_reply_info_t *reply_info;
+ }msg;
+
+ struct mem_pool *pool;
+};
+typedef struct __rdma_ioq rdma_ioq_t;
+
+typedef enum __rdma_send_post_type {
+ RDMA_SEND_POST_NO_CHUNKLIST, /* post which is sent using rdma-send
+ * and the msg carries no
+ * chunklists.
+ */
+ RDMA_SEND_POST_READ_CHUNKLIST, /* post which is sent using rdma-send
+ * and the msg carries only read
+ * chunklist.
+ */
+ RDMA_SEND_POST_WRITE_CHUNKLIST, /* post which is sent using
+ * rdma-send and the msg carries
+ * only write chunklist.
+ */
+ RDMA_SEND_POST_READ_WRITE_CHUNKLIST, /* post which is sent using
+ * rdma-send and the msg
+ * carries both read and
+ * write chunklists.
+ */
+ RDMA_SEND_POST_RDMA_READ, /* RDMA read */
+ RDMA_SEND_POST_RDMA_WRITE, /* RDMA write */
+}rdma_send_post_type_t;
+
+/* represents one communication peer, two per transport_t */
+struct __rdma_peer {
+ rpc_transport_t *trans;
+ struct ibv_qp *qp;
+
+ int32_t recv_count;
+ int32_t send_count;
+ int32_t recv_size;
+ int32_t send_size;
+
+ int32_t quota;
+ union {
+ struct list_head ioq;
+ struct {
+ rdma_ioq_t *ioq_next;
+ rdma_ioq_t *ioq_prev;
+ };
+ };
+
+ /* QP attributes, needed to connect with remote QP */
+ int32_t local_lid;
+ int32_t local_psn;
+ int32_t local_qpn;
+ int32_t remote_lid;
+ int32_t remote_psn;
+ int32_t remote_qpn;
+};
+typedef struct __rdma_peer rdma_peer_t;
+
+struct __rdma_post_context {
+ struct ibv_mr *mr[RDMA_MAX_SEGMENTS];
+ int mr_count;
+ struct iovec vector[MAX_IOVEC];
+ int count;
+ struct iobref *iobref;
+ char is_request;
+ rdma_reply_info_t *reply_info;
+};
+typedef struct __rdma_post_context rdma_post_context_t;
+
+typedef enum {
+ RDMA_SEND_POST,
+ RDMA_RECV_POST
+} rdma_post_type_t;
+
+struct __rdma_post {
+ struct __rdma_post *next, *prev;
+ struct ibv_mr *mr;
+ char *buf;
+ int32_t buf_size;
+ char aux;
+ int32_t reused;
+ struct __rdma_device *device;
+ rdma_post_type_t type;
+ rdma_post_context_t ctx;
+ int refcount;
+ pthread_mutex_t lock;
+};
+typedef struct __rdma_post rdma_post_t;
+
+struct __rdma_queue {
+ rdma_post_t active_posts, passive_posts;
+ int32_t active_count, passive_count;
+ pthread_mutex_t lock;
+};
+typedef struct __rdma_queue rdma_queue_t;
+
+struct __rdma_qpreg {
+ pthread_mutex_t lock;
+ int32_t count;
+ struct _qpent {
+ struct _qpent *next, *prev;
+ int32_t qp_num;
+ rdma_peer_t *peer;
+ } ents[42];
+};
+typedef struct __rdma_qpreg rdma_qpreg_t;
+
+/* context per device, stored in global glusterfs_ctx_t->ib */
+struct __rdma_device {
+ struct __rdma_device *next;
+ const char *device_name;
+ struct ibv_context *context;
+ int32_t port;
+ struct ibv_pd *pd;
+ struct ibv_srq *srq;
+ rdma_qpreg_t qpreg;
+ struct ibv_comp_channel *send_chan, *recv_chan;
+ struct ibv_cq *send_cq, *recv_cq;
+ rdma_queue_t sendq, recvq;
+ pthread_t send_thread, recv_thread;
+};
+typedef struct __rdma_device rdma_device_t;
+
+typedef enum {
+ RDMA_HANDSHAKE_START = 0,
+ RDMA_HANDSHAKE_SENDING_DATA,
+ RDMA_HANDSHAKE_RECEIVING_DATA,
+ RDMA_HANDSHAKE_SENT_DATA,
+ RDMA_HANDSHAKE_RECEIVED_DATA,
+ RDMA_HANDSHAKE_SENDING_ACK,
+ RDMA_HANDSHAKE_RECEIVING_ACK,
+ RDMA_HANDSHAKE_RECEIVED_ACK,
+ RDMA_HANDSHAKE_COMPLETE,
+} rdma_handshake_state_t;
+
+struct rdma_nbio {
+ int state;
+ char *buf;
+ int count;
+ struct iovec vector;
+ struct iovec *pending_vector;
+ int pending_count;
+};
+
+struct __rdma_request_context {
+ struct ibv_mr *mr[RDMA_MAX_SEGMENTS];
+ int mr_count;
+ struct mem_pool *pool;
+ rdma_peer_t *peer;
+ struct iobref *iobref;
+ struct iobref *rsp_iobref;
+};
+typedef struct __rdma_request_context rdma_request_context_t;
+
+struct __rdma_private {
+ int32_t sock;
+ int32_t idx;
+ unsigned char connected;
+ unsigned char tcp_connected;
+ unsigned char ib_connected;
+ in_addr_t addr;
+ unsigned short port;
+
+ /* IB Verbs Driver specific variables, pointers */
+ rdma_peer_t peer;
+ struct __rdma_device *device;
+ rdma_options_t options;
+
+ /* Used by trans->op->receive */
+ char *data_ptr;
+ int32_t data_offset;
+ int32_t data_len;
+
+ /* Mutex */
+ pthread_mutex_t read_mutex;
+ pthread_mutex_t write_mutex;
+ pthread_barrier_t handshake_barrier;
+ char handshake_ret;
+ char is_server;
+ rpc_transport_t *listener;
+
+ pthread_mutex_t recv_mutex;
+ pthread_cond_t recv_cond;
+
+ struct mem_pool *request_ctx_pool;
+ struct mem_pool *ioq_pool;
+ struct mem_pool *reply_info_pool;
+
+ /* used during rdma_handshake */
+ struct {
+ struct rdma_nbio incoming;
+ struct rdma_nbio outgoing;
+ int state;
+ rdma_header_t header;
+ char *buf;
+ size_t size;
+ } handshake;
+};
+typedef struct __rdma_private rdma_private_t;
+
+#endif /* _XPORT_RDMA_H */