diff options
author | Raghavendra G <raghavendra@gluster.com> | 2010-08-30 08:03:52 +0000 |
---|---|---|
committer | Vijay Bellur <vijay@dev.gluster.com> | 2010-08-30 06:54:12 -0700 |
commit | 300b4fefcbb5a5ced4f0554e109679e44cdf44ea (patch) | |
tree | c77b64f94272188b8f2ce81e2f300222630ebdbe /rpc/rpc-transport/rdma/src/rdma.h | |
parent | 55bbf23b3a608f67b7a05939f4205049e92d081a (diff) |
Bring in new transport rdma.
- rdma is a new transport and an improvement over the current ib-verbs transport. It uses
RDMA read and RDMA write operations to transfer large buffers efficiently.
For more details, please refer to RFC 5666 and RFC 5667.
Signed-off-by: Raghavendra G <raghavendra@gluster.com>
Signed-off-by: Vijay Bellur <vijay@dev.gluster.com>
BUG: 513 (Introduce 0 copy rdma)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=513
Diffstat (limited to 'rpc/rpc-transport/rdma/src/rdma.h')
-rw-r--r-- | rpc/rpc-transport/rdma/src/rdma.h | 395 |
1 files changed, 395 insertions, 0 deletions
diff --git a/rpc/rpc-transport/rdma/src/rdma.h b/rpc/rpc-transport/rdma/src/rdma.h new file mode 100644 index 000000000..470ee7206 --- /dev/null +++ b/rpc/rpc-transport/rdma/src/rdma.h @@ -0,0 +1,395 @@ +/* + Copyright (c) 2006-2009 Gluster, Inc. <http://www.gluster.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _XPORT_RDMA_H +#define _XPORT_RDMA_H + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef MAX_IOVEC +#define MAX_IOVEC 16 +#endif /* MAX_IOVEC */ + +#include "rpc-clnt.h" +#include "rpc-transport.h" +#include "xlator.h" +#include "event.h" +#include <stdio.h> +#include <list.h> +#include <arpa/inet.h> +#include <infiniband/verbs.h> + +/* FIXME: give appropriate values to these macros */ +#define GF_DEFAULT_RDMA_LISTEN_PORT 6997 +#define RDMA_MAX_SEGMENTS 8 +#define RDMA_MAX_HEADER_SIZE (sizeof (rdma_header_t) \ + + RDMA_MAX_SEGMENTS \ + * sizeof (rdma_read_chunk_t)) +#define RDMA_INLINE_THRESHOLD (1024 * 128) +#define RDMA_VERSION 1 +#define RDMA_POOL_SIZE 512 + +typedef enum rdma_errcode { + ERR_VERS = 1, + ERR_CHUNK = 2 +}rdma_errcode_t; + +struct rdma_err_vers { + uint32_t rdma_vers_low; /* Version range supported by peer */ + uint32_t rdma_vers_high; +}__attribute__ ((packed)); +typedef struct rdma_err_vers rdma_err_vers_t; + +typedef enum rdma_proc { + RDMA_MSG = 0, /* An RPC call or reply msg */ + 
RDMA_NOMSG = 1, /* An RPC call or reply msg - separate body */ + RDMA_MSGP = 2, /* An RPC call or reply msg with padding */ + RDMA_DONE = 3, /* Client signals reply completion */ + RDMA_ERROR = 4 /* An RPC RDMA encoding error */ +}rdma_proc_t; + +typedef enum rdma_chunktype { + rdma_noch = 0, /* no chunk */ + rdma_readch, /* some argument through rdma read */ + rdma_areadch, /* entire request through rdma read */ + rdma_writech, /* some result through rdma write */ + rdma_replych /* entire reply through rdma write */ +}rdma_chunktype_t; + +struct __rdma_header { + uint32_t rm_xid; /* Mirrors the RPC header xid */ + uint32_t rm_vers; /* Version of this protocol */ + uint32_t rm_credit; /* Buffers requested/granted */ + uint32_t rm_type; /* Type of message (enum rdma_proc) */ + union { + struct { /* no chunks */ + uint32_t rm_empty[3]; /* 3 empty chunk lists */ + }__attribute__((packed)) rm_nochunks; + + struct { /* no chunks and padded */ + uint32_t rm_align; /* Padding alignment */ + uint32_t rm_thresh; /* Padding threshold */ + uint32_t rm_pempty[3]; /* 3 empty chunk lists */ + }__attribute__((packed)) rm_padded; + + struct { + uint32_t rm_type; + rdma_err_vers_t rm_version; + }__attribute__ ((packed)) rm_error; + + uint32_t rm_chunks[0]; /* read, write and reply chunks */ + }__attribute__ ((packed)) rm_body; +} __attribute__((packed)); +typedef struct __rdma_header rdma_header_t; + +struct __rdma_segment { + uint32_t rs_handle; /* Registered memory handle */ + uint32_t rs_length; /* Length of the chunk in bytes */ + uint64_t rs_offset; /* Chunk virtual address or offset */ +} __attribute__((packed)); +typedef struct __rdma_segment rdma_segment_t; + +/* read chunk(s), encoded as a linked list. 
*/ +struct __rdma_read_chunk { + uint32_t rc_discrim; /* 1 indicates presence */ + uint32_t rc_position; /* Position in XDR stream */ + rdma_segment_t rc_target; +} __attribute__((packed)); +typedef struct __rdma_read_chunk rdma_read_chunk_t; + +/* write chunk, and reply chunk. */ +struct __rdma_write_chunk { + rdma_segment_t wc_target; +} __attribute__((packed)); +typedef struct __rdma_write_chunk rdma_write_chunk_t; + +/* write chunk(s), encoded as a counted array. */ +struct __rdma_write_array { + uint32_t wc_discrim; /* 1 indicates presence */ + uint32_t wc_nchunks; /* Array count */ + struct __rdma_write_chunk wc_array[0]; +} __attribute__((packed)); +typedef struct __rdma_write_array rdma_write_array_t; + +/* options per transport end point */ +struct __rdma_options { + int32_t port; + char *device_name; + enum ibv_mtu mtu; + int32_t send_count; + int32_t recv_count; + uint64_t recv_size; + uint64_t send_size; +}; +typedef struct __rdma_options rdma_options_t; + +struct __rdma_reply_info { + uint32_t rm_xid; /* xid in network endian */ + rdma_chunktype_t type; /* + * can be either rdma_replych + * or rdma_writech. + */ + rdma_write_array_t *wc_array; + struct mem_pool *pool; +}; +typedef struct __rdma_reply_info rdma_reply_info_t; + +struct __rdma_ioq { + union { + struct list_head list; + struct { + struct __rdma_ioq *next; + struct __rdma_ioq *prev; + }; + }; + + char is_request; + struct iovec rpchdr[MAX_IOVEC]; + int rpchdr_count; + struct iovec proghdr[MAX_IOVEC]; + int proghdr_count; + struct iovec prog_payload[MAX_IOVEC]; + int prog_payload_count; + + struct iobref *iobref; + + union { + struct __rdma_ioq_request { + /* used to build reply_chunk for RDMA_NOMSG type msgs */ + struct iovec rsphdr_vec[MAX_IOVEC]; + int rsphdr_count; + + /* + * used to build write_array during operations like + * read. + */ + struct iovec rsp_payload[MAX_IOVEC]; + int rsp_payload_count; + + struct rpc_req *rpc_req; /* FIXME: hack! hack! 
should be + * cleaned up later + */ + struct iobref *rsp_iobref; + }request; + + rdma_reply_info_t *reply_info; + }msg; + + struct mem_pool *pool; +}; +typedef struct __rdma_ioq rdma_ioq_t; + +typedef enum __rdma_send_post_type { + RDMA_SEND_POST_NO_CHUNKLIST, /* post which is sent using rdma-send + * and the msg carries no + * chunklists. + */ + RDMA_SEND_POST_READ_CHUNKLIST, /* post which is sent using rdma-send + * and the msg carries only read + * chunklist. + */ + RDMA_SEND_POST_WRITE_CHUNKLIST, /* post which is sent using + * rdma-send and the msg carries + * only write chunklist. + */ + RDMA_SEND_POST_READ_WRITE_CHUNKLIST, /* post which is sent using + * rdma-send and the msg + * carries both read and + * write chunklists. + */ + RDMA_SEND_POST_RDMA_READ, /* RDMA read */ + RDMA_SEND_POST_RDMA_WRITE, /* RDMA write */ +}rdma_send_post_type_t; + +/* represents one communication peer, two per transport_t */ +struct __rdma_peer { + rpc_transport_t *trans; + struct ibv_qp *qp; + + int32_t recv_count; + int32_t send_count; + int32_t recv_size; + int32_t send_size; + + int32_t quota; + union { + struct list_head ioq; + struct { + rdma_ioq_t *ioq_next; + rdma_ioq_t *ioq_prev; + }; + }; + + /* QP attributes, needed to connect with remote QP */ + int32_t local_lid; + int32_t local_psn; + int32_t local_qpn; + int32_t remote_lid; + int32_t remote_psn; + int32_t remote_qpn; +}; +typedef struct __rdma_peer rdma_peer_t; + +struct __rdma_post_context { + struct ibv_mr *mr[RDMA_MAX_SEGMENTS]; + int mr_count; + struct iovec vector[MAX_IOVEC]; + int count; + struct iobref *iobref; + char is_request; + rdma_reply_info_t *reply_info; +}; +typedef struct __rdma_post_context rdma_post_context_t; + +typedef enum { + RDMA_SEND_POST, + RDMA_RECV_POST +} rdma_post_type_t; + +struct __rdma_post { + struct __rdma_post *next, *prev; + struct ibv_mr *mr; + char *buf; + int32_t buf_size; + char aux; + int32_t reused; + struct __rdma_device *device; + rdma_post_type_t type; + 
rdma_post_context_t ctx; + int refcount; + pthread_mutex_t lock; +}; +typedef struct __rdma_post rdma_post_t; + +struct __rdma_queue { + rdma_post_t active_posts, passive_posts; + int32_t active_count, passive_count; + pthread_mutex_t lock; +}; +typedef struct __rdma_queue rdma_queue_t; + +struct __rdma_qpreg { + pthread_mutex_t lock; + int32_t count; + struct _qpent { + struct _qpent *next, *prev; + int32_t qp_num; + rdma_peer_t *peer; + } ents[42]; +}; +typedef struct __rdma_qpreg rdma_qpreg_t; + +/* context per device, stored in global glusterfs_ctx_t->ib */ +struct __rdma_device { + struct __rdma_device *next; + const char *device_name; + struct ibv_context *context; + int32_t port; + struct ibv_pd *pd; + struct ibv_srq *srq; + rdma_qpreg_t qpreg; + struct ibv_comp_channel *send_chan, *recv_chan; + struct ibv_cq *send_cq, *recv_cq; + rdma_queue_t sendq, recvq; + pthread_t send_thread, recv_thread; +}; +typedef struct __rdma_device rdma_device_t; + +typedef enum { + RDMA_HANDSHAKE_START = 0, + RDMA_HANDSHAKE_SENDING_DATA, + RDMA_HANDSHAKE_RECEIVING_DATA, + RDMA_HANDSHAKE_SENT_DATA, + RDMA_HANDSHAKE_RECEIVED_DATA, + RDMA_HANDSHAKE_SENDING_ACK, + RDMA_HANDSHAKE_RECEIVING_ACK, + RDMA_HANDSHAKE_RECEIVED_ACK, + RDMA_HANDSHAKE_COMPLETE, +} rdma_handshake_state_t; + +struct rdma_nbio { + int state; + char *buf; + int count; + struct iovec vector; + struct iovec *pending_vector; + int pending_count; +}; + +struct __rdma_request_context { + struct ibv_mr *mr[RDMA_MAX_SEGMENTS]; + int mr_count; + struct mem_pool *pool; + rdma_peer_t *peer; + struct iobref *iobref; + struct iobref *rsp_iobref; +}; +typedef struct __rdma_request_context rdma_request_context_t; + +struct __rdma_private { + int32_t sock; + int32_t idx; + unsigned char connected; + unsigned char tcp_connected; + unsigned char ib_connected; + in_addr_t addr; + unsigned short port; + + /* IB Verbs Driver specific variables, pointers */ + rdma_peer_t peer; + struct __rdma_device *device; + rdma_options_t options; 
+ + /* Used by trans->op->receive */ + char *data_ptr; + int32_t data_offset; + int32_t data_len; + + /* Mutex */ + pthread_mutex_t read_mutex; + pthread_mutex_t write_mutex; + pthread_barrier_t handshake_barrier; + char handshake_ret; + char is_server; + rpc_transport_t *listener; + + pthread_mutex_t recv_mutex; + pthread_cond_t recv_cond; + + struct mem_pool *request_ctx_pool; + struct mem_pool *ioq_pool; + struct mem_pool *reply_info_pool; + + /* used during rdma_handshake */ + struct { + struct rdma_nbio incoming; + struct rdma_nbio outgoing; + int state; + rdma_header_t header; + char *buf; + size_t size; + } handshake; +}; +typedef struct __rdma_private rdma_private_t; + +#endif /* _XPORT_RDMA_H */ |