diff options
| author | Mohammed Rafi KC <rkavunga@redhat.com> | 2015-02-17 20:17:58 +0530 | 
|---|---|---|
| committer | Raghavendra Bhat <raghavendra@redhat.com> | 2015-03-27 04:34:08 -0700 | 
| commit | 7febb66a26f01c94f8e76bb90cf4edd7c6cc1421 (patch) | |
| tree | fc455167a14965ac54490af2acbc08ad480a5d6c | |
| parent | d21990e093d99d8adbacae1ba2c56ff7606e2c37 (diff) | |
rdma: pre-register iobuf_pool with rdma devices.
        Back port of: http://review.gluster.org/9506
Registering buffers with an rdma device is a time-consuming
operation, so performing registration in the I/O code path will
decrease performance.
Using pre-registered memory will give better performance,
i.e., register the iobuf_pool during rdma initialization. For a
dynamically created arena, we can register it with all the
devices.
Change-Id: Ic79183e2efd014c43faf5911fdb6d5cfbcee64ca
BUG: 1202212
Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
Reviewed-on: http://review.gluster.org/9506
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Tested-by: Raghavendra G <rgowdapp@redhat.com>
Reviewed-on: http://review.gluster.org/9889
Reviewed-by: Raghavendra Bhat <raghavendra@redhat.com>
| -rw-r--r-- | libglusterfs/src/iobuf.c | 38 | ||||
| -rw-r--r-- | libglusterfs/src/iobuf.h | 12 | ||||
| -rw-r--r-- | libglusterfs/src/mem-types.h | 1 | ||||
| -rw-r--r-- | rpc/rpc-transport/rdma/src/rdma.c | 200 | ||||
| -rw-r--r-- | rpc/rpc-transport/rdma/src/rdma.h | 10 | 
5 files changed, 240 insertions, 21 deletions
diff --git a/libglusterfs/src/iobuf.c b/libglusterfs/src/iobuf.c index 82ffe2dd8fd..f8f1860889b 100644 --- a/libglusterfs/src/iobuf.c +++ b/libglusterfs/src/iobuf.c @@ -50,6 +50,7 @@ gf_iobuf_get_arena_index (size_t page_size)          return i;  } +  size_t  gf_iobuf_get_pagesize (size_t page_size)  { @@ -138,10 +139,15 @@ out:  void -__iobuf_arena_destroy (struct iobuf_arena *iobuf_arena) +__iobuf_arena_destroy (struct iobuf_pool *iobuf_pool, +                       struct iobuf_arena *iobuf_arena)  {          GF_VALIDATE_OR_GOTO ("iobuf", iobuf_arena, out); +        if (iobuf_pool->rdma_deregistration) +                iobuf_pool->rdma_deregistration (iobuf_pool->mr_list, +                                                 iobuf_arena); +          __iobuf_arena_destroy_iobufs (iobuf_arena);          if (iobuf_arena->mem_base @@ -169,6 +175,7 @@ __iobuf_arena_alloc (struct iobuf_pool *iobuf_pool, size_t page_size,                  goto err;          INIT_LIST_HEAD (&iobuf_arena->list); +        INIT_LIST_HEAD (&iobuf_arena->all_list);          INIT_LIST_HEAD (&iobuf_arena->active.list);          INIT_LIST_HEAD (&iobuf_arena->passive.list);          iobuf_arena->iobuf_pool = iobuf_pool; @@ -188,6 +195,13 @@ __iobuf_arena_alloc (struct iobuf_pool *iobuf_pool, size_t page_size,                  goto err;          } +        if (iobuf_pool->rdma_registration) { +                iobuf_pool->rdma_registration (iobuf_pool->device, +                                               iobuf_arena); +        } + +        list_add_tail (&iobuf_arena->all_list, &iobuf_pool->all_arenas); +          __iobuf_arena_init_iobufs (iobuf_arena);          if (!iobuf_arena->iobufs) {                  gf_log (THIS->name, GF_LOG_ERROR, "init failed"); @@ -199,7 +213,7 @@ __iobuf_arena_alloc (struct iobuf_pool *iobuf_pool, size_t page_size,          return iobuf_arena;  err: -        __iobuf_arena_destroy (iobuf_arena); +        __iobuf_arena_destroy (iobuf_pool, iobuf_arena);  out:          
return NULL; @@ -258,8 +272,8 @@ __iobuf_pool_add_arena (struct iobuf_pool *iobuf_pool, size_t page_size,                  gf_log (THIS->name, GF_LOG_WARNING, "arena not found");                  return NULL;          } +        list_add (&iobuf_arena->list, &iobuf_pool->arenas[index]); -        list_add_tail (&iobuf_arena->list, &iobuf_pool->arenas[index]);          return iobuf_arena;  } @@ -299,7 +313,8 @@ iobuf_pool_destroy (struct iobuf_pool *iobuf_pool)                                            &iobuf_pool->arenas[i], list) {                          list_del_init (&iobuf_arena->list);                          iobuf_pool->arena_cnt--; -                        __iobuf_arena_destroy (iobuf_arena); + +                        __iobuf_arena_destroy (iobuf_pool, iobuf_arena);                  }          } @@ -347,7 +362,7 @@ iobuf_pool_new (void)                                  gf_common_mt_iobuf_pool);          if (!iobuf_pool)                  goto out; - +        INIT_LIST_HEAD (&iobuf_pool->all_arenas);          pthread_mutex_init (&iobuf_pool->mutex, NULL);          for (i = 0; i <= IOBUF_ARENA_MAX_INDEX; i++) {                  INIT_LIST_HEAD (&iobuf_pool->arenas[i]); @@ -357,6 +372,16 @@ iobuf_pool_new (void)          iobuf_pool->default_page_size  = 128 * GF_UNIT_KB; +        iobuf_pool->rdma_registration = NULL; +        iobuf_pool->rdma_deregistration = NULL; + +        for (i = 0; i < GF_RDMA_DEVICE_COUNT; i++) { + +                iobuf_pool->device[i] = NULL; +                iobuf_pool->mr_list[i] = NULL; + +        } +          arena_size = 0;          for (i = 0; i < IOBUF_ARENA_MAX_INDEX; i++) {                  page_size = gf_iobuf_init_config[i].pagesize; @@ -393,9 +418,10 @@ __iobuf_arena_prune (struct iobuf_pool *iobuf_pool,          /* All cases matched, destroy */          list_del_init (&iobuf_arena->list); +        list_del_init (&iobuf_arena->all_list);          iobuf_pool->arena_cnt--; -        __iobuf_arena_destroy (iobuf_arena); +     
   __iobuf_arena_destroy (iobuf_pool, iobuf_arena);  out:          return; diff --git a/libglusterfs/src/iobuf.h b/libglusterfs/src/iobuf.h index 4e07910d722..7e5cfe37a28 100644 --- a/libglusterfs/src/iobuf.h +++ b/libglusterfs/src/iobuf.h @@ -19,6 +19,8 @@  #define GF_VARIABLE_IOBUF_COUNT 32 +#define GF_RDMA_DEVICE_COUNT 8 +  /* Lets try to define the new anonymous mapping   * flag, in case the system is still using the   * now deprecated MAP_ANON flag. @@ -81,6 +83,7 @@ struct iobuf_arena {                  };          }; +        struct list_head    all_list;          size_t              page_size;  /* size of all iobufs in this arena */          size_t              arena_size; /* this is equal to                                             (iobuf_pool->arena_size / page_size) @@ -110,6 +113,7 @@ struct iobuf_pool {          size_t              default_page_size; /* default size of iobuf */          int                 arena_cnt; +        struct list_head    all_arenas;          struct list_head    arenas[GF_VARIABLE_IOBUF_COUNT];          /* array of arenas. 
Each element of the array is a list of arenas             holding iobufs of particular page_size */ @@ -121,7 +125,13 @@ struct iobuf_pool {          /* array of of arenas which can be purged */          uint64_t            request_misses; /* mostly the requests for higher -                                               value of iobufs */ +                                              value of iobufs */ +        int                 rdma_device_count; +        struct list_head    *mr_list[GF_RDMA_DEVICE_COUNT]; +        void                *device[GF_RDMA_DEVICE_COUNT]; +        int (*rdma_registration)(void **, void*); +        int (*rdma_deregistration)(struct list_head**, struct iobuf_arena *); +  }; diff --git a/libglusterfs/src/mem-types.h b/libglusterfs/src/mem-types.h index 4f566f9ec57..4359488c5f9 100644 --- a/libglusterfs/src/mem-types.h +++ b/libglusterfs/src/mem-types.h @@ -126,6 +126,7 @@ enum gf_common_mem_types_ {  	gf_common_mt_strfd_data_t         = 110,          gf_common_mt_regex_t              = 111,          gf_common_mt_wr                   = 112, +        gf_common_mt_rdma_arena_mr        = 113,          gf_common_mt_end  };  #endif diff --git a/rpc/rpc-transport/rdma/src/rdma.c b/rpc/rpc-transport/rdma/src/rdma.c index 92d5da258f2..cb5ce77291e 100644 --- a/rpc/rpc-transport/rdma/src/rdma.c +++ b/rpc/rpc-transport/rdma/src/rdma.c @@ -15,6 +15,7 @@  #include "dict.h"  #include "glusterfs.h" +#include "iobuf.h"  #include "logging.h"  #include "rdma.h"  #include "name.h" @@ -361,6 +362,135 @@ gf_rdma_post_recv (struct ibv_srq *srq,          return ibv_post_srq_recv (srq, &wr, &bad_wr);  } +int +gf_rdma_deregister_arena (struct list_head **mr_list, +                          struct iobuf_arena *iobuf_arena) +{ +        gf_rdma_arena_mr *tmp     = NULL; +        int               count   = 0, i = 0; + +        count = iobuf_arena->iobuf_pool->rdma_device_count; +        for (i = 0; i < count; i++) { +                list_for_each_entry(tmp, 
mr_list[i], list) { +                        if (tmp->iobuf_arena == iobuf_arena) { +                                if (ibv_dereg_mr(tmp->mr)) { +                                        gf_log("rdma", GF_LOG_WARNING, +                                        "deallocation of memory region " +                                        "failed"); +                                        return -1; +                                } +                                list_del(&tmp->list); +                                GF_FREE(tmp); +                                break; +                        } +                } +        } + +        return 0; +} + + +int +gf_rdma_register_arena (void **arg1, void *arg2) +{ +        struct ibv_mr       *mr          = NULL; +        gf_rdma_arena_mr    *new         = NULL; +        struct iobuf_pool   *iobuf_pool  = NULL; +        gf_rdma_device_t    **device     = (gf_rdma_device_t **)arg1; +        struct iobuf_arena  *iobuf_arena = arg2; +        int                  count       = 0, i = 0; + +        iobuf_pool = iobuf_arena->iobuf_pool; +        count = iobuf_pool->rdma_device_count; +        for (i = 0; i < count; i++) { +                new = GF_CALLOC(1, sizeof(gf_rdma_arena_mr), +                                gf_common_mt_rdma_arena_mr); +                INIT_LIST_HEAD (&new->list); +                new->iobuf_arena = iobuf_arena; + +                mr = ibv_reg_mr(device[i]->pd, iobuf_arena->mem_base, +                                         iobuf_arena->arena_size, +                                         IBV_ACCESS_REMOTE_READ | +                                         IBV_ACCESS_LOCAL_WRITE | +                                         IBV_ACCESS_REMOTE_WRITE +                                         ); +                if (!mr) +                        gf_log("rdma", GF_LOG_WARNING, +                               "allocation of mr failed"); + +                new->mr = mr; +                list_add (&new->list, 
&device[i]->all_mr); +                new = NULL; +        } + +        return 0; + +} + +static void +gf_rdma_register_iobuf_pool (rpc_transport_t *this) +{ +        struct iobuf_pool   *iobuf_pool = NULL; +        struct iobuf_arena  *tmp        = NULL; +        gf_rdma_private_t   *priv       = NULL; +        gf_rdma_device_t    *device     = NULL; +        struct ibv_mr       *mr         = NULL; +        gf_rdma_arena_mr    *new        = NULL; + +        priv = this->private; +        device = priv->device; +        iobuf_pool = this->ctx->iobuf_pool; + +        if (!list_empty(&iobuf_pool->all_arenas)) { + +                list_for_each_entry (tmp, &iobuf_pool->all_arenas, all_list) { +                        new = GF_CALLOC(1, sizeof(gf_rdma_arena_mr), +                                        gf_common_mt_rdma_arena_mr); +                        INIT_LIST_HEAD (&new->list); +                        new->iobuf_arena = tmp; + +                        mr = ibv_reg_mr(device->pd, tmp->mem_base, +                                        tmp->arena_size, +                                        IBV_ACCESS_REMOTE_READ | +                                        IBV_ACCESS_LOCAL_WRITE | +                                        IBV_ACCESS_REMOTE_WRITE); +                        if (!mr) { +                                gf_log ("rdma", GF_LOG_WARNING, "failed to pre" +                                        " register buffers with rdma " +                                        "devices."); + +                        } +                        new->mr = mr; +                        list_add (&new->list, &device->all_mr); + +                        new = NULL; +                } +        } + +       return; +} + +static struct ibv_mr* +gf_rdma_get_pre_registred_mr(rpc_transport_t *this, void *ptr, int size) +{ +        gf_rdma_arena_mr  *tmp        = NULL; +        gf_rdma_private_t  *priv       = NULL; +        gf_rdma_device_t   *device     = NULL; + +        priv = 
this->private; +        device = priv->device; + +        if (!list_empty(&device->all_mr)) { +                list_for_each_entry (tmp, &device->all_mr, list) { +                        if (tmp->iobuf_arena->mem_base <= ptr && +                            ptr < tmp->iobuf_arena->mem_base + +                            tmp->iobuf_arena->arena_size) +                                return tmp->mr; +                        } +        } + +        return NULL; +}  static int32_t  gf_rdma_create_posts (rpc_transport_t *this) @@ -510,11 +640,13 @@ gf_rdma_get_device (rpc_transport_t *this, struct ibv_context *ibctx,          int32_t            i        = 0;          gf_rdma_device_t  *trav     = NULL, *device = NULL;          gf_rdma_ctx_t     *rdma_ctx = NULL; +        struct iobuf_pool *iobuf_pool = NULL;          priv        = this->private;          options     = &priv->options;          ctx         = this->ctx;          rdma_ctx    = ctx->ib; +        iobuf_pool = ctx->iobuf_pool;          trav = rdma_ctx->device; @@ -530,10 +662,10 @@ gf_rdma_get_device (rpc_transport_t *this, struct ibv_context *ibctx,                  if (trav == NULL) {                          goto out;                  } -                  priv->device = trav;                  trav->context = ibctx; - +                iobuf_pool->device[iobuf_pool->rdma_device_count] = trav; +                iobuf_pool->mr_list[iobuf_pool->rdma_device_count++] = &trav->all_mr;                  trav->request_ctx_pool                          = mem_pool_new (gf_rdma_request_context_t,                                          GF_RDMA_POOL_SIZE); @@ -613,6 +745,9 @@ gf_rdma_get_device (rpc_transport_t *this, struct ibv_context *ibctx,                  gf_rdma_queue_init (&trav->sendq);                  gf_rdma_queue_init (&trav->recvq); +                INIT_LIST_HEAD (&trav->all_mr); +                gf_rdma_register_iobuf_pool(this); +                  if (gf_rdma_create_posts (this) < 0) {                      
    gf_log (this->name, GF_LOG_ERROR,                                  "could not allocate posts for device (%s)", @@ -1239,9 +1374,13 @@ __gf_rdma_create_read_chunks_from_vector (gf_rdma_peer_t *peer,                  readch->rc_discrim = hton32 (1);                  readch->rc_position = hton32 (*pos); +                mr = gf_rdma_get_pre_registred_mr(peer->trans, +                                (void *)vector[i].iov_base, vector[i].iov_len); +                if (!mr) {                  mr = ibv_reg_mr (device->pd, vector[i].iov_base,                                   vector[i].iov_len,                                   IBV_ACCESS_REMOTE_READ); +                }                  if (!mr) {                          gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,                                  "memory registration failed (%s) (peer:%s)", @@ -1374,10 +1513,16 @@ __gf_rdma_create_write_chunks_from_vector (gf_rdma_peer_t *peer,          device = priv->device;          for (i = 0; i < count; i++) { + +                mr = gf_rdma_get_pre_registred_mr(peer->trans, +                                (void *)vector[i].iov_base, vector[i].iov_len); +                if (!mr) {                  mr = ibv_reg_mr (device->pd, vector[i].iov_base,                                   vector[i].iov_len,                                   IBV_ACCESS_REMOTE_WRITE                                   | IBV_ACCESS_LOCAL_WRITE); +                } +                  if (!mr) {                          gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,                                  "memory registration failed (%s) (peer:%s)", @@ -1504,16 +1649,30 @@ out:  static inline void -__gf_rdma_deregister_mr (struct ibv_mr **mr, int count) +__gf_rdma_deregister_mr (gf_rdma_device_t *device, +                         struct ibv_mr **mr, int count)  { -        int i = 0; +        gf_rdma_arena_mr    *tmp   = NULL; +        int                  i     = 0; +        int                  found = 0; -        if (mr == 
NULL) { +               if (mr == NULL) {                  goto out;          }          for (i = 0; i < count; i++) { -                ibv_dereg_mr (mr[i]); +                 found = 0; +                 if (!list_empty(&device->all_mr)) { +                 list_for_each_entry(tmp, &device->all_mr, list) { +                        if (tmp->mr == mr[i]) { +                                found = 1; +                                break; +                        } +                 } +                 } +                if (!found) +                        ibv_dereg_mr (mr[i]); +          }  out: @@ -1558,9 +1717,10 @@ gf_rdma_quota_put (gf_rdma_peer_t *peer)  void  __gf_rdma_request_context_destroy (gf_rdma_request_context_t *context)  { -        gf_rdma_peer_t    *peer = NULL; -        gf_rdma_private_t *priv = NULL; -        int32_t            ret  = 0; +        gf_rdma_peer_t    *peer   = NULL; +        gf_rdma_private_t *priv   = NULL; +        gf_rdma_device_t  *device = NULL; +        int32_t            ret    = 0;          if (context == NULL) {                  goto out; @@ -1568,9 +1728,10 @@ __gf_rdma_request_context_destroy (gf_rdma_request_context_t *context)          peer = context->peer; -        __gf_rdma_deregister_mr (context->mr, context->mr_count); -          priv = peer->trans->private; +        device = priv->device; +        __gf_rdma_deregister_mr (device, context->mr, context->mr_count); +          if (priv->connected) {                  ret = __gf_rdma_quota_put (peer); @@ -1602,13 +1763,14 @@ out:  void -gf_rdma_post_context_destroy (gf_rdma_post_context_t *ctx) +gf_rdma_post_context_destroy (gf_rdma_device_t *device, +                              gf_rdma_post_context_t *ctx)  {          if (ctx == NULL) {                  goto out;          } -        __gf_rdma_deregister_mr (ctx->mr, ctx->mr_count); +        __gf_rdma_deregister_mr (device, ctx->mr, ctx->mr_count);          if (ctx->iobref != NULL) {                  iobref_unref 
(ctx->iobref); @@ -1640,7 +1802,7 @@ gf_rdma_post_unref (gf_rdma_post_t *post)          pthread_mutex_unlock (&post->lock);          if (refcount == 0) { -                gf_rdma_post_context_destroy (&post->ctx); +                gf_rdma_post_context_destroy (post->device, &post->ctx);                  if (post->type == GF_RDMA_SEND_POST) {                          gf_rdma_put_post (&post->device->sendq, post);                  } else { @@ -2060,10 +2222,16 @@ __gf_rdma_register_local_mr_for_rdma (gf_rdma_peer_t *peer,                   * Infiniband Architecture Specification Volume 1                   * (Release 1.2.1)                   */ +                ctx->mr[ctx->mr_count] = gf_rdma_get_pre_registred_mr( +                                peer->trans, (void *)vector[i].iov_base, +                                vector[i].iov_len); + +                if (!ctx->mr[ctx->mr_count]) {                  ctx->mr[ctx->mr_count] = ibv_reg_mr (device->pd,                                                       vector[i].iov_base,                                                       vector[i].iov_len,                                                       IBV_ACCESS_LOCAL_WRITE); +                }                  if (ctx->mr[ctx->mr_count] == NULL) {                          gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,                                  "registering memory for IBV_ACCESS_LOCAL_WRITE " @@ -4553,6 +4721,7 @@ int32_t  init (rpc_transport_t *this)  {          gf_rdma_private_t *priv = NULL; +        struct iobuf_pool *iobuf_pool = NULL;          priv = GF_CALLOC (1, sizeof (*priv), gf_common_mt_rdma_private_t);          if (!priv) @@ -4565,6 +4734,9 @@ init (rpc_transport_t *this)                          "Failed to initialize IB Device");                  return -1;          } +        iobuf_pool = this->ctx->iobuf_pool; +        iobuf_pool->rdma_registration = gf_rdma_register_arena; +        iobuf_pool->rdma_deregistration = gf_rdma_deregister_arena;          
return 0;  } diff --git a/rpc/rpc-transport/rdma/src/rdma.h b/rpc/rpc-transport/rdma/src/rdma.h index 7f76244f071..fda01aa53ef 100644 --- a/rpc/rpc-transport/rdma/src/rdma.h +++ b/rpc/rpc-transport/rdma/src/rdma.h @@ -34,6 +34,7 @@  /* FIXME: give appropriate values to these macros */  #define GF_DEFAULT_RDMA_LISTEN_PORT (GF_DEFAULT_BASE_PORT + 1) +  /* If you are changing GF_RDMA_MAX_SEGMENTS, please make sure to update   * GLUSTERFS_GF_RDMA_MAX_HEADER_SIZE defined in glusterfs.h .   */ @@ -328,9 +329,18 @@ struct __gf_rdma_device {          struct mem_pool *request_ctx_pool;          struct mem_pool *ioq_pool;          struct mem_pool *reply_info_pool; +        struct list_head all_mr;  };  typedef struct __gf_rdma_device gf_rdma_device_t; + +struct __gf_rdma_arena_mr { +        struct list_head list; +        struct iobuf_arena *iobuf_arena; +        struct ibv_mr *mr; +}; + +typedef struct __gf_rdma_arena_mr gf_rdma_arena_mr;  struct __gf_rdma_ctx {          gf_rdma_device_t          *device;          struct rdma_event_channel *rdma_cm_event_channel;  | 
