Diffstat (limited to 'libglusterfs/src/mem-pool.c')
-rw-r--r--  libglusterfs/src/mem-pool.c  |  687
1 file changed, 338 insertions(+), 349 deletions(-)
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index 4ef62b8da48..2d5a12b0a00 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -8,14 +8,14 @@
cases as published by the Free Software Foundation.
*/
-#include "mem-pool.h"
-#include "logging.h"
-#include "xlator.h"
+#include "glusterfs/mem-pool.h"
+#include "glusterfs/common-utils.h" // for GF_ASSERT, gf_thread_cr...
+#include "glusterfs/globals.h" // for xlator_t, THIS
#include <stdlib.h>
#include <stdarg.h>
#include "unittest/unittest.h"
-#include "libglusterfs-messages.h"
+#include "glusterfs/libglusterfs-messages.h"
void
gf_mem_acct_enable_set(void *data)
@@ -35,61 +35,101 @@ gf_mem_acct_enable_set(void *data)
return;
}
-int
-gf_mem_set_acct_info(xlator_t *xl, char **alloc_ptr, size_t size, uint32_t type,
- const char *typestr)
+static void *
+gf_mem_header_prepare(struct mem_header *header, size_t size)
{
- void *ptr = NULL;
- struct mem_header *header = NULL;
+ void *ptr;
- if (!alloc_ptr)
- return -1;
+ header->size = size;
- ptr = *alloc_ptr;
+ ptr = header + 1;
- GF_ASSERT(xl != NULL);
+ /* data follows in this gap of 'size' bytes */
+ *(uint32_t *)(ptr + size) = GF_MEM_TRAILER_MAGIC;
- GF_ASSERT(xl->mem_acct != NULL);
+ return ptr;
+}
- GF_ASSERT(type <= xl->mem_acct->num_types);
+static void *
+gf_mem_set_acct_info(struct mem_acct *mem_acct, struct mem_header *header,
+ size_t size, uint32_t type, const char *typestr)
+{
+ struct mem_acct_rec *rec = NULL;
+ bool new_ref = false;
- LOCK(&xl->mem_acct->rec[type].lock);
- {
- if (!xl->mem_acct->rec[type].typestr)
- xl->mem_acct->rec[type].typestr = typestr;
- xl->mem_acct->rec[type].size += size;
- xl->mem_acct->rec[type].num_allocs++;
- xl->mem_acct->rec[type].total_allocs++;
- xl->mem_acct->rec[type].max_size = max(xl->mem_acct->rec[type].max_size,
- xl->mem_acct->rec[type].size);
- xl->mem_acct->rec[type].max_num_allocs = max(
- xl->mem_acct->rec[type].max_num_allocs,
- xl->mem_acct->rec[type].num_allocs);
- }
- UNLOCK(&xl->mem_acct->rec[type].lock);
+ if (mem_acct != NULL) {
+ GF_ASSERT(type <= mem_acct->num_types);
- GF_ATOMIC_INC(xl->mem_acct->refcnt);
+ rec = &mem_acct->rec[type];
+ LOCK(&rec->lock);
+ {
+ if (!rec->typestr) {
+ rec->typestr = typestr;
+ }
+ rec->size += size;
+ new_ref = (rec->num_allocs == 0);
+ rec->num_allocs++;
+ rec->total_allocs++;
+ rec->max_size = max(rec->max_size, rec->size);
+ rec->max_num_allocs = max(rec->max_num_allocs, rec->num_allocs);
+
+#ifdef DEBUG
+ list_add(&header->acct_list, &rec->obj_list);
+#endif
+ }
+ UNLOCK(&rec->lock);
+
+ /* We only take a reference for each memory type used, not for each
+ * allocation. This minimizes the use of atomic operations. */
+ if (new_ref) {
+ GF_ATOMIC_INC(mem_acct->refcnt);
+ }
+ }
- header = (struct mem_header *)ptr;
header->type = type;
- header->size = size;
- header->mem_acct = xl->mem_acct;
+ header->mem_acct = mem_acct;
header->magic = GF_MEM_HEADER_MAGIC;
+ return gf_mem_header_prepare(header, size);
+}
+
+static void *
+gf_mem_update_acct_info(struct mem_acct *mem_acct, struct mem_header *header,
+ size_t size)
+{
+ struct mem_acct_rec *rec = NULL;
+
+ if (mem_acct != NULL) {
+ rec = &mem_acct->rec[header->type];
+ LOCK(&rec->lock);
+ {
+ rec->size += size - header->size;
+ rec->total_allocs++;
+ rec->max_size = max(rec->max_size, rec->size);
+
#ifdef DEBUG
- INIT_LIST_HEAD(&header->acct_list);
- LOCK(&xl->mem_acct->rec[type].lock);
- {
- list_add(&header->acct_list, &(xl->mem_acct->rec[type].obj_list));
- }
- UNLOCK(&xl->mem_acct->rec[type].lock);
+ /* The old 'header' was already present in 'obj_list', but
+ * realloc() could have changed its address. We need to remove
+ * the old item from the list and add the new one. list_move()
+ * works here because it never dereferences the stale pointers
+ * to the old location that are still stored in the list; it
+ * simply overwrites them. */
+ list_move(&header->acct_list, &rec->obj_list);
#endif
- ptr += sizeof(struct mem_header);
- /* data follows in this gap of 'size' bytes */
- *(uint32_t *)(ptr + size) = GF_MEM_TRAILER_MAGIC;
+ }
+ UNLOCK(&rec->lock);
+ }
+
+ return gf_mem_header_prepare(header, size);
+}
- *alloc_ptr = ptr;
- return 0;
+static bool
+gf_mem_acct_enabled(void)
+{
+ xlator_t *x = THIS;
+ /* Low-level __gf_xxx() may be called
+ * before ctx is initialized. */
+ return x->ctx && x->ctx->mem_acct_enable;
}
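
For orientation, the block layout that gf_mem_set_acct_info() and gf_mem_header_prepare() produce looks like this (an illustrative diagram added for this writeup, not part of the patch):

/*
 *  +--------------------+----------------------------+----------------------+
 *  | struct mem_header  | 'size' bytes of user data  | GF_MEM_TRAILER_MAGIC |
 *  | (magic, type, size,| (the pointer returned to   | (uint32_t written at |
 *  |  mem_acct, ...)    |  the caller starts here)   |  ptr + size)         |
 *  +--------------------+----------------------------+----------------------+
 *
 * The free path steps back GF_MEM_HEADER_SIZE bytes from the caller's
 * pointer to recover the header, and the trailer magic makes writes past
 * the requested size detectable.
 */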
void *
@@ -97,10 +137,10 @@ __gf_calloc(size_t nmemb, size_t size, uint32_t type, const char *typestr)
{
size_t tot_size = 0;
size_t req_size = 0;
- char *ptr = NULL;
+ void *ptr = NULL;
xlator_t *xl = NULL;
- if (!THIS->ctx->mem_acct_enable)
+ if (!gf_mem_acct_enabled())
return CALLOC(nmemb, size);
xl = THIS;
@@ -114,19 +154,18 @@ __gf_calloc(size_t nmemb, size_t size, uint32_t type, const char *typestr)
gf_msg_nomem("", GF_LOG_ALERT, tot_size);
return NULL;
}
- gf_mem_set_acct_info(xl, &ptr, req_size, type, typestr);
- return (void *)ptr;
+ return gf_mem_set_acct_info(xl->mem_acct, ptr, req_size, type, typestr);
}
void *
__gf_malloc(size_t size, uint32_t type, const char *typestr)
{
size_t tot_size = 0;
- char *ptr = NULL;
+ void *ptr = NULL;
xlator_t *xl = NULL;
- if (!THIS->ctx->mem_acct_enable)
+ if (!gf_mem_acct_enabled())
return MALLOC(size);
xl = THIS;
@@ -138,84 +177,32 @@ __gf_malloc(size_t size, uint32_t type, const char *typestr)
gf_msg_nomem("", GF_LOG_ALERT, tot_size);
return NULL;
}
- gf_mem_set_acct_info(xl, &ptr, size, type, typestr);
- return (void *)ptr;
+ return gf_mem_set_acct_info(xl->mem_acct, ptr, size, type, typestr);
}
void *
__gf_realloc(void *ptr, size_t size)
{
size_t tot_size = 0;
- char *new_ptr;
- struct mem_header *old_header = NULL;
- struct mem_header *new_header = NULL;
- struct mem_header tmp_header;
+ struct mem_header *header = NULL;
- if (!THIS->ctx->mem_acct_enable)
+ if (!gf_mem_acct_enabled())
return REALLOC(ptr, size);
REQUIRE(NULL != ptr);
- old_header = (struct mem_header *)(ptr - GF_MEM_HEADER_SIZE);
- GF_ASSERT(old_header->magic == GF_MEM_HEADER_MAGIC);
- tmp_header = *old_header;
-
-#ifdef DEBUG
- int type = 0;
- size_t copy_size = 0;
-
- /* Making these changes for realloc is not straightforward. So
- * I am simulating realloc using calloc and free
- */
-
- type = tmp_header.type;
- new_ptr = __gf_calloc(1, size, type,
- tmp_header.mem_acct->rec[type].typestr);
- if (new_ptr) {
- copy_size = (size > tmp_header.size) ? tmp_header.size : size;
- memcpy(new_ptr, ptr, copy_size);
- __gf_free(ptr);
- }
-
- /* This is not quite what the man page says should happen */
- return new_ptr;
-#endif
+ header = (struct mem_header *)(ptr - GF_MEM_HEADER_SIZE);
+ GF_ASSERT(header->magic == GF_MEM_HEADER_MAGIC);
tot_size = size + GF_MEM_HEADER_SIZE + GF_MEM_TRAILER_SIZE;
- new_ptr = realloc(old_header, tot_size);
- if (!new_ptr) {
+ header = realloc(header, tot_size);
+ if (!header) {
gf_msg_nomem("", GF_LOG_ALERT, tot_size);
return NULL;
}
- /*
- * We used to pass (char **)&ptr as the second
- * argument after the value of realloc was saved
- * in ptr, but the compiler warnings complained
- * about the casting to and forth from void ** to
- * char **.
- * TBD: it would be nice to adjust the memory accounting info here,
- * but calling gf_mem_set_acct_info here is wrong because it bumps
- * up counts as though this is a new allocation - which it's not.
- * The consequence of doing nothing here is only that the sizes will be
- * wrong, but at least the counts won't be.
- uint32_t type = 0;
- xlator_t *xl = NULL;
- type = header->type;
- xl = (xlator_t *) header->xlator;
- gf_mem_set_acct_info (xl, &new_ptr, size, type, NULL);
- */
-
- new_header = (struct mem_header *)new_ptr;
- *new_header = tmp_header;
- new_header->size = size;
-
- new_ptr += sizeof(struct mem_header);
- /* data follows in this gap of 'size' bytes */
- *(uint32_t *)(new_ptr + size) = GF_MEM_TRAILER_MAGIC;
-
- return (void *)new_ptr;
+ return gf_mem_update_acct_info(header->mem_acct, header, size);
}
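
To make the delta accounting in gf_mem_update_acct_info() concrete, a worked example with hypothetical numbers:

/*
 * Suppose rec->size == 1000 and the reallocated block has header->size == 100:
 *
 *   __gf_realloc(ptr, 300):  rec->size += 300 - 100  ->  1200
 *   __gf_realloc(ptr, 50):   rec->size += 50 - 300   ->  950
 *
 * total_allocs grows on every call, while num_allocs stays unchanged: no new
 * object came into existence, an existing one merely changed size.
 */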
int
@@ -321,8 +308,9 @@ __gf_free(void *free_ptr)
void *ptr = NULL;
struct mem_acct *mem_acct;
struct mem_header *header = NULL;
+ bool last_ref = false;
- if (!THIS->ctx->mem_acct_enable) {
+ if (!gf_mem_acct_enabled()) {
FREE(free_ptr);
return;
}
@@ -348,21 +336,22 @@ __gf_free(void *free_ptr)
LOCK(&mem_acct->rec[header->type].lock);
{
- GF_ASSERT(mem_acct->rec[header->type].size >= header->size);
mem_acct->rec[header->type].size -= header->size;
mem_acct->rec[header->type].num_allocs--;
/* If all the instances are freed up then ensure typestr is set
* to NULL */
- if (!mem_acct->rec[header->type].num_allocs)
+ if (!mem_acct->rec[header->type].num_allocs) {
+ last_ref = true;
mem_acct->rec[header->type].typestr = NULL;
+ }
#ifdef DEBUG
list_del(&header->acct_list);
#endif
}
UNLOCK(&mem_acct->rec[header->type].lock);
- if (GF_ATOMIC_DEC(mem_acct->refcnt) == 0) {
- FREE(mem_acct);
+ if (last_ref) {
+ xlator_mem_acct_unref(mem_acct);
}
free:
@@ -373,11 +362,30 @@ free:
FREE(ptr);
}
-#define POOL_SMALLEST 7 /* i.e. 128 */
-#define POOL_LARGEST 20 /* i.e. 1048576 */
-#define NPOOLS (POOL_LARGEST - POOL_SMALLEST + 1)
+#if defined(GF_DISABLE_MEMPOOL)
+
+struct mem_pool *
+mem_pool_new_fn(glusterfs_ctx_t *ctx, unsigned long sizeof_type,
+ unsigned long count, char *name)
+{
+ struct mem_pool *new;
+
+ new = GF_MALLOC(sizeof(struct mem_pool), gf_common_mt_mem_pool);
+ if (!new)
+ return NULL;
+
+ new->sizeof_type = sizeof_type;
+ return new;
+}
+
+void
+mem_pool_destroy(struct mem_pool *pool)
+{
+ GF_FREE(pool);
+}
+
+#else /* !GF_DISABLE_MEMPOOL */
-static pthread_key_t pool_key;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct list_head pool_threads;
static pthread_mutex_t pool_free_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -385,23 +393,18 @@ static struct list_head pool_free_threads;
static struct mem_pool_shared pools[NPOOLS];
static size_t pool_list_size;
-#if !defined(GF_DISABLE_MEMPOOL)
+static __thread per_thread_pool_list_t *thread_pool_list = NULL;
+
#define N_COLD_LISTS 1024
#define POOL_SWEEP_SECS 30
-static unsigned long sweep_times;
-static unsigned long sweep_usecs;
-static unsigned long frees_to_system;
-
typedef struct {
- struct list_head death_row;
pooled_obj_hdr_t *cold_lists[N_COLD_LISTS];
unsigned int n_cold_lists;
} sweep_state_t;
enum init_state {
GF_MEMPOOL_INIT_NONE = 0,
- GF_MEMPOOL_INIT_PREINIT,
GF_MEMPOOL_INIT_EARLY,
GF_MEMPOOL_INIT_LATE,
GF_MEMPOOL_INIT_DESTROY
@@ -412,36 +415,33 @@ static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned int init_count = 0;
static pthread_t sweeper_tid;
-gf_boolean_t
+static bool
collect_garbage(sweep_state_t *state, per_thread_pool_list_t *pool_list)
{
unsigned int i;
per_thread_pool_t *pt_pool;
- gf_boolean_t poisoned;
(void)pthread_spin_lock(&pool_list->lock);
- poisoned = pool_list->poison != 0;
- if (!poisoned) {
- for (i = 0; i < NPOOLS; ++i) {
- pt_pool = &pool_list->pools[i];
- if (pt_pool->cold_list) {
- if (state->n_cold_lists >= N_COLD_LISTS) {
- break;
- }
- state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
+ for (i = 0; i < NPOOLS; ++i) {
+ pt_pool = &pool_list->pools[i];
+ if (pt_pool->cold_list) {
+ if (state->n_cold_lists >= N_COLD_LISTS) {
+ (void)pthread_spin_unlock(&pool_list->lock);
+ return true;
}
- pt_pool->cold_list = pt_pool->hot_list;
- pt_pool->hot_list = NULL;
+ state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
}
+ pt_pool->cold_list = pt_pool->hot_list;
+ pt_pool->hot_list = NULL;
}
(void)pthread_spin_unlock(&pool_list->lock);
- return poisoned;
+ return false;
}
-void
+static void
free_obj_list(pooled_obj_hdr_t *victim)
{
pooled_obj_hdr_t *next;
@@ -450,93 +450,101 @@ free_obj_list(pooled_obj_hdr_t *victim)
next = victim->next;
free(victim);
victim = next;
- ++frees_to_system;
}
}
-void *
+static void *
pool_sweeper(void *arg)
{
sweep_state_t state;
per_thread_pool_list_t *pool_list;
- per_thread_pool_list_t *next_pl;
- per_thread_pool_t *pt_pool;
- unsigned int i;
- struct timeval begin_time;
- struct timeval end_time;
- struct timeval elapsed;
- gf_boolean_t poisoned;
+ uint32_t i;
+ bool pending;
/*
* This is all a bit inelegant, but the point is to avoid doing
* expensive things (like freeing thousands of objects) while holding a
- * global lock. Thus, we split each iteration into three passes, with
+ * global lock. Thus, we split each iteration into two passes, with
* only the first and fastest holding the lock.
*/
+ pending = true;
+
for (;;) {
- sleep(POOL_SWEEP_SECS);
+ /* If we know there's pending work to do (or this is the first run),
+ * we collect garbage more often. */
+ sleep(pending ? POOL_SWEEP_SECS / 5 : POOL_SWEEP_SECS);
+
(void)pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
- INIT_LIST_HEAD(&state.death_row);
state.n_cold_lists = 0;
+ pending = false;
/* First pass: collect stuff that needs our attention. */
- (void)gettimeofday(&begin_time, NULL);
(void)pthread_mutex_lock(&pool_lock);
- list_for_each_entry_safe(pool_list, next_pl, &pool_threads, thr_list)
+ list_for_each_entry(pool_list, &pool_threads, thr_list)
{
- (void)pthread_mutex_unlock(&pool_lock);
- poisoned = collect_garbage(&state, pool_list);
- (void)pthread_mutex_lock(&pool_lock);
-
- if (poisoned) {
- list_move(&pool_list->thr_list, &state.death_row);
+ if (collect_garbage(&state, pool_list)) {
+ pending = true;
}
}
(void)pthread_mutex_unlock(&pool_lock);
- (void)gettimeofday(&end_time, NULL);
- timersub(&end_time, &begin_time, &elapsed);
- sweep_usecs += elapsed.tv_sec * 1000000 + elapsed.tv_usec;
- sweep_times += 1;
-
- /* Second pass: free dead pools. */
- (void)pthread_mutex_lock(&pool_free_lock);
- list_for_each_entry_safe(pool_list, next_pl, &state.death_row, thr_list)
- {
- for (i = 0; i < NPOOLS; ++i) {
- pt_pool = &pool_list->pools[i];
- free_obj_list(pt_pool->cold_list);
- free_obj_list(pt_pool->hot_list);
- pt_pool->hot_list = pt_pool->cold_list = NULL;
- }
- list_del(&pool_list->thr_list);
- list_add(&pool_list->thr_list, &pool_free_threads);
- }
- (void)pthread_mutex_unlock(&pool_free_lock);
- /* Third pass: free cold objects from live pools. */
+ /* Second pass: free cold objects from live pools. */
for (i = 0; i < state.n_cold_lists; ++i) {
free_obj_list(state.cold_lists[i]);
}
(void)pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
}
+
+ return NULL;
}
void
-pool_destructor(void *arg)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
{
- per_thread_pool_list_t *pool_list = arg;
+ per_thread_pool_t *pt_pool;
+ uint32_t i;
- /* The pool-sweeper thread will take it from here.
- *
- * We can change 'poison' here without taking locks because the change
- * itself doesn't interact with other parts of the code and a simple write
- * is already atomic from the point of view of the processor.
- *
- * This change can modify what mem_put() does, but both possibilities are
- * fine until the sweeper thread kicks in. The real synchronization must be
- * between mem_put() and the sweeper thread. */
- pool_list->poison = 1;
+ if (pool_list == NULL) {
+ pool_list = thread_pool_list;
+ }
+
+ /* The current thread is terminating. None of the allocated objects will
+ * be used again. We can directly destroy them here instead of delaying
+ * it until the next sweeper loop. */
+ if (pool_list != NULL) {
+ /* Remove pool_list from the global list so that the sweeper
+ * cannot touch it. */
+ pthread_mutex_lock(&pool_lock);
+ list_del(&pool_list->thr_list);
+ pthread_mutex_unlock(&pool_lock);
+
+ /* We need to protect hot/cold changes from potential mem_put() calls
+ * that reference this pool_list. Once poison is set to true, we are
+ * sure that no one else will touch the hot/cold lists. The only
+ * possible race is a mem_put() adding a new item to the hot list at
+ * that same moment; taking pool_list->lock protects against it.
+ * After that we don't need the lock to destroy the hot/cold lists. */
+ pthread_spin_lock(&pool_list->lock);
+ pool_list->poison = true;
+ pthread_spin_unlock(&pool_list->lock);
+
+ for (i = 0; i < NPOOLS; i++) {
+ pt_pool = &pool_list->pools[i];
+
+ free_obj_list(pt_pool->hot_list);
+ pt_pool->hot_list = NULL;
+
+ free_obj_list(pt_pool->cold_list);
+ pt_pool->cold_list = NULL;
+ }
+
+ pthread_mutex_lock(&pool_free_lock);
+ list_add(&pool_list->thr_list, &pool_free_threads);
+ pthread_mutex_unlock(&pool_free_lock);
+
+ thread_pool_list = NULL;
+ }
}
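
The mem_put() side of this poison handshake is not shown in this hunk; the following is a simplified sketch of how a returned object observes the flag, based only on the fields used above (illustrative, not the patch's exact code):

static void
mem_put_sketch(per_thread_pool_list_t *pool_list, per_thread_pool_t *pt_pool,
               pooled_obj_hdr_t *hdr)
{
    (void)pthread_spin_lock(&pool_list->lock);
    if (!pool_list->poison) {
        /* Owner thread still alive: push onto its hot list for reuse. */
        hdr->next = pt_pool->hot_list;
        pt_pool->hot_list = hdr;
        hdr = NULL;
    }
    (void)pthread_spin_unlock(&pool_list->lock);

    if (hdr != NULL) {
        /* Owner already ran the destructor: return directly to the system. */
        free(hdr);
    }
}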
static __attribute__((constructor)) void
@@ -559,46 +567,30 @@ mem_pools_preinit(void)
pool_list_size = sizeof(per_thread_pool_list_t) +
sizeof(per_thread_pool_t) * (NPOOLS - 1);
- init_done = GF_MEMPOOL_INIT_PREINIT;
+ init_done = GF_MEMPOOL_INIT_EARLY;
}
-/* Use mem_pools_init_early() function for basic initialization. There will be
- * no cleanup done by the pool_sweeper thread until mem_pools_init_late() has
- * been called. Calling mem_get() will be possible after this function has
- * setup the basic structures. */
-void
-mem_pools_init_early(void)
+static __attribute__((destructor)) void
+mem_pools_postfini(void)
{
- pthread_mutex_lock(&init_mutex);
- /* Use a pthread_key destructor to clean up when a thread exits.
+ /* TODO: This function should destroy all per-thread memory pools that
+ * are still alive, but this is not possible right now because glibc
+ * starts calling destructors as soon as exit() is called, and
+ * gluster doesn't ensure that all threads have been stopped before
+ * calling exit(). If we destroyed things here, surviving threads
+ * would crash as soon as they tried to use memory or to terminate.
*
- * We won't increase init_count here, that is only done when the
- * pool_sweeper thread is started too.
- */
- if (init_done == GF_MEMPOOL_INIT_PREINIT ||
- init_done == GF_MEMPOOL_INIT_DESTROY) {
- /* key has not been created yet */
- if (pthread_key_create(&pool_key, pool_destructor) != 0) {
- gf_log("mem-pool", GF_LOG_CRITICAL,
- "failed to initialize mem-pool key");
- }
-
- init_done = GF_MEMPOOL_INIT_EARLY;
- } else {
- gf_log("mem-pool", GF_LOG_CRITICAL,
- "incorrect order of mem-pool initialization "
- "(init_done=%d)",
- init_done);
- }
-
- pthread_mutex_unlock(&init_mutex);
+ * When we properly terminate all threads, we can add the needed
+ * code here. Until then we need to leave the memory allocated. Most
+ * likely this function will be executed on process termination, so
+ * the memory will be released by the system anyway. */
}
-/* Call mem_pools_init_late() once threading has been configured completely.
- * This prevent the pool_sweeper thread from getting killed once the main()
- * thread exits during deamonizing. */
+/* Call mem_pools_init() once threading has been configured completely. This
+ * prevents the pool_sweeper thread from getting killed once the main() thread
+ * exits during daemonizing. */
void
-mem_pools_init_late(void)
+mem_pools_init(void)
{
pthread_mutex_lock(&init_mutex);
if ((init_count++) == 0) {
@@ -617,56 +609,27 @@ mem_pools_fini(void)
switch (init_count) {
case 0:
/*
- * If init_count is already zero (as e.g. if somebody called
- * this before mem_pools_init_late) then the sweeper was
- * probably never even started so we don't need to stop it.
- * Even if there's some crazy circumstance where there is a
- * sweeper but init_count is still zero, that just means we'll
- * leave it running. Not perfect, but far better than any
- * known alternative.
+ * If init_count is already zero (as e.g. if somebody called this
+ * before mem_pools_init) then the sweeper was probably never even
+ * started so we don't need to stop it. Even if there's some crazy
+ * circumstance where there is a sweeper but init_count is still
+ * zero, that just means we'll leave it running. Not perfect, but
+ * far better than any known alternative.
*/
break;
case 1: {
- per_thread_pool_list_t *pool_list;
- per_thread_pool_list_t *next_pl;
- unsigned int i;
-
- /* if only mem_pools_init_early() was called, sweeper_tid will
- * be invalid and the functions will error out. That is not
- * critical. In all other cases, the sweeper_tid will be valid
- * and the thread gets stopped. */
+ /* if mem_pools_init() was not called, sweeper_tid will be invalid
+ * and the functions will error out. That is not critical. In all
+ * other cases, the sweeper_tid will be valid and the thread gets
+ * stopped. */
(void)pthread_cancel(sweeper_tid);
(void)pthread_join(sweeper_tid, NULL);
- /* Need to clean the pool_key to prevent further usage of the
- * per_thread_pool_list_t structure that is stored for each
- * thread.
- * This also prevents calling pool_destructor() when a thread
- * exits, so there is no chance on a use-after-free of the
- * per_thread_pool_list_t structure. */
- (void)pthread_key_delete(pool_key);
-
- /* free all objects from all pools */
- list_for_each_entry_safe(pool_list, next_pl, &pool_threads,
- thr_list)
- {
- for (i = 0; i < NPOOLS; ++i) {
- free_obj_list(pool_list->pools[i].hot_list);
- free_obj_list(pool_list->pools[i].cold_list);
- pool_list->pools[i].hot_list = NULL;
- pool_list->pools[i].cold_list = NULL;
- }
-
- list_del(&pool_list->thr_list);
- FREE(pool_list);
- }
-
- list_for_each_entry_safe(pool_list, next_pl, &pool_free_threads,
- thr_list)
- {
- list_del(&pool_list->thr_list);
- FREE(pool_list);
- }
+ /* There could be threads still running in some cases, so we can't
+ * destroy pool_lists that are in use. Nor can we destroy unused
+ * pool_lists, because some allocated objects may still be pointing
+ * to them. */
+ mem_pool_thread_destructor(NULL);
init_done = GF_MEMPOOL_INIT_DESTROY;
/* Fall through. */
@@ -677,26 +640,36 @@ mem_pools_fini(void)
pthread_mutex_unlock(&init_mutex);
}
-#else
-void
-mem_pools_init_early(void)
-{
-}
void
-mem_pools_init_late(void)
-{
-}
-void
-mem_pools_fini(void)
+mem_pool_destroy(struct mem_pool *pool)
{
+ if (!pool)
+ return;
+
+ /* remove this pool from the owner (glusterfs_ctx_t) */
+ LOCK(&pool->ctx->lock);
+ {
+ list_del(&pool->owner);
+ }
+ UNLOCK(&pool->ctx->lock);
+
+ /* free this pool, but keep the mem_pool_shared */
+ GF_FREE(pool);
+
+ /*
+ * Pools are now permanent, so the mem_pool->pool is kept around. All
+ * of the objects *in* the pool will eventually be freed via the
+ * pool-sweeper thread, and this way we don't have to add a lot of
+ * reference-counting complexity.
+ */
}
-#endif
struct mem_pool *
mem_pool_new_fn(glusterfs_ctx_t *ctx, unsigned long sizeof_type,
unsigned long count, char *name)
{
- unsigned int i;
+ unsigned long extra_size, size;
+ unsigned int power;
struct mem_pool *new = NULL;
struct mem_pool_shared *pool = NULL;
@@ -706,20 +679,33 @@ mem_pool_new_fn(glusterfs_ctx_t *ctx, unsigned long sizeof_type,
return NULL;
}
- for (i = 0; i < NPOOLS; ++i) {
- if (sizeof_type <= AVAILABLE_SIZE(pools[i].power_of_two)) {
- pool = &pools[i];
- break;
- }
- }
-
- if (!pool) {
+ /* This is the per-block overhead we'll incur because of memory
+ * accounting. */
+ extra_size = sizeof(pooled_obj_hdr_t);
+
+ /* We need to compute the total space needed to hold the data type and
+ * the header. Given that the smallest block size we have in the pools
+ * is 2^POOL_SMALLEST, we need to take the MAX(size, 2^POOL_SMALLEST).
+ * However, since this value is only needed to compute its rounded
+ * logarithm in base 2, which only depends on the highest bit set,
+ * we can simply do a bitwise OR with (2^POOL_SMALLEST - 1). We also
+ * subtract 1 from the total so that sizes that are exactly a power
+ * of 2 map to their own pool. */
+ size = (sizeof_type + extra_size - 1UL) | ((1UL << POOL_SMALLEST) - 1UL);
+
+ /* We compute the base-2 logarithm of the resulting size, rounded up.
+ * This value identifies which of the power-of-2 pools we need to use.
+ * It is equivalent to finding the position of the highest bit set in
+ * the size. */
+ power = sizeof(size) * 8 - __builtin_clzl(size);
+ if (power > POOL_LARGEST) {
gf_msg_callingfn("mem-pool", GF_LOG_ERROR, EINVAL, LG_MSG_INVALID_ARG,
"invalid argument");
return NULL;
}
+ pool = &pools[power - POOL_SMALLEST];
- new = GF_CALLOC(sizeof(struct mem_pool), 1, gf_common_mt_mem_pool);
+ new = GF_MALLOC(sizeof(struct mem_pool), gf_common_mt_mem_pool);
if (!new)
return NULL;
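
The bit trick above can be checked in isolation. A standalone sketch (GCC/Clang builtins; POOL_SMALLEST == 7 matches the old define, and the 32-byte header size is an assumption for illustration):

#include <stdio.h>

/* Mirror of the pool-selection logic, assuming POOL_SMALLEST == 7. */
static unsigned int
pool_power(unsigned long sizeof_type, unsigned long extra_size)
{
    unsigned long size;

    /* OR with (2^7 - 1) enforces the 128-byte minimum; the -1 makes exact
     * powers of two land in their own pool instead of the next one up. */
    size = (sizeof_type + extra_size - 1UL) | ((1UL << 7) - 1UL);

    /* Position of the highest set bit == rounded-up base-2 logarithm. */
    return sizeof(size) * 8 - __builtin_clzl(size);
}

int
main(void)
{
    printf("%u\n", pool_power(100, 32)); /* 132 bytes -> 256 -> power 8 */
    printf("%u\n", pool_power(96, 32));  /* 128 bytes exactly -> power 7 */
    return 0;
}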
@@ -727,8 +713,13 @@ mem_pool_new_fn(glusterfs_ctx_t *ctx, unsigned long sizeof_type,
new->sizeof_type = sizeof_type;
new->count = count;
new->name = name;
+ new->xl_name = THIS->name;
new->pool = pool;
GF_ATOMIC_INIT(new->active, 0);
+#ifdef DEBUG
+ GF_ATOMIC_INIT(new->hit, 0);
+ GF_ATOMIC_INIT(new->miss, 0);
+#endif
INIT_LIST_HEAD(&new->owner);
LOCK(&ctx->lock);
@@ -740,36 +731,13 @@ mem_pool_new_fn(glusterfs_ctx_t *ctx, unsigned long sizeof_type,
return new;
}
-void *
-mem_get0(struct mem_pool *mem_pool)
-{
- void *ptr = NULL;
-
- if (!mem_pool) {
- gf_msg_callingfn("mem-pool", GF_LOG_ERROR, EINVAL, LG_MSG_INVALID_ARG,
- "invalid argument");
- return NULL;
- }
-
- ptr = mem_get(mem_pool);
- if (ptr) {
-#if defined(GF_DISABLE_MEMPOOL)
- memset(ptr, 0, mem_pool->sizeof_type);
-#else
- memset(ptr, 0, AVAILABLE_SIZE(mem_pool->pool->power_of_two));
-#endif
- }
-
- return ptr;
-}
-
per_thread_pool_list_t *
mem_get_pool_list(void)
{
per_thread_pool_list_t *pool_list;
unsigned int i;
- pool_list = pthread_getspecific(pool_key);
+ pool_list = thread_pool_list;
if (pool_list) {
return pool_list;
}
@@ -797,21 +765,33 @@ mem_get_pool_list(void)
}
}
+ /* There's no need to take pool_list->lock, because this is already an
+ * atomic operation and we don't need to synchronize it with any change
+ * in hot/cold lists. */
+ pool_list->poison = false;
+
(void)pthread_mutex_lock(&pool_lock);
- pool_list->poison = 0;
list_add(&pool_list->thr_list, &pool_threads);
(void)pthread_mutex_unlock(&pool_lock);
- (void)pthread_setspecific(pool_key, pool_list);
+ thread_pool_list = pool_list;
+
+ /* Ensure that all memory objects associated with the new pool_list are
+ * destroyed when the thread terminates. */
+ gf_thread_needs_cleanup();
+
return pool_list;
}
-pooled_obj_hdr_t *
+static pooled_obj_hdr_t *
mem_get_from_pool(struct mem_pool *mem_pool)
{
per_thread_pool_list_t *pool_list;
per_thread_pool_t *pt_pool;
pooled_obj_hdr_t *retval;
+#ifdef DEBUG
+ gf_boolean_t hit = _gf_true;
+#endif
pool_list = mem_get_pool_list();
if (!pool_list || pool_list->poison) {
@@ -837,34 +817,58 @@ mem_get_from_pool(struct mem_pool *mem_pool)
(void)pthread_spin_unlock(&pool_list->lock);
GF_ATOMIC_INC(pt_pool->parent->allocs_stdc);
retval = malloc(1 << pt_pool->parent->power_of_two);
+#ifdef DEBUG
+ hit = _gf_false;
+#endif
}
}
if (retval != NULL) {
- retval->magic = GF_MEM_HEADER_MAGIC;
retval->pool = mem_pool;
- retval->pool_list = pool_list;
retval->power_of_two = mem_pool->pool->power_of_two;
+#ifdef DEBUG
+ if (hit == _gf_true)
+ GF_ATOMIC_INC(mem_pool->hit);
+ else
+ GF_ATOMIC_INC(mem_pool->miss);
+#endif
+ retval->magic = GF_MEM_HEADER_MAGIC;
+ retval->pool_list = pool_list;
}
return retval;
}
+#endif /* GF_DISABLE_MEMPOOL */
+
void *
-mem_get(struct mem_pool *mem_pool)
+mem_get0(struct mem_pool *mem_pool)
{
+ void *ptr = mem_get(mem_pool);
+ if (ptr) {
#if defined(GF_DISABLE_MEMPOOL)
- return GF_MALLOC(mem_pool->sizeof_type, gf_common_mt_mem_pool);
+ memset(ptr, 0, mem_pool->sizeof_type);
#else
- pooled_obj_hdr_t *retval;
+ memset(ptr, 0, AVAILABLE_SIZE(mem_pool->pool->power_of_two));
+#endif
+ }
+ return ptr;
+}
+
+void *
+mem_get(struct mem_pool *mem_pool)
+{
if (!mem_pool) {
gf_msg_callingfn("mem-pool", GF_LOG_ERROR, EINVAL, LG_MSG_INVALID_ARG,
"invalid argument");
return NULL;
}
- retval = mem_get_from_pool(mem_pool);
+#if defined(GF_DISABLE_MEMPOOL)
+ return GF_MALLOC(mem_pool->sizeof_type, gf_common_mt_mem_pool);
+#else
+ pooled_obj_hdr_t *retval = mem_get_from_pool(mem_pool);
if (!retval) {
return NULL;
}
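
End to end, the pool API touched by this patch can be exercised as in the sketch below (illustrative only; my_obj_t and the count of 64 are arbitrary choices, not from the patch):

typedef struct {
    int id;
    char name[48];
} my_obj_t;

static void
pool_example(glusterfs_ctx_t *ctx)
{
    struct mem_pool *pool;
    my_obj_t *obj;

    pool = mem_pool_new_fn(ctx, sizeof(my_obj_t), 64, "my_obj_t");
    if (!pool)
        return;

    obj = mem_get0(pool); /* zero-filled object from the per-thread pool */
    if (obj) {
        obj->id = 1;
        mem_put(obj); /* returns the object to its pool for reuse */
    }

    mem_pool_destroy(pool); /* frees the descriptor; shared pools persist */
}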
@@ -896,10 +900,19 @@ mem_put(void *ptr)
/* Not one of ours; don't touch it. */
return;
}
+
+ if (!hdr->pool_list) {
+ gf_msg_callingfn("mem-pool", GF_LOG_CRITICAL, EINVAL,
+ LG_MSG_INVALID_ARG,
+ "invalid argument hdr->pool_list NULL");
+ return;
+ }
+
pool_list = hdr->pool_list;
pt_pool = &pool_list->pools[hdr->power_of_two - POOL_SMALLEST];
- GF_ATOMIC_DEC(hdr->pool->active);
+ if (hdr->pool)
+ GF_ATOMIC_DEC(hdr->pool->active);
hdr->magic = GF_MEM_INVALID_MAGIC;
@@ -917,27 +930,3 @@ mem_put(void *ptr)
}
#endif /* GF_DISABLE_MEMPOOL */
}
-
-void
-mem_pool_destroy(struct mem_pool *pool)
-{
- if (!pool)
- return;
-
- /* remove this pool from the owner (glusterfs_ctx_t) */
- LOCK(&pool->ctx->lock);
- {
- list_del(&pool->owner);
- }
- UNLOCK(&pool->ctx->lock);
-
- /* free this pool, but keep the mem_pool_shared */
- GF_FREE(pool);
-
- /*
- * Pools are now permanent, so the mem_pool->pool is kept around. All
- * of the objects *in* the pool will eventually be freed via the
- * pool-sweeper thread, and this way we don't have to add a lot of
- * reference-counting complexity.
- */
-}