/*
  Copyright (c) 2008-2012 Red Hat, Inc.
  This file is part of GlusterFS.

  This file is licensed to you under your choice of the GNU Lesser
  General Public License, version 3 or any later version (LGPLv3 or
  later), or the GNU General Public License, version 2 (GPLv2), in all
  cases as published by the Free Software Foundation.
*/

#include "mem-pool.h"
#include "logging.h"
#include "xlator.h"
#include <stdlib.h>
#include <stdarg.h>

#define GF_MEM_POOL_LIST_BOUNDARY        (sizeof(struct list_head))
#define GF_MEM_POOL_PTR                  (sizeof(struct mem_pool*))
#define GF_MEM_POOL_PAD_BOUNDARY         (GF_MEM_POOL_LIST_BOUNDARY + GF_MEM_POOL_PTR + sizeof(int))
#define mem_pool_chunkhead2ptr(head)     ((head) + GF_MEM_POOL_PAD_BOUNDARY)
#define mem_pool_ptr2chunkhead(ptr)      ((ptr) - GF_MEM_POOL_PAD_BOUNDARY)
#define is_mem_chunk_in_use(ptr)         (*ptr == 1)
#define mem_pool_from_ptr(ptr)           ((ptr) + GF_MEM_POOL_LIST_BOUNDARY)

#define GLUSTERFS_ENV_MEM_ACCT_STR  "GLUSTERFS_DISABLE_MEM_ACCT"

#include "unittest/unittest.h"
#include "libglusterfs-messages.h"

void
gf_mem_acct_enable_set (void *data)
{
        glusterfs_ctx_t *ctx = NULL;

        REQUIRE(data != NULL);

        ctx = data;

        GF_ASSERT (ctx != NULL);

        ctx->mem_acct_enable = 1;

        ENSURE(1 == ctx->mem_acct_enable);

        return;
}

int
gf_mem_set_acct_info (xlator_t *xl, char **alloc_ptr, size_t size,
                      uint32_t type, const char *typestr)
{
        void              *ptr    = NULL;
        struct mem_header *header = NULL;

        if (!alloc_ptr)
                return -1;

        ptr = *alloc_ptr;

        GF_ASSERT (xl != NULL);
        GF_ASSERT (xl->mem_acct != NULL);
        GF_ASSERT (type <= xl->mem_acct->num_types);

        LOCK(&xl->mem_acct->rec[type].lock);
        {
                if (!xl->mem_acct->rec[type].typestr)
                        xl->mem_acct->rec[type].typestr = typestr;
                xl->mem_acct->rec[type].size += size;
                xl->mem_acct->rec[type].num_allocs++;
                xl->mem_acct->rec[type].total_allocs++;
                xl->mem_acct->rec[type].max_size =
                        max (xl->mem_acct->rec[type].max_size,
                             xl->mem_acct->rec[type].size);
                xl->mem_acct->rec[type].max_num_allocs =
                        max (xl->mem_acct->rec[type].max_num_allocs,
                             xl->mem_acct->rec[type].num_allocs);
        }
        UNLOCK(&xl->mem_acct->rec[type].lock);

        GF_ATOMIC_INC (xl->mem_acct->refcnt);

        header = (struct mem_header *) ptr;
        header->type = type;
        header->size = size;
        header->mem_acct = xl->mem_acct;
        header->magic = GF_MEM_HEADER_MAGIC;

        ptr += sizeof (struct mem_header);
        /* data follows in this gap of 'size' bytes */
        *(uint32_t *) (ptr + size) = GF_MEM_TRAILER_MAGIC;

        *alloc_ptr = ptr;
        return 0;
}

void *
__gf_calloc (size_t nmemb, size_t size, uint32_t type, const char *typestr)
{
        size_t          tot_size = 0;
        size_t          req_size = 0;
        char           *ptr = NULL;
        xlator_t       *xl = NULL;

        if (!THIS->ctx->mem_acct_enable)
                return CALLOC (nmemb, size);

        xl = THIS;

        req_size = nmemb * size;
        tot_size = req_size + GF_MEM_HEADER_SIZE + GF_MEM_TRAILER_SIZE;

        ptr = calloc (1, tot_size);
        if (!ptr) {
                gf_msg_nomem ("", GF_LOG_ALERT, tot_size);
                return NULL;
        }
        gf_mem_set_acct_info (xl, &ptr, req_size, type, typestr);

        return (void *)ptr;
}

void *
__gf_malloc (size_t size, uint32_t type, const char *typestr)
{
        size_t          tot_size = 0;
        char           *ptr = NULL;
        xlator_t       *xl = NULL;

        if (!THIS->ctx->mem_acct_enable)
                return MALLOC (size);

        xl = THIS;

        tot_size = size + GF_MEM_HEADER_SIZE + GF_MEM_TRAILER_SIZE;

        ptr = malloc (tot_size);
        if (!ptr) {
                gf_msg_nomem ("", GF_LOG_ALERT, tot_size);
                return NULL;
        }
        gf_mem_set_acct_info (xl, &ptr, size, type, typestr);

        return (void *)ptr;
}
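/*
 * Accounted allocation layout, as written by gf_mem_set_acct_info() above
 * (an illustrative sketch; the exact byte counts come from
 * GF_MEM_HEADER_SIZE and GF_MEM_TRAILER_SIZE in mem-pool.h):
 *
 *   +-------------------+------------------------+------------------------+
 *   | struct mem_header | 'size' bytes of data   | uint32_t trailer magic |
 *   +-------------------+------------------------+------------------------+
 *   ^ raw calloc/malloc pointer
 *                       ^ pointer returned to the caller
 *
 * __gf_free() walks back GF_MEM_HEADER_SIZE bytes to find the header and
 * checks GF_MEM_TRAILER_MAGIC after the data to detect buffer overruns.
 */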
void *
__gf_realloc (void *ptr, size_t size)
{
        size_t             tot_size = 0;
        char              *new_ptr;
        struct mem_header *old_header = NULL;
        struct mem_header *new_header = NULL;
        struct mem_header  tmp_header;

        if (!THIS->ctx->mem_acct_enable)
                return REALLOC (ptr, size);

        REQUIRE(NULL != ptr);

        old_header = (struct mem_header *) (ptr - GF_MEM_HEADER_SIZE);
        GF_ASSERT (old_header->magic == GF_MEM_HEADER_MAGIC);
        tmp_header = *old_header;

        tot_size = size + GF_MEM_HEADER_SIZE + GF_MEM_TRAILER_SIZE;
        new_ptr = realloc (old_header, tot_size);
        if (!new_ptr) {
                gf_msg_nomem ("", GF_LOG_ALERT, tot_size);
                return NULL;
        }

        /*
         * We used to pass (char **)&ptr as the second
         * argument after the value of realloc was saved
         * in ptr, but the compiler warnings complained
         * about the casting to and forth from void ** to
         * char **.
         * TBD: it would be nice to adjust the memory accounting info here,
         * but calling gf_mem_set_acct_info here is wrong because it bumps
         * up counts as though this is a new allocation - which it's not.
         * The consequence of doing nothing here is only that the sizes will
         * be wrong, but at least the counts won't be.
        uint32_t   type = 0;
        xlator_t  *xl = NULL;
        type = header->type;
        xl = (xlator_t *) header->xlator;
        gf_mem_set_acct_info (xl, &new_ptr, size, type, NULL);
         */

        new_header = (struct mem_header *) new_ptr;
        *new_header = tmp_header;
        new_header->size = size;

        new_ptr += sizeof (struct mem_header);
        /* data follows in this gap of 'size' bytes */
        *(uint32_t *) (new_ptr + size) = GF_MEM_TRAILER_MAGIC;

        return (void *)new_ptr;
}

int
gf_vasprintf (char **string_ptr, const char *format, va_list arg)
{
        va_list  arg_save;
        char    *str = NULL;
        int      size = 0;
        int      rv = 0;

        if (!string_ptr || !format)
                return -1;

        va_copy (arg_save, arg);

        size = vsnprintf (NULL, 0, format, arg);
        size++;
        str = GF_MALLOC (size, gf_common_mt_asprintf);
        if (str == NULL) {
                /* log is done in GF_MALLOC itself */
                va_end (arg_save);
                return -1;
        }
        rv = vsnprintf (str, size, format, arg_save);

        *string_ptr = str;
        va_end (arg_save);
        return (rv);
}
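/*
 * Typical caller-side use of gf_asprintf()/gf_vasprintf() (a minimal,
 * illustrative sketch; 'brick_path' and 'port' are hypothetical caller
 * variables).  The returned string comes from GF_MALLOC(), so it must be
 * released with GF_FREE():
 *
 *     char *msg = NULL;
 *
 *     if (gf_asprintf (&msg, "brick %s listening on port %d",
 *                      brick_path, port) < 0)
 *             return -1;
 *     ...
 *     GF_FREE (msg);
 */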
int
gf_asprintf (char **string_ptr, const char *format, ...)
{
        va_list  arg;
        int      rv = 0;

        va_start (arg, format);
        rv = gf_vasprintf (string_ptr, format, arg);
        va_end (arg);

        return rv;
}

#ifdef DEBUG
void
__gf_mem_invalidate (void *ptr)
{
        struct mem_header *header = ptr;
        void              *end    = NULL;

        struct mem_invalid inval = {
                .magic = GF_MEM_INVALID_MAGIC,
                .mem_acct = header->mem_acct,
                .type = header->type,
                .size = header->size,
                .baseaddr = ptr + GF_MEM_HEADER_SIZE,
        };

        /* calculate the last byte of the allocated area */
        end = ptr + GF_MEM_HEADER_SIZE + inval.size + GF_MEM_TRAILER_SIZE;

        /* overwrite the old mem_header */
        memcpy (ptr, &inval, sizeof (inval));
        ptr += sizeof (inval);

        /* zero out remaining (old) mem_header bytes */
        memset (ptr, 0x00, sizeof (*header) - sizeof (inval));
        ptr += sizeof (*header) - sizeof (inval);

        /* zero out the first byte of data */
        *(uint32_t *)(ptr) = 0x00;
        ptr += 1;

        /* repeated writes of invalid structure in data area */
        while ((ptr + (sizeof (inval))) < (end - 1)) {
                memcpy (ptr, &inval, sizeof (inval));
                ptr += sizeof (inval);
        }

        /* fill out remaining data area with 0xff */
        memset (ptr, 0xff, end - ptr);
}
#endif /* DEBUG */

void
__gf_free (void *free_ptr)
{
        void              *ptr    = NULL;
        struct mem_acct   *mem_acct;
        struct mem_header *header = NULL;

        if (!THIS->ctx->mem_acct_enable) {
                FREE (free_ptr);
                return;
        }

        if (!free_ptr)
                return;

        ptr = free_ptr - GF_MEM_HEADER_SIZE;
        header = (struct mem_header *) ptr;

        // Possible corruption, assert here
        GF_ASSERT (GF_MEM_HEADER_MAGIC == header->magic);

        mem_acct = header->mem_acct;
        if (!mem_acct) {
                goto free;
        }

        // This points to a memory overrun
        GF_ASSERT (GF_MEM_TRAILER_MAGIC ==
                   *(uint32_t *)((char *)free_ptr + header->size));

        LOCK (&mem_acct->rec[header->type].lock);
        {
                mem_acct->rec[header->type].size -= header->size;
                mem_acct->rec[header->type].num_allocs--;
                /* If all the instances are freed up then ensure typestr is
                 * set to NULL */
                if (!mem_acct->rec[header->type].num_allocs)
                        mem_acct->rec[header->type].typestr = NULL;
        }
        UNLOCK (&mem_acct->rec[header->type].lock);

        if (GF_ATOMIC_DEC (mem_acct->refcnt) == 0) {
                FREE (mem_acct);
        }

free:
#ifdef DEBUG
        __gf_mem_invalidate (ptr);
#endif

        FREE (ptr);
}

/*
 * Based on the mem-type that is used for the allocation, GF_FREE can be
 * called, or something more intelligent for the structure can be done.
 *
 * NOTE: this will not work for allocations from a memory pool. It never did,
 * because those allocations never set the type in the first place. Any
 * caller that relies on knowing whether a particular type was allocated via
 * a pool or not is *BROKEN*, or will be any time either this module or the
 * module "owning" the type changes. The proper way to handle this, assuming
 * the caller is not smart enough to call a type-specific free function
 * themselves, would be to create a callback interface where destructors for
 * specific types can be registered so that code *here* (GF_FREE, mem_put,
 * etc.) can do the right thing. That allows type-specific behavior without
 * creating the kind of fragile coupling that we have now.
 */
int
gf_get_mem_type (void *ptr)
{
        struct mem_header *header = NULL;

        if (!ptr || !THIS->ctx->mem_acct_enable)
                return 0;

        header = (struct mem_header *) (ptr - GF_MEM_HEADER_SIZE);

        /* Possible corruption, assert here */
        GF_ASSERT (GF_MEM_HEADER_MAGIC == header->magic);

        return header->type;
}
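/*
 * One possible shape for the destructor-registration interface suggested in
 * the comment above (purely a sketch of a hypothetical API, nothing like
 * this exists today):
 *
 *     typedef void (*gf_mem_dtor_t) (void *obj);
 *
 *     int gf_mem_register_dtor (uint32_t type, gf_mem_dtor_t dtor);
 *
 * GF_FREE()/mem_put() could then look up the destructor registered for
 * gf_get_mem_type (ptr) and invoke it before releasing the memory, so that
 * type-specific cleanup lives with the module that owns the type.
 */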
#define POOL_SMALLEST   7               /* i.e. 128 */
#define POOL_LARGEST    20              /* i.e. 1048576 */
#define NPOOLS          (POOL_LARGEST - POOL_SMALLEST + 1)
#define N_COLD_LISTS    1024
#define POOL_SWEEP_SECS 30

static pthread_key_t            pool_key;
static pthread_mutex_t          pool_lock       = PTHREAD_MUTEX_INITIALIZER;
static struct list_head         pool_threads;
static pthread_mutex_t          pool_free_lock  = PTHREAD_MUTEX_INITIALIZER;
static struct list_head         pool_free_threads;
static struct mem_pool          pools[NPOOLS];
static size_t                   pool_list_size;

static unsigned long            sweep_times;
static unsigned long            sweep_usecs;
static unsigned long            frees_to_system;

typedef struct {
        struct list_head        death_row;
        pooled_obj_hdr_t       *cold_lists[N_COLD_LISTS];
        unsigned int            n_cold_lists;
} sweep_state_t;

void
collect_garbage (sweep_state_t *state, per_thread_pool_list_t *pool_list)
{
        unsigned int       i;
        per_thread_pool_t *pt_pool;

        if (pool_list->poison) {
                list_del (&pool_list->thr_list);
                list_add (&pool_list->thr_list, &state->death_row);
                return;
        }

        if (state->n_cold_lists >= N_COLD_LISTS) {
                return;
        }

        (void) pthread_spin_lock (&pool_list->lock);
        for (i = 0; i < NPOOLS; ++i) {
                pt_pool = &pool_list->pools[i];
                if (pt_pool->cold_list) {
                        state->cold_lists[state->n_cold_lists++]
                                = pt_pool->cold_list;
                }
                pt_pool->cold_list = pt_pool->hot_list;
                pt_pool->hot_list = NULL;
                if (state->n_cold_lists >= N_COLD_LISTS) {
                        /* We'll just catch up on a future pass. */
                        break;
                }
        }
        (void) pthread_spin_unlock (&pool_list->lock);
}

void
free_obj_list (pooled_obj_hdr_t *victim)
{
        pooled_obj_hdr_t *next;

        while (victim) {
                next = victim->next;
                free (victim);
                victim = next;
                ++frees_to_system;
        }
}

void *
pool_sweeper (void *arg)
{
        sweep_state_t            state;
        per_thread_pool_list_t  *pool_list;
        per_thread_pool_list_t  *next_pl;
        per_thread_pool_t       *pt_pool;
        unsigned int             i;
        struct timeval           begin_time;
        struct timeval           end_time;
        struct timeval           elapsed;

        /*
         * This is all a bit inelegant, but the point is to avoid doing
         * expensive things (like freeing thousands of objects) while holding
         * a global lock.  Thus, we split each iteration into three passes,
         * with only the first and fastest holding the lock.
         */

        for (;;) {
                sleep (POOL_SWEEP_SECS);
                INIT_LIST_HEAD (&state.death_row);
                state.n_cold_lists = 0;

                /* First pass: collect stuff that needs our attention. */
                (void) gettimeofday (&begin_time, NULL);
                (void) pthread_mutex_lock (&pool_lock);
                list_for_each_entry_safe (pool_list, next_pl,
                                          &pool_threads, thr_list) {
                        collect_garbage (&state, pool_list);
                }
                (void) pthread_mutex_unlock (&pool_lock);
                (void) gettimeofday (&end_time, NULL);
                timersub (&end_time, &begin_time, &elapsed);
                sweep_usecs += elapsed.tv_sec * 1000000 + elapsed.tv_usec;
                sweep_times += 1;

                /* Second pass: free dead pools. */
                (void) pthread_mutex_lock (&pool_free_lock);
                list_for_each_entry_safe (pool_list, next_pl,
                                          &state.death_row, thr_list) {
                        for (i = 0; i < NPOOLS; ++i) {
                                pt_pool = &pool_list->pools[i];
                                free_obj_list (pt_pool->cold_list);
                                free_obj_list (pt_pool->hot_list);
                                pt_pool->hot_list = pt_pool->cold_list = NULL;
                        }
                        list_del (&pool_list->thr_list);
                        list_add (&pool_list->thr_list, &pool_free_threads);
                }
                (void) pthread_mutex_unlock (&pool_free_lock);

                /* Third pass: free cold objects from live pools. */
                for (i = 0; i < state.n_cold_lists; ++i) {
                        free_obj_list (state.cold_lists[i]);
                }
        }
}

void
pool_destructor (void *arg)
{
        per_thread_pool_list_t *pool_list = arg;

        /* The pool-sweeper thread will take it from here. */
        pool_list->poison = 1;
}

static __attribute__((constructor)) void
mem_pools_preinit (void)
{
#if !defined(GF_DISABLE_MEMPOOL)
        unsigned int i;

        /* Use a pthread_key destructor to clean up when a thread exits. */
        if (pthread_key_create (&pool_key, pool_destructor) != 0) {
                gf_log ("mem-pool", GF_LOG_CRITICAL,
                        "failed to initialize mem-pool key");
        }

        INIT_LIST_HEAD (&pool_threads);
        INIT_LIST_HEAD (&pool_free_threads);

        for (i = 0; i < NPOOLS; ++i) {
                pools[i].power_of_two = POOL_SMALLEST + i;

                GF_ATOMIC_INIT (pools[i].allocs_hot, 0);
                GF_ATOMIC_INIT (pools[i].allocs_cold, 0);
                GF_ATOMIC_INIT (pools[i].allocs_stdc, 0);
                GF_ATOMIC_INIT (pools[i].frees_to_list, 0);
        }

        pool_list_size = sizeof (per_thread_pool_list_t)
                       + sizeof (per_thread_pool_t) * (NPOOLS - 1);
#endif
}
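/*
 * Size-class mapping set up above (illustrative): pools[i].power_of_two is
 * POOL_SMALLEST + i, so pools[0] serves 1 << 7 = 128-byte chunks, pools[1]
 * 256 bytes, and so on up to pools[NPOOLS - 1] at 1 << 20 = 1048576 bytes.
 * mem_pool_new_fn() below picks the smallest class whose usable payload
 * (AVAILABLE_SIZE(), i.e. the chunk minus the pooled_obj_hdr_t overhead,
 * see mem-pool.h) can hold the requested type; anything larger fails.
 */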
void
mem_pools_init (void)
{
        pthread_t kid;

        (void) pthread_create (&kid, NULL, pool_sweeper, NULL);
        (void) pthread_detach (kid);
}

struct mem_pool *
mem_pool_new_fn (unsigned long sizeof_type,
                 unsigned long count, char *name)
{
        unsigned int i;

        if (!sizeof_type) {
                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
                                  LG_MSG_INVALID_ARG, "invalid argument");
                return NULL;
        }

        for (i = 0; i < NPOOLS; ++i) {
                if (sizeof_type <= AVAILABLE_SIZE(pools[i].power_of_two)) {
                        return &pools[i];
                }
        }

        gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
                          LG_MSG_INVALID_ARG, "invalid argument");
        return NULL;
}

void *
mem_get0 (struct mem_pool *mem_pool)
{
        void *ptr = NULL;

        if (!mem_pool) {
                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
                                  LG_MSG_INVALID_ARG, "invalid argument");
                return NULL;
        }

        ptr = mem_get(mem_pool);
        if (ptr) {
                memset (ptr, 0, AVAILABLE_SIZE(mem_pool->power_of_two));
        }

        return ptr;
}

per_thread_pool_list_t *
mem_get_pool_list (void)
{
        per_thread_pool_list_t *pool_list;
        unsigned int            i;

        pool_list = pthread_getspecific (pool_key);
        if (pool_list) {
                return pool_list;
        }

        (void) pthread_mutex_lock (&pool_free_lock);
        if (!list_empty (&pool_free_threads)) {
                pool_list = list_entry (pool_free_threads.next,
                                        per_thread_pool_list_t, thr_list);
                list_del (&pool_list->thr_list);
        }
        (void) pthread_mutex_unlock (&pool_free_lock);

        if (!pool_list) {
                pool_list = GF_CALLOC (pool_list_size, 1,
                                       gf_common_mt_mem_pool);
                if (!pool_list) {
                        return NULL;
                }

                INIT_LIST_HEAD (&pool_list->thr_list);
                (void) pthread_spin_init (&pool_list->lock,
                                          PTHREAD_PROCESS_PRIVATE);
                for (i = 0; i < NPOOLS; ++i) {
                        pool_list->pools[i].parent = &pools[i];
                        pool_list->pools[i].hot_list = NULL;
                        pool_list->pools[i].cold_list = NULL;
                }
        }

        (void) pthread_mutex_lock (&pool_lock);
        pool_list->poison = 0;
        list_add (&pool_list->thr_list, &pool_threads);
        (void) pthread_mutex_unlock (&pool_lock);

        (void) pthread_setspecific (pool_key, pool_list);
        return pool_list;
}

pooled_obj_hdr_t *
mem_get_from_pool (per_thread_pool_t *pt_pool)
{
        pooled_obj_hdr_t *retval;

        retval = pt_pool->hot_list;
        if (retval) {
                GF_ATOMIC_INC (pt_pool->parent->allocs_hot);
                pt_pool->hot_list = retval->next;
                return retval;
        }

        retval = pt_pool->cold_list;
        if (retval) {
                GF_ATOMIC_INC (pt_pool->parent->allocs_cold);
                pt_pool->cold_list = retval->next;
                return retval;
        }

        GF_ATOMIC_INC (pt_pool->parent->allocs_stdc);
        return malloc (1 << pt_pool->parent->power_of_two);
}

void *
mem_get (struct mem_pool *mem_pool)
{
#if defined(GF_DISABLE_MEMPOOL)
        return GF_CALLOC (1, mem_pool->real_sizeof_type,
                          gf_common_mt_mem_pool);
#else
        per_thread_pool_list_t *pool_list;
        per_thread_pool_t      *pt_pool;
        pooled_obj_hdr_t       *retval;

        if (!mem_pool) {
                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
                                  LG_MSG_INVALID_ARG, "invalid argument");
                return NULL;
        }

        pool_list = mem_get_pool_list ();
        if (!pool_list || pool_list->poison) {
                return NULL;
        }

        (void) pthread_spin_lock (&pool_list->lock);
        pt_pool = &pool_list->pools[mem_pool->power_of_two - POOL_SMALLEST];
        retval = mem_get_from_pool (pt_pool);
        (void) pthread_spin_unlock (&pool_list->lock);

        if (!retval) {
                return NULL;
        }

        retval->magic = GF_MEM_HEADER_MAGIC;
        retval->next = NULL;
        retval->pool_list = pool_list;
        retval->power_of_two = mem_pool->power_of_two;

        return retval + 1;
#endif /* GF_DISABLE_MEMPOOL */
}
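/*
 * Pooled object layout (illustrative): mem_get() returns the address just
 * past the pooled_obj_hdr_t, and mem_put() recovers the header again with
 * ((pooled_obj_hdr_t *) ptr) - 1:
 *
 *   +------------------+---------------------------------------------+
 *   | pooled_obj_hdr_t | caller-visible data, up to AVAILABLE_SIZE() |
 *   +------------------+---------------------------------------------+
 *   ^ chunk start       ^ pointer returned by mem_get()
 */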
void
mem_put (void *ptr)
{
#if defined(GF_DISABLE_MEMPOOL)
        GF_FREE (ptr);
#else
        pooled_obj_hdr_t       *hdr;
        per_thread_pool_list_t *pool_list;
        per_thread_pool_t      *pt_pool;

        if (!ptr) {
                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
                                  LG_MSG_INVALID_ARG, "invalid argument");
                return;
        }

        hdr = ((pooled_obj_hdr_t *)ptr) - 1;
        if (hdr->magic != GF_MEM_HEADER_MAGIC) {
                /* Not one of ours; don't touch it. */
                return;
        }
        pool_list = hdr->pool_list;
        pt_pool = &pool_list->pools[hdr->power_of_two - POOL_SMALLEST];

        (void) pthread_spin_lock (&pool_list->lock);
        hdr->magic = GF_MEM_INVALID_MAGIC;
        hdr->next = pt_pool->hot_list;
        pt_pool->hot_list = hdr;
        GF_ATOMIC_INC (pt_pool->parent->frees_to_list);
        (void) pthread_spin_unlock (&pool_list->lock);
#endif /* GF_DISABLE_MEMPOOL */
}

void
mem_pool_destroy (struct mem_pool *pool)
{
        if (!pool)
                return;

        /*
         * Pools are now permanent, so this does nothing.  Yes, this means we
         * can keep allocating from a pool after calling mem_destroy on it,
         * but that's kind of OK.  All of the objects *in* the pool will
         * eventually be freed via the pool-sweeper thread, and this way we
         * don't have to add a lot of reference-counting complexity.
         */
}
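/*
 * End-to-end use of the pool API (a minimal, illustrative sketch; 'struct
 * foo' and 'foo_count' are hypothetical, and mem_pool_new() is assumed to
 * be the convenience wrapper around mem_pool_new_fn() from mem-pool.h):
 *
 *     struct mem_pool *foo_pool = mem_pool_new (struct foo, foo_count);
 *     struct foo      *obj      = mem_get0 (foo_pool);
 *
 *     if (obj) {
 *             ...
 *             mem_put (obj);
 *     }
 *     mem_pool_destroy (foo_pool);   // a no-op now, see above
 */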