diff options
-rw-r--r-- | xlators/cluster/ec/src/ec-common.c | 115 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-common.h | 1 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-helpers.c | 15 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-write.c | 211 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-mem-types.h | 1 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec-types.h | 206 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec.c | 52 | ||||
-rw-r--r-- | xlators/cluster/ec/src/ec.h | 2 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 |
9 files changed, 525 insertions, 84 deletions
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c index ea5773c9879..29ab66f374c 100644 --- a/xlators/cluster/ec/src/ec-common.c +++ b/xlators/cluster/ec/src/ec-common.c @@ -1437,6 +1437,23 @@ ec_set_inode_size(ec_fop_data_t *fop, inode_t *inode, return found; } +static void +ec_release_stripe_cache (ec_inode_t *ctx) +{ + ec_stripe_list_t *stripe_cache = NULL; + ec_stripe_t *stripe = NULL; + + stripe_cache = &ctx->stripe_cache; + while (!list_empty (&stripe_cache->lru)) { + stripe = list_first_entry (&stripe_cache->lru, ec_stripe_t, + lru); + list_del (&stripe->lru); + GF_FREE (stripe); + } + stripe_cache->count = 0; + stripe_cache->max = 0; +} + void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) { ec_inode_t *ctx; @@ -1448,6 +1465,7 @@ void ec_clear_inode_info(ec_fop_data_t *fop, inode_t *inode) goto unlock; } + ec_release_stripe_cache (ctx); ctx->have_info = _gf_false; ctx->have_config = _gf_false; ctx->have_version = _gf_false; @@ -2465,6 +2483,102 @@ ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop) return ec->other_eager_lock; } +static void +ec_update_stripe(ec_t *ec, ec_stripe_list_t *stripe_cache, ec_stripe_t *stripe, + ec_fop_data_t *fop) +{ + off_t base; + + /* On write fops, we only update existing fragments if the write has + * succeeded. Otherwise, we remove them from the cache. */ + if ((fop->id == GF_FOP_WRITE) && (fop->answer != NULL) && + (fop->answer->op_ret >= 0)) { + base = stripe->frag_offset - fop->frag_range.first; + base *= ec->fragments; + + /* We check if the stripe offset falls inside the real region + * modified by the write fop (a write request is allowed, + * though uncommon, to write less bytes than requested). The + * current write fop implementation doesn't allow partial + * writes of fragments, so if there's no error, we are sure + * that a full stripe has been completely modified or not + * touched at all. The value of op_ret may not be a multiple + * of the stripe size because it depends on the requested + * size by the user, so we update the stripe if the write has + * modified at least one byte (meaning ec has written the full + * stripe). */ + if (base < fop->answer->op_ret) { + memcpy(stripe->data, fop->vector[0].iov_base + base, + ec->stripe_size); + list_move_tail(&stripe->lru, &stripe_cache->lru); + + GF_ATOMIC_INC(ec->stats.stripe_cache.updates); + } + } else { + stripe->frag_offset = -1; + list_move (&stripe->lru, &stripe_cache->lru); + + GF_ATOMIC_INC(ec->stats.stripe_cache.invals); + } +} + +static void +ec_update_cached_stripes (ec_fop_data_t *fop) +{ + uint64_t first; + uint64_t last; + ec_stripe_t *stripe = NULL; + ec_inode_t *ctx = NULL; + ec_stripe_list_t *stripe_cache = NULL; + inode_t *inode = NULL; + struct list_head *temp; + struct list_head sentinel; + + first = fop->frag_range.first; + /* 'last' represents the first stripe not touched by the operation */ + last = fop->frag_range.last; + + /* If there are no modified stripes, we don't need to do anything + * else. */ + if (last <= first) { + return; + } + + if (!fop->use_fd) { + inode = fop->loc[0].inode; + } else { + inode = fop->fd->inode; + } + + LOCK(&inode->lock); + + ctx = __ec_inode_get (inode, fop->xl); + if (ctx == NULL) { + goto out; + } + stripe_cache = &ctx->stripe_cache; + + /* Since we'll be moving elements of the list to the tail, we might + * end in an infinite loop. To avoid it, we insert a sentinel element + * into the list, so that it will be used to detect when we have + * traversed all existing elements once. */ + list_add_tail(&sentinel, &stripe_cache->lru); + temp = stripe_cache->lru.next; + while (temp != &sentinel) { + stripe = list_entry(temp, ec_stripe_t, lru); + temp = temp->next; + if ((first <= stripe->frag_offset) && + (stripe->frag_offset < last)) { + ec_update_stripe (fop->xl->private, stripe_cache, + stripe, fop); + } + } + list_del(&sentinel); + +out: + UNLOCK(&inode->lock); +} + void ec_lock_reuse(ec_fop_data_t *fop) { ec_cbk_data_t *cbk; @@ -2491,6 +2605,7 @@ void ec_lock_reuse(ec_fop_data_t *fop) * the lock. */ release = _gf_true; } + ec_update_cached_stripes (fop); for (i = 0; i < fop->lock_count; i++) { ec_lock_next_owner(&fop->locks[i], cbk, release); diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h index 9a35391a781..35c6a8107c2 100644 --- a/xlators/cluster/ec/src/ec-common.h +++ b/xlators/cluster/ec/src/ec-common.h @@ -139,4 +139,5 @@ ec_get_heal_info (xlator_t *this, loc_t *loc, dict_t **dict); int32_t ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata); + #endif /* __EC_COMMON_H__ */ diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c index 122fe24b5d3..fe610748f0f 100644 --- a/xlators/cluster/ec/src/ec-helpers.c +++ b/xlators/cluster/ec/src/ec-helpers.c @@ -706,6 +706,17 @@ void ec_owner_copy(call_frame_t *frame, gf_lkowner_t *owner) lk_owner_copy (&frame->root->lk_owner, owner); } +static void +ec_stripe_cache_init (ec_t *ec, ec_inode_t *ctx) +{ + ec_stripe_list_t *stripe_cache = NULL; + + stripe_cache = &(ctx->stripe_cache); + if (stripe_cache->max == 0) { + stripe_cache->max = ec->stripe_cache; + } +} + ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl) { ec_inode_t * ctx = NULL; @@ -718,7 +729,7 @@ ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl) { memset(ctx, 0, sizeof(*ctx)); INIT_LIST_HEAD(&ctx->heal); - + INIT_LIST_HEAD(&ctx->stripe_cache.lru); value = (uint64_t)(uintptr_t)ctx; if (__inode_ctx_set(inode, xl, &value) != 0) { @@ -732,6 +743,8 @@ ec_inode_t * __ec_inode_get(inode_t * inode, xlator_t * xl) { ctx = (ec_inode_t *)(uintptr_t)value; } + if (ctx) + ec_stripe_cache_init (xl->private, ctx); return ctx; } diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index ae5120226e3..2f8170d1ac0 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -18,6 +18,7 @@ #include "ec-combine.h" #include "ec-method.h" #include "ec-fops.h" +#include "ec-mem-types.h" int32_t ec_update_writev_cbk (call_frame_t *frame, void *cookie, @@ -1169,12 +1170,14 @@ void ec_discard_adjust_offset_size(ec_fop_data_t *fop) * write zeros. */ fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true); + fop->frag_range.first = fop->offset; if (fop->size < fop->int32) { fop->size = 0; } else { fop->size -= fop->int32; ec_adjust_size_down(ec, &fop->size, _gf_true); } + fop->frag_range.last = fop->offset + fop->size; } int32_t ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, @@ -1436,6 +1439,8 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state) case EC_STATE_INIT: fop->user_size = fop->offset; ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true); + fop->frag_range.first = fop->offset; + fop->frag_range.last = UINT64_MAX; /* Fall through */ @@ -1696,6 +1701,78 @@ out: } /* FOP: writev */ +static ec_stripe_t * +ec_allocate_stripe (ec_t *ec, ec_stripe_list_t *stripe_cache) +{ + ec_stripe_t *stripe = NULL; + + if (stripe_cache->count >= stripe_cache->max) { + GF_ASSERT (!list_empty(&stripe_cache->lru)); + stripe = list_first_entry(&stripe_cache->lru, ec_stripe_t, lru); + list_move_tail(&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.evicts); + } else { + stripe = GF_MALLOC (sizeof (ec_stripe_t) + ec->stripe_size, + ec_mt_ec_stripe_t); + if (stripe != NULL) { + stripe_cache->count++; + list_add_tail (&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.allocs); + } else { + GF_ATOMIC_INC(ec->stats.stripe_cache.errors); + } + } + + return stripe; +} + +static void +ec_write_stripe_data (ec_t *ec, ec_fop_data_t *fop, + ec_stripe_t *stripe) +{ + off_t base; + + base = fop->size - ec->stripe_size; + memcpy(stripe->data, fop->vector[0].iov_base + base, ec->stripe_size); + stripe->frag_offset = fop->frag_range.last - ec->fragment_size; +} + +static void +ec_add_stripe_in_cache (ec_t *ec, ec_fop_data_t *fop) +{ + ec_inode_t *ctx = NULL; + ec_stripe_t *stripe = NULL; + ec_stripe_list_t *stripe_cache = NULL; + gf_boolean_t failed = _gf_true; + + LOCK(&fop->fd->inode->lock); + + ctx = __ec_inode_get (fop->fd->inode, fop->xl); + if (ctx == NULL) { + goto out; + } + + stripe_cache = &ctx->stripe_cache; + if (stripe_cache->max > 0) { + stripe = ec_allocate_stripe (ec, stripe_cache); + if (stripe == NULL) { + goto out; + } + + ec_write_stripe_data (ec, fop, stripe); + } + + failed = _gf_false; + +out: + UNLOCK(&fop->fd->inode->lock); + + if (failed) { + gf_msg (ec->xl->name, GF_LOG_DEBUG, ENOMEM, + EC_MSG_FILE_DESC_REF_FAIL, + "Failed to create and add stripe in cache"); + } +} int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie, xlator_t * this, int32_t op_ret, int32_t op_errno, @@ -1725,8 +1802,11 @@ int32_t ec_writev_merge_tail(call_frame_t * frame, void * cookie, { memset(fop->vector[0].iov_base + fop->size - size, 0, size); } - } + if (ec->stripe_cache) { + ec_add_stripe_in_cache (ec, fop); + } + } return 0; } @@ -1775,7 +1855,7 @@ ec_make_internal_fop_xdata (dict_t **xdata) dict_t *dict = NULL; if (*xdata) - return 0; + return 0; dict = dict_new(); if (!dict) @@ -1802,8 +1882,10 @@ ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop) fop->user_size = iov_length(fop->vector, fop->int32); fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false); + fop->frag_range.first = fop->offset / ec->fragments; fop->size = fop->user_size + fop->head; ec_adjust_size_up(ec, &fop->size, _gf_false); + fop->frag_range.last = fop->frag_range.first + fop->size / ec->fragments; if ((fop->int32 != 1) || (fop->head != 0) || (fop->size > fop->user_size) || @@ -1850,6 +1932,98 @@ out: return err; } +static void +ec_merge_stripe_head_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) +{ + size_t head, size; + + head = fop->head; + memcpy(fop->vector[0].iov_base, stripe->data, head); + + size = ec->stripe_size - head; + if (size > fop->user_size) { + head += fop->user_size; + size = ec->stripe_size - head; + memcpy(fop->vector[0].iov_base + head, stripe->data + head, + size); + } +} + +static void +ec_merge_stripe_tail_locked (ec_t *ec, ec_fop_data_t *fop, ec_stripe_t *stripe) +{ + size_t head, tail; + off_t offset; + + offset = fop->user_size + fop->head; + tail = fop->size - offset; + head = ec->stripe_size - tail; + + memcpy(fop->vector[0].iov_base + offset, stripe->data + head, tail); +} + +static ec_stripe_t * +ec_get_stripe_from_cache_locked (ec_t *ec, ec_fop_data_t *fop, + uint64_t frag_offset) +{ + ec_inode_t *ctx = NULL; + ec_stripe_t *stripe = NULL; + ec_stripe_list_t *stripe_cache = NULL; + + ctx = __ec_inode_get (fop->fd->inode, fop->xl); + if (ctx == NULL) { + GF_ATOMIC_INC(ec->stats.stripe_cache.errors); + return NULL; + } + + stripe_cache = &ctx->stripe_cache; + list_for_each_entry (stripe, &stripe_cache->lru, lru) { + if (stripe->frag_offset == frag_offset) { + list_move_tail (&stripe->lru, &stripe_cache->lru); + GF_ATOMIC_INC(ec->stats.stripe_cache.hits); + return stripe; + } + } + + GF_ATOMIC_INC(ec->stats.stripe_cache.misses); + + return NULL; +} + +static gf_boolean_t +ec_get_and_merge_stripe (ec_t *ec, ec_fop_data_t *fop, ec_stripe_part_t which) +{ + uint64_t frag_offset; + ec_stripe_t *stripe = NULL; + gf_boolean_t found = _gf_false; + + if (!ec->stripe_cache) { + return found; + } + + LOCK(&fop->fd->inode->lock); + if (which == EC_STRIPE_HEAD) { + frag_offset = fop->frag_range.first; + stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset); + if (stripe) { + ec_merge_stripe_head_locked (ec, fop, stripe); + found = _gf_true; + } + } + + if (which == EC_STRIPE_TAIL) { + frag_offset = fop->frag_range.last - ec->fragment_size; + stripe = ec_get_stripe_from_cache_locked(ec, fop, frag_offset); + if (stripe) { + ec_merge_stripe_tail_locked (ec, fop, stripe); + found = _gf_true; + } + } + UNLOCK(&fop->fd->inode->lock); + + return found; +} + void ec_writev_start(ec_fop_data_t *fop) { ec_t *ec = fop->xl->private; @@ -1858,6 +2032,7 @@ void ec_writev_start(ec_fop_data_t *fop) dict_t *xdata = NULL; uint64_t tail, current; int32_t err = -ENOMEM; + gf_boolean_t found_stripe = _gf_false; /* This shouldn't fail because we have the inode locked. */ GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, ¤t)); @@ -1884,15 +2059,19 @@ void ec_writev_start(ec_fop_data_t *fop) if (err != 0) { goto failed_fd; } - if (fop->head > 0) { - if (ec_make_internal_fop_xdata (&xdata)) { - err = -ENOMEM; - goto failed_xdata; + found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_HEAD); + if (!found_stripe) { + if (ec_make_internal_fop_xdata (&xdata)) { + err = -ENOMEM; + goto failed_xdata; + } + ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, + ec_writev_merge_head, + NULL, fd, ec->stripe_size, fop->offset, 0, xdata); } - ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ec_writev_merge_head, - NULL, fd, ec->stripe_size, fop->offset, 0, xdata); } + tail = fop->size - fop->user_size - fop->head; if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) { /* Current locking scheme will make sure the 'current' below will @@ -1900,13 +2079,17 @@ void ec_writev_start(ec_fop_data_t *fop) * work as expected */ if (current > fop->offset + fop->head + fop->user_size) { - if (ec_make_internal_fop_xdata (&xdata)) { - err = -ENOMEM; - goto failed_xdata; + found_stripe = ec_get_and_merge_stripe (ec, fop, EC_STRIPE_TAIL); + if (!found_stripe) { + if (ec_make_internal_fop_xdata (&xdata)) { + err = -ENOMEM; + goto failed_xdata; + } + ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, + ec_writev_merge_tail, NULL, fd, ec->stripe_size, + fop->offset + fop->size - ec->stripe_size, + 0, xdata); } - ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, - ec_writev_merge_tail, NULL, fd, ec->stripe_size, - fop->offset + fop->size - ec->stripe_size, 0, xdata); } else { memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); } diff --git a/xlators/cluster/ec/src/ec-mem-types.h b/xlators/cluster/ec/src/ec-mem-types.h index 9a4b6c58049..8109a422d9d 100644 --- a/xlators/cluster/ec/src/ec-mem-types.h +++ b/xlators/cluster/ec/src/ec-mem-types.h @@ -25,6 +25,7 @@ enum gf_ec_mem_types_ ec_mt_ec_code_t, ec_mt_ec_code_builder_t, ec_mt_ec_matrix_t, + ec_mt_ec_stripe_t, ec_mt_end }; diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h index 23b30548450..7b2b7b8247d 100644 --- a/xlators/cluster/ec/src/ec-types.h +++ b/xlators/cluster/ec/src/ec-types.h @@ -14,12 +14,16 @@ #include "xlator.h" #include "timer.h" #include "libxlator.h" +#include "atomic.h" #define EC_GF_MAX_REGS 16 enum _ec_heal_need; typedef enum _ec_heal_need ec_heal_need_t; +enum _ec_stripe_part; +typedef enum _ec_stripe_part ec_stripe_part_t; + enum _ec_read_policy; typedef enum _ec_read_policy ec_read_policy_t; @@ -29,6 +33,9 @@ typedef struct _ec_config ec_config_t; struct _ec_fd; typedef struct _ec_fd ec_fd_t; +struct _ec_fragment_range; +typedef struct _ec_fragment_range ec_fragment_range_t; + struct _ec_inode; typedef struct _ec_inode ec_inode_t; @@ -77,6 +84,12 @@ typedef struct _ec_code_builder ec_code_builder_t; struct _ec_code_chunk; typedef struct _ec_code_chunk ec_code_chunk_t; +struct _ec_stripe; +typedef struct _ec_stripe ec_stripe_t; + +struct _ec_stripe_list; +typedef struct _ec_stripe_list ec_stripe_list_t; + struct _ec_code_space; typedef struct _ec_code_space ec_code_space_t; @@ -105,6 +118,9 @@ typedef struct _ec_heal ec_heal_t; struct _ec_self_heald; typedef struct _ec_self_heald ec_self_heald_t; +struct _ec_statistics; +typedef struct _ec_statistics ec_statistics_t; + struct _ec; typedef struct _ec ec_t; @@ -124,6 +140,11 @@ enum _ec_heal_need { EC_HEAL_MUST }; +enum _ec_stripe_part { + EC_STRIPE_HEAD, + EC_STRIPE_TAIL +}; + struct _ec_config { uint32_t version; uint8_t algorithm; @@ -139,6 +160,18 @@ struct _ec_fd { int32_t flags; }; +struct _ec_stripe { + struct list_head lru; /* LRU list member */ + uint64_t frag_offset; /* Fragment offset of this stripe */ + char data[]; /* Contents of the stripe */ +}; + +struct _ec_stripe_list { + struct list_head lru; + uint32_t count; + uint32_t max; +}; + struct _ec_inode { ec_lock_t *inode_lock; gf_boolean_t have_info; @@ -152,8 +185,10 @@ struct _ec_inode { uint64_t post_size; uint64_t dirty[2]; struct list_head heal; + ec_stripe_list_t stripe_cache; }; + typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t, int32_t, uintptr_t, uintptr_t, uintptr_t, dict_t *); @@ -263,75 +298,88 @@ struct _ec_lock_link { off_t fl_end; }; -struct _ec_fop_data { - int32_t id; - int32_t refs; - int32_t state; - int32_t minimum; - int32_t expected; - int32_t winds; - int32_t jobs; - int32_t error; - ec_fop_data_t *parent; - xlator_t *xl; - call_frame_t *req_frame; /* frame of the calling xlator */ - call_frame_t *frame; /* frame used by this fop */ - struct list_head cbk_list; /* sorted list of groups of answers */ - struct list_head answer_list; /* list of answers */ - struct list_head pending_list; /* member of ec_t.pending_fops */ - ec_cbk_data_t *answer; /* accepted answer */ - int32_t lock_count; - int32_t locked; - ec_lock_link_t locks[2]; - int32_t first_lock; - gf_lock_t lock; +/* This structure keeps a range of fragment offsets affected by a fop. Since + * real file offsets can be difficult to handle correctly because of overflows, + * we use the 'scaled' offset, which corresponds to the offset of the fragment + * seen by the bricks, which is always smaller and cannot overflow. */ +struct _ec_fragment_range { + uint64_t first; /* Address of the first affected fragment as seen by the + bricks (offset on brick) */ + uint64_t last; /* Address of the first non affected fragment as seen by + the bricks (offset on brick) */ +}; - uint32_t flags; - uint32_t first; - uintptr_t mask; - uintptr_t healing; /*Dispatch is done but call is successful only - if fop->minimum number of subvolumes succeed - which are not healing*/ - uintptr_t remaining; - uintptr_t received; /* Mask of responses */ - uintptr_t good; - - uid_t uid; - gid_t gid; - - ec_wind_f wind; - ec_handler_f handler; - ec_resume_f resume; - ec_cbk_t cbks; - void *data; - ec_heal_t *heal; - struct list_head healer; - - uint64_t user_size; - uint32_t head; - - int32_t use_fd; - - dict_t *xdata; - dict_t *dict; - int32_t int32; - uint32_t uint32; - uint64_t size; - off_t offset; - mode_t mode[2]; - entrylk_cmd entrylk_cmd; - entrylk_type entrylk_type; - gf_xattrop_flags_t xattrop_flags; - dev_t dev; - inode_t *inode; - fd_t *fd; - struct iatt iatt; - char *str[2]; - loc_t loc[2]; - struct gf_flock flock; - struct iovec *vector; - struct iobref *buffers; - gf_seek_what_t seek; +struct _ec_fop_data { + int32_t id; + int32_t refs; + int32_t state; + int32_t minimum; + int32_t expected; + int32_t winds; + int32_t jobs; + int32_t error; + ec_fop_data_t *parent; + xlator_t *xl; + call_frame_t *req_frame; /* frame of the calling xlator */ + call_frame_t *frame; /* frame used by this fop */ + struct list_head cbk_list; /* sorted list of groups of answers */ + struct list_head answer_list; /* list of answers */ + struct list_head pending_list; /* member of ec_t.pending_fops */ + ec_cbk_data_t *answer; /* accepted answer */ + int32_t lock_count; + int32_t locked; + ec_lock_link_t locks[2]; + int32_t first_lock; + gf_lock_t lock; + + uint32_t flags; + uint32_t first; + uintptr_t mask; + uintptr_t healing; /*Dispatch is done but call is successful + only if fop->minimum number of subvolumes + succeed which are not healing*/ + uintptr_t remaining; + uintptr_t received; /* Mask of responses */ + uintptr_t good; + + uid_t uid; + gid_t gid; + + ec_wind_f wind; + ec_handler_f handler; + ec_resume_f resume; + ec_cbk_t cbks; + void *data; + ec_heal_t *heal; + struct list_head healer; + + uint64_t user_size; + uint32_t head; + + int32_t use_fd; + + dict_t *xdata; + dict_t *dict; + int32_t int32; + uint32_t uint32; + uint64_t size; + off_t offset; + mode_t mode[2]; + entrylk_cmd entrylk_cmd; + entrylk_type entrylk_type; + gf_xattrop_flags_t xattrop_flags; + dev_t dev; + inode_t *inode; + fd_t *fd; + struct iatt iatt; + char *str[2]; + loc_t loc[2]; + struct gf_flock flock; + struct iovec *vector; + struct iobref *buffers; + gf_seek_what_t seek; + ec_fragment_range_t frag_range; /* This will hold the range of stripes + affected by the fop. */ }; struct _ec_cbk_data { @@ -551,6 +599,26 @@ struct _ec_self_heald { struct subvol_healer *full_healers; }; +struct _ec_statistics { + struct { + gf_atomic_t hits; /* Cache hits. */ + gf_atomic_t misses; /* Cache misses. */ + gf_atomic_t updates; /* Number of times an existing stripe has + been updated with new content. */ + gf_atomic_t invals; /* Number of times an existing stripe has + been invalidated because of truncates + or discards. */ + gf_atomic_t evicts; /* Number of times that an existing entry + has been evicted to make room for newer + entries. */ + gf_atomic_t allocs; /* Number of memory allocations made to + store stripes. */ + gf_atomic_t errors; /* Number of errors that have caused extra + requests. (Basically memory allocation + errors). */ + } stripe_cache; +}; + struct _ec { xlator_t *xl; int32_t healers; @@ -576,6 +644,7 @@ struct _ec { gf_boolean_t other_eager_lock; gf_boolean_t optimistic_changelog; gf_boolean_t parallel_writes; + uint32_t stripe_cache; uint32_t background_heals; uint32_t heal_wait_qlen; uint32_t self_heal_window_size; /* max size of read/writes */ @@ -590,6 +659,7 @@ struct _ec { dict_t *leaf_to_subvolid; ec_read_policy_t read_policy; ec_matrix_list_t matrix; + ec_statistics_t stats; }; #endif /* __EC_TYPES_H__ */ diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index 9f361a54aa3..275dd15a302 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -299,6 +299,8 @@ reconfigure (xlator_t *this, dict_t *options) options, bool, failed); GF_OPTION_RECONF ("parallel-writes", ec->parallel_writes, options, bool, failed); + GF_OPTION_RECONF ("stripe-cache", ec->stripe_cache, options, uint32, + failed); ret = 0; if (ec_assign_read_policy (ec, read_policy)) { ret = -1; @@ -581,6 +583,18 @@ notify (xlator_t *this, int32_t event, void *data, ...) return ret; } +static void +ec_statistics_init(ec_t *ec) +{ + GF_ATOMIC_INIT(ec->stats.stripe_cache.hits, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.misses, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.updates, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.invals, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.evicts, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.allocs, 0); + GF_ATOMIC_INIT(ec->stats.stripe_cache.errors, 0); +} + int32_t init (xlator_t *this) { @@ -671,6 +685,7 @@ init (xlator_t *this) GF_OPTION_INIT ("shd-wait-qlength", ec->shd.wait_qlength, uint32, failed); GF_OPTION_INIT ("optimistic-change-log", ec->optimistic_changelog, bool, failed); GF_OPTION_INIT ("parallel-writes", ec->parallel_writes, bool, failed); + GF_OPTION_INIT ("stripe-cache", ec->stripe_cache, uint32, failed); this->itable = inode_table_new (EC_SHD_INODE_LRU_LIMIT, this); if (!this->itable) @@ -697,6 +712,8 @@ init (xlator_t *this) goto failed; } + ec_statistics_init(ec); + return 0; failed: @@ -1252,6 +1269,9 @@ int32_t ec_gf_forget(xlator_t * this, inode_t * inode) if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) { ctx = (ec_inode_t *)(uintptr_t)value; + /* We can only forget an inode if it has been unlocked, so the stripe + * cache should also be empty. */ + GF_ASSERT(list_empty(&ctx->stripe_cache.lru)); GF_FREE(ctx); } @@ -1313,6 +1333,25 @@ int32_t ec_dump_private(xlator_t *this) gf_proc_dump_write("heal-waiters", "%d", ec->heal_waiters); gf_proc_dump_write("read-policy", "%s", ec_read_policies[ec->read_policy]); + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s.stats.stripe_cache", + this->type, this->name); + gf_proc_dump_add_section(key_prefix); + + gf_proc_dump_write("hits", "%llu", + GF_ATOMIC_GET(ec->stats.stripe_cache.hits)); + gf_proc_dump_write("misses", "%llu", + GF_ATOMIC_GET(ec->stats.stripe_cache.misses)); + gf_proc_dump_write("updates", "%llu", + GF_ATOMIC_GET(ec->stats.stripe_cache.updates)); + gf_proc_dump_write("invalidations", "%llu", + GF_ATOMIC_GET(ec->stats.stripe_cache.invals)); + gf_proc_dump_write("evicts", "%llu", + GF_ATOMIC_GET(ec->stats.stripe_cache.evicts)); + gf_proc_dump_write("allocations", "%llu", + GF_ATOMIC_GET(ec->stats.stripe_cache.allocs)); + gf_proc_dump_write("errors", "%llu", + GF_ATOMIC_GET(ec->stats.stripe_cache.errors)); + return 0; } @@ -1512,5 +1551,18 @@ struct volume_options options[] = .description = "This controls if writes can be wound in parallel as long" "as it doesn't modify same stripes" }, + { .key = {"stripe-cache"}, + .type = GF_OPTION_TYPE_INT, + .min = 0,/*Disabling stripe_cache*/ + .max = EC_STRIPE_CACHE_MAX_SIZE, + .default_value = "0", + .description = "This option will keep the last stripe of write fop" + "in memory. If next write falls in this stripe, we need" + "not to read it again from backend and we can save READ" + "fop going over the network. This will improve performance," + "specially for sequential writes. However, this will also" + "lead to extra memory consumption, maximum " + "(cache size * stripe size) Bytes per open file." + }, { .key = {NULL} } }; diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h index 648d444f595..b729fffc274 100644 --- a/xlators/cluster/ec/src/ec.h +++ b/xlators/cluster/ec/src/ec.h @@ -17,7 +17,7 @@ #define EC_XATTR_VERSION EC_XATTR_PREFIX"version" #define EC_XATTR_HEAL EC_XATTR_PREFIX"heal" #define EC_XATTR_DIRTY EC_XATTR_PREFIX"dirty" - +#define EC_STRIPE_CACHE_MAX_SIZE 10 #define EC_VERSION_SIZE 2 #define EC_SHD_INODE_LRU_LIMIT 10 diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index 94f3882c434..69987a01a1c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -3503,6 +3503,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { .op_version = GD_OP_VERSION_3_10_1, .flags = VOLOPT_FLAG_CLIENT_OPT }, + { .key = "disperse.stripe-cache", + .voltype = "cluster/disperse", + .type = NO_DOC, + .op_version = GD_OP_VERSION_3_13_0, + .flags = OPT_FLAG_CLIENT_OPT + }, /* Halo replication options */ { .key = "cluster.halo-enabled", |