summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorPoornima G <pgurusid@redhat.com>2018-02-09 10:33:30 +0530
committerRaghavendra G <rgowdapp@redhat.com>2018-02-26 17:28:26 +0000
commit4a8255f772c8e98ccf6cae731d4d665363c3ed81 (patch)
tree83c162ddbc374b36871de9fc01c2ea6afeb35b89 /xlators
parent7d641313f46789ec0a7ba0cc04f504724c780855 (diff)
write-behind: Make aggregate size configurable
Currently the aggregate size defaults to 128K (the page size). From a performance perspective, a small number of large writes is faster than a large number of small writes, especially on EC volumes. However, identifying the right aggregate size depends on multiple factors such as the memcpy overhead, the network overhead, etc. On a local machine, combining 128K writes into 1M writes on EC volumes yielded a 30% improvement. As part of this patch, the aggregate size is simply made configurable and page_size is modified accordingly. Raghavendra Gowdappa suggested that, while aggregating writes, we should get rid of the memcpy for large write sizes and instead add the pointer to the existing vector; that will be done as part of another patch. Also, in EC volumes the vectors are merged into one vector, so even if we save the memcpy in write-behind, EC would do a memcpy anyway to merge the vectors into one vector. Updates: #364 Change-Id: Ib67294b8577bea14dde1c84cd271012ecea99f09 Signed-off-by: Poornima G <pgurusid@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c6
-rw-r--r--xlators/performance/write-behind/src/write-behind.c26
2 files changed, 27 insertions, 5 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 4e63991155e..837c5bc9cd8 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1845,6 +1845,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.op_version = GD_OP_VERSION_3_13_1,
.flags = VOLOPT_FLAG_CLIENT_OPT
},
+ { .key = "performance.aggregate-size",
+ .voltype = "performance/write-behind",
+ .option = "aggregate-size",
+ .op_version = GD_OP_VERSION_4_1_0,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
{ .key = "performance.nfs.write-behind-trickling-writes",
.voltype = "performance/write-behind",
.option = "trickling-writes",
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index 5c85c872b95..c80f0e7e271 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -176,6 +176,7 @@ typedef struct wb_request {
typedef struct wb_conf {
uint64_t aggregate_size;
+ uint64_t page_size;
uint64_t window_size;
gf_boolean_t flush_behind;
gf_boolean_t trickling_writes;
@@ -1258,7 +1259,7 @@ __wb_pick_unwinds (wb_inode_t *wb_inode, list_head_t *lies)
int
-__wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
+__wb_collapse_small_writes (wb_conf_t *conf, wb_request_t *holder, wb_request_t *req)
{
char *ptr = NULL;
struct iobuf *iobuf = NULL;
@@ -1274,7 +1275,7 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
req_len = iov_length (req->stub->args.vector,
req->stub->args.count);
- required_size = max ((THIS->ctx->page_size),
+ required_size = max ((conf->page_size),
(holder_len + req_len));
iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool,
required_size);
@@ -1349,8 +1350,8 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
through the interleaved ops
*/
- page_size = wb_inode->this->ctx->page_size;
conf = wb_inode->this->private;
+ page_size = conf->page_size;
list_for_each_entry_safe (req, tmp, &wb_inode->todo, todo) {
if (wb_inode->dontsync && req->ordering.lied) {
@@ -1416,7 +1417,7 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
continue;
}
- ret = __wb_collapse_small_writes (holder, req);
+ ret = __wb_collapse_small_writes (conf, holder, req);
if (ret)
continue;
@@ -2988,7 +2989,8 @@ init (xlator_t *this)
}
/* configure 'options aggregate-size <size>' */
- conf->aggregate_size = WB_AGGREGATE_SIZE;
+ GF_OPTION_INIT ("aggregate-size", conf->aggregate_size, size_uint64, out);
+ conf->page_size = conf->aggregate_size;
/* configure 'option window-size <size>' */
GF_OPTION_INIT ("cache-size", conf->window_size, size_uint64, out);
@@ -3151,5 +3153,19 @@ struct volume_options options[] = {
"fsync itself is failed irrespective of the value of "
"this option. ",
},
+ { .key = {"aggregate-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .default_value = "128KB",
+ .op_version = {GD_OP_VERSION_4_1_0},
+ .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC | OPT_FLAG_CLIENT_OPT,
+ .description = "Will aggregate writes until data of specified "
+ "size is fully filled for a single file provided "
+ "there are no dependent fops on cached writes. This "
+ "option just sets the aggregate size. Note that "
+ "aggregation won't happen if performance.write-behind-trickling-writes"
+ " is turned on. Hence turn off performance.write-behind.trickling-writes"
+ " so that writes are aggregated till a max of "
+ "\"aggregate-size\" bytes",
+ },
{ .key = {NULL} },
};