diff options
author | M. Mohan Kumar <mohan@in.ibm.com> | 2013-11-15 14:19:11 +0530 |
---|---|---|
committer | Anand Avati <avati@redhat.com> | 2013-11-20 14:46:16 -0800 |
commit | 2bb025699a8b9b34491c8b13a2bbb6da302a5d77 (patch) | |
tree | bcfca804f97dbbd960c0b74b499926b717e51e07 | |
parent | 5e31894fbda74a524e1fe30d26f7ed82a77eb5ff (diff) |
bd: Add Zerofill FOP support
BUG: 1028673
Change-Id: I9ba8e3e6cf2f888640b4d2a2eb934a27ff903c42
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com>
Reviewed-on: http://review.gluster.org/6290
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-handler.c | 16 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd.h | 1 | ||||
-rw-r--r-- | xlators/storage/bd/src/bd-helper.c | 239 | ||||
-rw-r--r-- | xlators/storage/bd/src/bd.c | 34 | ||||
-rw-r--r-- | xlators/storage/bd/src/bd.h | 8 |
5 files changed, 295 insertions, 3 deletions
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index ae03a40e567..740d04aa1e4 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -464,6 +464,22 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, } } + if (volinfo->caps & CAPS_OFFLOAD_ZERO) { + snprintf (key, 256, "volume%d.xlator0.caps%d", count, + caps++); + buf = GF_MALLOC (256, gf_common_mt_char); + if (!buf) { + ret = ENOMEM; + goto out; + } + snprintf (buf, 256, "offload_zerofill"); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) { + GF_FREE (buf); + goto out; + } + } + } #endif diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index ad63682e55e..b081ec32eb4 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -238,6 +238,7 @@ typedef struct _auth auth_t; #define CAPS_THIN 0x00000002 #define CAPS_OFFLOAD_COPY 0x00000004 #define CAPS_OFFLOAD_SNAPSHOT 0x00000008 +#define CAPS_OFFLOAD_ZERO 0x00000020 struct glusterd_rebalance_ { gf_defrag_status_t defrag_status; diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c index 5525e346bd7..63e26d8a3a6 100644 --- a/xlators/storage/bd/src/bd-helper.c +++ b/xlators/storage/bd/src/bd-helper.c @@ -6,7 +6,8 @@ #ifdef HAVE_LIBAIO #include <libaio.h> #endif - +#include <linux/fs.h> +#include <sys/ioctl.h> #include "bd.h" #include "run.h" @@ -781,3 +782,239 @@ out: return ret; } +#ifndef BLKZEROOUT + +int +bd_do_manual_zerofill (int fd, off_t offset, off_t len, int o_direct) +{ + off_t num_vect = 0; + off_t num_loop = 1; + int idx = 0; + int op_ret = -1; + int vect_size = IOV_SIZE; + off_t remain = 0; + off_t extra = 0; + struct iovec *vector = NULL; + char *iov_base = NULL; + char *alloc_buf = NULL; + + if (len == 0) + return 0; + + if (len < IOV_SIZE) + vect_size = len; + + num_vect = len / (vect_size); + remain = len % vect_size ; + + if (num_vect > MAX_NO_VECT) { + extra = num_vect % MAX_NO_VECT; + num_loop = num_vect / MAX_NO_VECT; + num_vect = MAX_NO_VECT; + } + + vector = GF_CALLOC (num_vect, sizeof(struct iovec), + gf_common_mt_iovec); + if (!vector) + return -1; + + if (o_direct) { + alloc_buf = page_aligned_alloc (vect_size, &iov_base); + if (!alloc_buf) { + gf_log ("bd_do_manual_zerofill", GF_LOG_DEBUG, + "memory alloc failed, vect_size %d: %s", + vect_size, strerror (errno)); + GF_FREE (vector); + return -1; + } + } else { + iov_base = GF_CALLOC (vect_size, sizeof(char), + gf_common_mt_char); + if (!iov_base) { + GF_FREE (vector); + return -1; + } + } + + for (idx = 0; idx < num_vect; idx++) { + vector[idx].iov_base = iov_base; + vector[idx].iov_len = vect_size; + } + + if (lseek (fd, offset, SEEK_SET) < 0) { + op_ret = -1; + goto err; + } + + for (idx = 0; idx < num_loop; idx++) { + op_ret = writev (fd, vector, num_vect); + if (op_ret < 0) + goto err; + } + if (extra) { + op_ret = writev (fd, vector, extra); + if (op_ret < 0) + goto err; + } + if (remain) { + vector[0].iov_len = remain; + op_ret = writev (fd, vector , 1); + if (op_ret < 0) + goto err; + } + op_ret = 0; +err: + if (o_direct) + GF_FREE (alloc_buf); + else + GF_FREE (iov_base); + GF_FREE (vector); + return op_ret; +} + +#else + +/* + * Issue Linux ZEROOUT ioctl to write '0' to a scsi device at given offset + * and number of bytes. Each SCSI device's maximum write same bytes are exported + * in sysfs file. Sending ioctl request greater than this bytes results in slow + * performance. Read this file to get the maximum bytes and break down single + * ZEROOUT request into multiple ZEROOUT request not exceeding maximum bytes. + * From VG & LV name of device mapper identified and sysfs file read. + * /sys/block/<block-device>/queue/write_same_max_bytes + */ +int +bd_do_ioctl_zerofill (bd_priv_t *priv, bd_attr_t *bdatt, int fd, char *vg, + off_t offset, off_t len) +{ + char *dm = NULL; + char dmname[4096] = {0, }; + char lvname[4096] = {0, }; + char sysfs[4096] = {0, }; + bd_gfid_t uuid = {0, }; + char *p = NULL; + off_t max_bytes = 0; + int sysfd = -1; + uint64_t param[2] = {0, 0}; + off_t nr_loop = 0; + char buff[16] = {0, }; + + uuid_utoa_r (bdatt->iatt.ia_gfid, uuid); + sprintf (lvname, "/dev/%s/%s", vg, uuid); + + readlink (lvname, dmname, sizeof (dmname)); + + p = strrchr (dmname, '/'); + if (p) + dm = p + 1; + else + dm = dmname; + + sprintf(sysfs, "/sys/block/%s/queue/write_same_max_bytes", dm); + sysfd = open (sysfs, O_RDONLY); + if (sysfd < 0) { + gf_log ("bd_do_ioctl_zerofill", GF_LOG_DEBUG, + "sysfs file %s does not exist", lvname); + goto skip; + } + + read (sysfd, buff, sizeof (buff)); + close (sysfd); + + max_bytes = atoll (buff); + +skip: + /* + * If requested len is less than write_same_max_bytes, + * issue single ioctl to zeroout. Otherwise split the ioctls + */ + if (!max_bytes || len <= max_bytes) { + param[0] = offset; + param[1] = len; + + if (ioctl (fd, BLKZEROOUT, param) < 0) + return errno; + return 0; + } + + /* Split ioctls to max write_same_max_bytes */ + nr_loop = len / max_bytes; + for (; nr_loop; nr_loop--) { + param[0] = offset; + param[1] = max_bytes; + + if (ioctl (fd, BLKZEROOUT, param) < 0) + return errno; + + offset += max_bytes; + } + + if (!(len % max_bytes)) + return 0; + + param[0] = offset; + param[1] = len % max_bytes; + + if (ioctl (fd, BLKZEROOUT, param) < 0) + return errno; + + return 0; +} +#endif + +int +bd_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, off_t len, struct iatt *prebuf, + struct iatt *postbuf) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = this->private; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (priv, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "bd_fd is NULL from fd=%p", fd); + goto out; + } + + bd_inode_ctx_get (fd->inode, this, &bdatt); +#ifndef BLKZEROOUT + ret = bd_do_manual_zerofill(bd_fd->fd, offset, len, + bd_fd->flag & O_DIRECT); +#else + ret = bd_do_ioctl_zerofill(priv, bdatt, bd_fd->fd, priv->vg, offset, + len); +#endif + if (ret) { + gf_log(this->name, GF_LOG_ERROR, + "zerofill failed on fd %d length %ld %s", + bd_fd->fd, len, strerror (ret)); + goto out; + } + + if (bd_fd->flag & (O_SYNC|O_DSYNC)) { + ret = fsync (bd_fd->fd); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "fsync() in writev on fd %d failed: %s", + bd_fd->fd, strerror (errno)); + return errno; + } + } + + memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + memcpy (&postbuf, &bdatt->iatt, sizeof (postbuf)); + +out: + + return ret; +} + diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c index 1eb5cd15838..17a9a5f159a 100644 --- a/xlators/storage/bd/src/bd.c +++ b/xlators/storage/bd/src/bd.c @@ -2195,6 +2195,36 @@ out: return 0; } +static int +bd_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) +{ + int32_t ret = 0; + struct iatt statpre = {0,}; + struct iatt statpost = {0,}; + bd_attr_t *bdatt = NULL; + + /* iatt already cached */ + if (bd_inode_ctx_get (fd->inode, this, &bdatt) < 0) { + STACK_WIND (frame, default_zerofill_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->zerofill, + fd, offset, len, xdata); + return 0; + } + + ret = bd_do_zerofill(frame, this, fd, offset, len, + &statpre, &statpost); + if (ret) + goto err; + + STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL); + return 0; + +err: + STACK_UNWIND_STRICT(zerofill, frame, -1, ret, NULL, NULL, NULL); + return 0; +} + /** * notify - when parent sends PARENT_UP, send CHILD_UP event from here */ @@ -2324,7 +2354,8 @@ init (xlator_t *this) } } - _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT; + _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT | + BD_CAPS_OFFLOAD_ZERO; return 0; error: @@ -2384,6 +2415,7 @@ struct xlator_fops fops = { .flush = bd_flush, .setattr = bd_setattr, .discard = bd_discard, + .zerofill = bd_zerofill, }; struct xlator_cbks cbks = { diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h index 34b4c9e2226..f59bc6a09ed 100644 --- a/xlators/storage/bd/src/bd.h +++ b/xlators/storage/bd/src/bd.h @@ -51,6 +51,7 @@ #define BD_CAPS_THIN 0x02 #define BD_CAPS_OFFLOAD_COPY 0x04 #define BD_CAPS_OFFLOAD_SNAPSHOT 0x08 +#define BD_CAPS_OFFLOAD_ZERO 0x20 #define BD_CLONE "clone" #define BD_SNAPSHOT "snapshot" @@ -61,9 +62,11 @@ #define IOV_SIZE (64 * 1024) #define ALIGN_SIZE 4096 - #define LINKTO "trusted.glusterfs.dht.linkto" +#define MAX_NO_VECT 1024 + + #define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \ if (!buff) { \ op_errno = ENOMEM; \ @@ -174,5 +177,8 @@ int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); int bd_clone (bd_local_t *local, bd_priv_t *priv); int bd_merge (bd_priv_t *priv, uuid_t gfid); int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); +int bd_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, off_t len, struct iatt *prebuf, + struct iatt *postbuf); #endif |