diff options
Diffstat (limited to 'xlators/storage')
| -rw-r--r-- | xlators/storage/Makefile.am | 4 | ||||
| -rw-r--r-- | xlators/storage/bd/Makefile.am | 3 | ||||
| -rw-r--r-- | xlators/storage/bd/src/Makefile.am | 20 | ||||
| -rw-r--r-- | xlators/storage/bd/src/bd-helper.c | 562 | ||||
| -rw-r--r-- | xlators/storage/bd/src/bd.c | 2047 | ||||
| -rw-r--r-- | xlators/storage/bd/src/bd.h | 140 | 
6 files changed, 2776 insertions, 0 deletions
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am index 5e3ed0eb93b..c08e8e41bca 100644 --- a/xlators/storage/Makefile.am +++ b/xlators/storage/Makefile.am @@ -1,3 +1,7 @@  SUBDIRS = posix +if ENABLE_BD_XLATOR +SUBDIRS += bd +endif +  CLEANFILES = diff --git a/xlators/storage/bd/Makefile.am b/xlators/storage/bd/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/storage/bd/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/storage/bd/src/Makefile.am b/xlators/storage/bd/src/Makefile.am new file mode 100644 index 00000000000..210b7453af8 --- /dev/null +++ b/xlators/storage/bd/src/Makefile.am @@ -0,0 +1,20 @@ +if ENABLE_BD_XLATOR +xlator_LTLIBRARIES = bd.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage + +bd_la_LDFLAGS = -module -avoid-version +LIBBD = -llvm2app -lrt +bd_la_SOURCES = bd.c bd-helper.c +bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) + +noinst_HEADERS = bd.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ +            -I$(top_srcdir)/rpc/xdr/src \ +            -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) + +CLEANFILES = + +endif diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c new file mode 100644 index 00000000000..2c1b77a9b3e --- /dev/null +++ b/xlators/storage/bd/src/bd-helper.c @@ -0,0 +1,562 @@
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
#include <lvm2app.h>

#include "bd.h"
#include "run.h"

/*
 * Stores the bd_attr_t pointer @ctx as this translator's context on @inode.
 * Returns -1 if @inode or @ctx is NULL, otherwise whatever inode_ctx_set ()
 * returns.
 */
int
bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx)
{
        int       ret     = -1;
        uint64_t  ctx_int = 0;

        GF_VALIDATE_OR_GOTO (this->name, inode, out);
        GF_VALIDATE_OR_GOTO (this->name, ctx, out);

        /* the pointer travels through the inode context as an integer */
        ctx_int = (long)ctx;
        ret = inode_ctx_set (inode, this, &ctx_int);
out:
        return ret;
}

/*
 * Fetches the bd_attr_t pointer previously stored on @inode.
 * Returns -1 if @inode is NULL, a non-zero value from inode_ctx_get () when
 * no context is present, and 0 on success. @ctx may be NULL when the caller
 * only wants to know whether a context exists.
 */
int
bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx)
{
        int       ret     = -1;
        uint64_t  ctx_int = 0;

        GF_VALIDATE_OR_GOTO (this->name, inode, out);
        ret = inode_ctx_get (inode, this, &ctx_int);
        if (ret)
                return ret;
        if (ctx)
                /* go through (long) like the setter, so a 64-bit integer is
                 * never converted directly to a narrower pointer */
                *ctx = (bd_attr_t *)(long) ctx_int;
out:
        return ret;
}

/*
 * Releases everything a bd_local_t holds and returns it to its mem pool.
 * A NULL @local is ignored.
 */
void
bd_local_free (xlator_t *this, bd_local_t *local)
{
        if (!local)
                return;
        if (local->fd)
                fd_unref (local->fd);
        else
                /* wipe unconditionally: a loc may hold inode/parent
                 * references even when it carries no path.
                 * NOTE(review): assumes loc_wipe () is harmless on a zeroed
                 * loc_t - confirm against libglusterfs. */
                loc_wipe (&local->loc);
        if (local->dict)
                dict_unref (local->dict);
        if (local->inode)
                inode_unref (local->inode);
        if (local->bdatt) {
                GF_FREE (local->bdatt->type);
                GF_FREE (local->bdatt);
        }
        mem_put (local);
}

/*
 * Allocates a zeroed bd_local_t from this->local_pool and attaches it to
 * @frame. Returns the new local, or NULL when the pool allocation fails.
 */
bd_local_t *
bd_local_init (call_frame_t *frame, xlator_t *this)
{
        frame->local = mem_get0 (this->local_pool);

        /* NULL on allocation failure, the fresh local otherwise */
        return frame->local;
}

/*
 * VG are set with the tag in GF_XATTR_VOL_ID_KEY:<uuid> format.
 * This function validates this tag against volume-uuid. Also goes
 * through LV list to find out if a thin-pool is configured or not.
+ */ +int bd_scan_vg (xlator_t *this, bd_priv_t *priv) +{ +        vg_t                   brick      = NULL; +        data_t                *tmp_data   = NULL; +        struct dm_list        *tags       = NULL; +        int                    op_ret     = -1; +        uuid_t                 dict_uuid  = {0, }; +        uuid_t                 vg_uuid    = {0, }; +        gf_boolean_t           uuid       = _gf_false; +        lvm_str_list_t        *strl       = NULL; +        struct dm_list        *lv_dm_list = NULL; +        lv_list_t             *lv_list    = NULL; +        struct dm_list        *dm_seglist = NULL; +        lvseg_list_t          *seglist    = NULL; +        lvm_property_value_t   prop       = {0, }; +        gf_boolean_t           thin       = _gf_false; +        const char            *lv_name    = NULL; + +        brick = lvm_vg_open (priv->handle, priv->vg, "w", 0); +        if (!brick) { +                gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found", +                        priv->vg); +                return ENOENT; +        } + +        lv_dm_list = lvm_vg_list_lvs (brick); +        if (!lv_dm_list) +                goto check; + +        dm_list_iterate_items (lv_list, lv_dm_list) { +                dm_seglist = lvm_lv_list_lvsegs (lv_list->lv); +                if (!dm_seglist) +                        continue; +                dm_list_iterate_items (seglist, dm_seglist) { +                        prop = lvm_lvseg_get_property (seglist->lvseg, +                                                       "segtype"); +                        if (!prop.is_valid || !prop.value.string) +                                continue; +                        if (!strcmp (prop.value.string, "thin-pool")) { +                                thin = _gf_true; +                                lv_name = lvm_lv_get_name (lv_list->lv); +                                priv->pool = gf_strdup (lv_name); +                                gf_log 
(THIS->name, GF_LOG_INFO, "Thin Pool " +                                        "\"%s\" will be used for thin LVs", +                                        lv_name); +                                break; +                        } +                } +        } + +check: +        /* If there is no volume-id set in dict, we cant validate */ +        tmp_data = dict_get (this->options, "volume-id"); +        if (!tmp_data) { +                op_ret = 0; +                goto out; +        } + +        op_ret = uuid_parse (tmp_data->data, dict_uuid); +        if (op_ret < 0) { +                gf_log (this->name, GF_LOG_ERROR, +                        "wrong volume-id (%s) set in volume file", +                        tmp_data->data); +                op_ret = -1; +                goto out; +        } + +        tags = lvm_vg_get_tags (brick); +        if (!tags) { /* no tags in the VG */ +                gf_log (this->name, GF_LOG_ERROR, +                        "Extended attribute trusted.glusterfs." +                        "volume-id is absent"); +                op_ret = -1; +                goto out; +        } +        dm_list_iterate_items (strl, tags) { +                if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY, +                              strlen (GF_XATTR_VOL_ID_KEY))) { +                        uuid = _gf_true; +                        break; +                } +        } +        /* UUID tag is not set in VG */ +        if (!uuid) { +                gf_log (this->name, GF_LOG_ERROR, +                        "Extended attribute trusted.glusterfs." 
+                        "volume-id is absent"); +                op_ret = -1; +                goto out; +        } + +        op_ret = uuid_parse (strl->str + strlen (GF_XATTR_VOL_ID_KEY) + 1, +                             vg_uuid); +        if (op_ret < 0) { +                        gf_log (this->name, GF_LOG_ERROR, +                                "wrong volume-id (%s) set in VG", strl->str); +                        op_ret = -1; +                        goto out; +        } +        if (uuid_compare (dict_uuid, vg_uuid)) { +                gf_log (this->name, GF_LOG_ERROR, +                        "mismatching volume-id (%s) received. " +                        "already is a part of volume %s ", +                        tmp_data->data, vg_uuid); +                op_ret = -1; +                goto out; +        } + +        op_ret = 0; + +out: +        lvm_vg_close (brick); + +        if (!thin) +                gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in " +                        "VG %s\n", priv->vg); +        else +                priv->caps |= BD_CAPS_THIN; + +        return op_ret; +} + +/* FIXME: Move this code to common place, so posix and bd xlator can use */ +char * +page_aligned_alloc (size_t size, char **aligned_buf) +{ +        char    *alloc_buf = NULL; +        char    *buf       = NULL; + +        alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char); +        if (!alloc_buf) +                return NULL; +        /* page aligned buffer */ +        buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); +        *aligned_buf = buf; + +        return alloc_buf; +} + +static int +__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p) +{ +        int         ret      = -1; +        int         _fd      = -1; +        char       *devpath  = NULL; +        bd_fd_t    *bdfd     = NULL; +        uint64_t    tmp_bdfd = 0; +        bd_priv_t  *priv     = this->private; +        bd_gfid_t   gfid     = {0, }; +        bd_attr_t  *bdatt  
  = NULL; + +        /* not bd file */ +        if (fd->inode->ia_type != IA_IFREG || +            bd_inode_ctx_get (fd->inode, this, &bdatt)) +                return 0; + +        ret = __fd_ctx_get (fd, this, &tmp_bdfd); +        if (ret == 0) { +                bdfd = (void *)(long) tmp_bdfd; +                *bdfd_p = bdfd; +                return 0; +        } + +        uuid_utoa_r (fd->inode->gfid, gfid); +        asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); +        if (!devpath) +                goto out; + +        _fd = open (devpath, O_RDWR | O_LARGEFILE, 0); +        if (_fd < 0) { +                ret = errno; +                gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, +                        strerror (ret)); +                goto out; +        } +        bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); +        BD_VALIDATE_MEM_ALLOC (bdfd, ret, out); + +        bdfd->fd = _fd; +        bdfd->flag = O_RDWR | O_LARGEFILE; +        if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "failed to set the fd context fd=%p", fd); +                goto out; +        } + +        *bdfd_p = bdfd; + +        ret = 0; +out: +        FREE (devpath); +        if (ret) { +                close (_fd); +                GF_FREE (bdfd); +        } +        return ret; +} + +int +bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd) +{ +        int   ret; + +        /* FIXME: Is it ok to fd->lock here ? */ +        LOCK (&fd->lock); +        { +                ret = __bd_fd_ctx_get (this, fd, bdfd); +        } +        UNLOCK (&fd->lock); + +        return ret; +} + +/* + * Validates if LV exists for given inode or not. + * Returns 0 if LV exists and size also matches. 
+ * If LV does not exist -1 returned + * If LV size mismatches, returnes 1 also lv_size is updated with actual + * size + */ +int +bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, +                      uint64_t *lv_size, uuid_t uuid) +{ +        char       *path  = NULL; +        int         ret   = -1; +        bd_gfid_t   gfid  = {0, }; +        bd_priv_t  *priv  = this->private; +        struct stat stbuf = {0, }; +        uint64_t    size  = 0; +        vg_t        vg    = NULL; +        lv_t        lv    = NULL; +        char     *bytes = NULL; + +        bytes = strrchr (bd, ':'); +        if (bytes) { +                *bytes = '\0'; +                bytes++; +                gf_string2bytesize (bytes, &size); +        } + +        if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) { +                gf_log (this->name, GF_LOG_WARNING, +                        "invalid xattr %s", bd); +                return -1; +        } +        *type = gf_strdup (bd); + +        /* +         * Check if LV really exist, there could be a failure +         * after setxattr and successful LV creation +         */ +        uuid_utoa_r (uuid, gfid); +        gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid); +        if (!path) { +                gf_log (this->name, GF_LOG_WARNING, +                        "insufficient memory"); +                return 0; +        } + +        /* Destination file does not exist */ +        if (stat (path, &stbuf)) { +                gf_log (this->name, GF_LOG_WARNING, +                        "lstat failed for path %s", path); +                return -1; +        } + +        vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); +        if (!vg) { +                gf_log (this->name, GF_LOG_WARNING, +                        "VG %s does not exist?", priv->vg); +                ret = -1; +                goto out; +        } + +        lv = lvm_lv_from_name (vg, gfid); +        if (!lv) { +                gf_log (this->name, 
GF_LOG_WARNING, +                        "LV %s does not exist", gfid); +                ret = -1; +                goto out; +        } + +        *lv_size = lvm_lv_get_size (lv); +        if (size == *lv_size) { +                ret = 0; +                goto out; +        } + +        ret = 1; + +out: +        if (vg) +                lvm_vg_close (vg); + +        GF_FREE (path); +        return ret; +} + +static int +create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent) +{ +        int         ret    = -1; +        runner_t    runner = {0, }; +        char       *path   = NULL; +        struct stat stat   = {0, }; + +        runinit (&runner); +        runner_add_args  (&runner, LVM_CREATE, NULL); +        runner_add_args  (&runner, "--thin", NULL); +        runner_argprintf (&runner, "%s/%s", vg, pool); +        runner_add_args  (&runner, "--name", NULL); +        runner_argprintf (&runner, "%s", lv); +        runner_add_args  (&runner, "--virtualsize", NULL); +        runner_argprintf (&runner, "%ldB", extent); +        runner_start (&runner); +        runner_end (&runner); + +        gf_asprintf (&path, "/dev/%s/%s", vg, lv); +        if (!path) { +                ret = ENOMEM; +                goto out; +        } +        if (lstat (path, &stat) < 0) +                ret = EAGAIN; +        else +                ret = 0; +out: +        GF_FREE (path); +        return ret; +} + +int +bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv) +{ +        int       ret  = 0; +        vg_t      vg   = NULL; +        bd_gfid_t gfid = {0, }; + +        uuid_utoa_r (uuid, gfid); + +        if (!strcmp (type, BD_THIN)) +                return create_thin_lv (priv->vg, priv->pool, gfid, +                                       size); + +        vg = lvm_vg_open (priv->handle, priv->vg,  "w", 0); +        if (!vg) { +                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", +                        priv->vg); +                
return ENOENT; +        } + +        if (!lvm_vg_create_lv_linear (vg, gfid, size)) { +                gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear " +                        "failed"); +                ret = errno; +        } + +        lvm_vg_close (vg); + +        return ret; +} + +int32_t +bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size) +{ +        uint64_t        new_size  = 0; +        runner_t        runner    = {0, }; +        bd_gfid_t       gfid      = {0, }; +        int             ret       = 0; +        vg_t            vg        = NULL; +        lv_t            lv        = NULL; + +        uuid_utoa_r (uuid, gfid); + +        runinit (&runner); + +        runner_add_args  (&runner, LVM_RESIZE, NULL); +        runner_argprintf (&runner, "%s/%s", priv->vg, gfid); +        runner_argprintf (&runner, "-L%ldb", size); +        runner_add_args  (&runner, "-f", NULL); + +        runner_start (&runner); +        runner_end (&runner); + +        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); +        if (!vg) { +                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", +                        priv->vg); +                return EAGAIN; +        } + +        lv = lvm_lv_from_name (vg, gfid); +        if (!lv) { +                gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found", gfid); +                ret = EIO; +                goto out; +        } +        new_size = lvm_lv_get_size (lv); + +        if (new_size != size) { +                gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %ld does " +                        "not match requested size %ld", new_size, size); +                ret = EIO; +        } + +out: +        lvm_vg_close (vg); +        return ret; +} + +uint64_t +bd_get_default_extent (bd_priv_t *priv) +{ +        vg_t   vg = NULL; +        uint64_t size = 0; + +        vg = lvm_vg_open (priv->handle, priv->vg,  "w", 0); +        if (!vg) { +                gf_log (THIS->name, 
GF_LOG_WARNING, "opening VG %s failed", +                        priv->vg); +                return 0; +        } + +        size = lvm_vg_get_extent_size (vg); + +        lvm_vg_close (vg); + +        return size; +} + +/* + * Adjusts the user specified size to VG specific extent size + */ +uint64_t +bd_adjust_size (bd_priv_t *priv, uint64_t size) +{ +        uint64_t extent = 0; +        uint64_t nr_ex  = 0; + +        extent = bd_get_default_extent (priv); +        if (!extent) +                return 0; + +        nr_ex = size / extent; +        if (size % extent) +                nr_ex++; + +        size = extent * nr_ex; + +        return size; +} + +int +bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno) +{ +        vg_t    vg  = NULL; +        lv_t    lv  = NULL; +        int     ret = -1; + +        *op_errno = 0; +        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); +        if (!vg) { +                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", +                        priv->vg); +                *op_errno = ENOENT; +                return -1; +        } +        lv = lvm_lv_from_name (vg, lv_name); +        if (!lv) { +                gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name); +                *op_errno = ENOENT; +                goto out; +        } +        ret = lvm_vg_remove_lv (lv); +        if (ret < 0) { +                gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed", +                        lv_name); +                *op_errno = errno; +                goto out; +        } +out: +        lvm_vg_close (vg); + +        return ret; +} diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c new file mode 100644 index 00000000000..5fa15c542c0 --- /dev/null +++ b/xlators/storage/bd/src/bd.c @@ -0,0 +1,2047 @@ +/* +  BD translator V2 - Exports Block devices on server side as regular +  files to client + +  Now only exporting Logical volumes supported. 
+ +  Copyright IBM, Corp. 2013 + +  This file is part of GlusterFS. + +  Author: +  M. Mohan Kumar <mohan@in.ibm.com> + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> +#include <openssl/md5.h> +#include <time.h> +#include <linux/fs.h> +#include <sys/ioctl.h> + +#include "bd.h" +#include "defaults.h" +#include "glusterfs3-xdr.h" +#include "run.h" +#include "protocol-common.h" +#include "checksum.h" + +/* + * Call back function for setxattr and removexattr. + * does not do anything. FIXME: How to handle remove/setxattr failure + */ +int +bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int op_ret, int op_errno, dict_t *xdata) +{ +        STACK_DESTROY (frame->root); +        return 0; +} + +/* + * returns 0 if a file is mapped to BD or not. 
+ */ +int +bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid, +                char **type, uint64_t *size) +{ +        char         *bd_xattr = NULL; +        char         *bd       = NULL; +        int           ret      = -1; +        loc_t         loc      = {0, }; +        dict_t       *dict     = NULL; +        char         *p        = NULL; +        call_frame_t *bd_frame = NULL; + +        if (!xattr) +                return 1; + +        if (dict_get_str (xattr, BD_XATTR, &p)) +                return 1; + +        bd_xattr = gf_strdup (p); + +        memcpy (loc.gfid, gfid, sizeof (uuid_t)); + +        bd_frame = copy_frame (frame); +        BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out); + +        ret = bd_validate_bd_xattr (this,  bd_xattr, type, size, gfid); +        if (ret < 0) {/* LV does not exist */ +                STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), +                            FIRST_CHILD (this)->fops->removexattr, &loc, +                            BD_XATTR, NULL); + +                gf_log (this->name, GF_LOG_WARNING, +                        "Mapped LV not available for posix file <gfid:%s>, " +                        "deleting mapping", uuid_utoa (gfid)); +        } else if (ret == 1) { +                /* BD_XATTR size and LV size mismatch. 
Update BD_XATTR */ +                gf_asprintf (&bd, "%s:%ld", *type, *size); + +                dict = dict_new (); +                BD_VALIDATE_MEM_ALLOC (dict, ret, out); + +                ret = dict_set_dynstr (dict, BD_XATTR, bd); +                if (ret) +                        goto out; + +                STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), +                            FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0, +                            NULL); +        } + +out: +        dict_del (xattr, BD_XATTR); +        GF_FREE (bd_xattr); +        GF_FREE (bd); +        return ret; +} + +/* + * bd_lookup_cbk: Call back from posix_lookup. + */ +int32_t +bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +               int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr, +               struct iatt *postparent) +{ +        int           ret    = -1; +        bd_attr_t    *bdatt  = NULL; +        uint64_t      size   = 0; +        char         *type   = BD_TYPE_NONE; + +        /* only regular files are part of BD object */ +        if (op_ret < 0 || buf->ia_type != IA_IFREG) +                goto out; + +        /* iatt already cached */ +        if (!bd_inode_ctx_get (inode, this, &bdatt)) +                goto next; + +        if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid, &type, &size)) +                goto out; + +        /* BD file, update buf */ +        bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); +        if (!bdatt) { +                op_errno = ENOMEM; +                goto out; +        } +        memcpy (&bdatt->iatt, buf, sizeof (struct iatt)); +        bdatt->type = type; + +        /* Cache LV size in inode_ctx */ +        ret = bd_inode_ctx_set (inode, this, bdatt); +        if (ret < 0) { +                GF_FREE (bdatt); +                op_errno = EINVAL; +                goto out; +        } + +        bdatt->iatt.ia_size = size; +        
bdatt->iatt.ia_blocks = size / 512; + +next: +        dict_del (xattr, GF_CONTENT_KEY); +        memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: +        BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, +                         xattr, postparent); +        return 0; +} + +/* + * bd_lookup: Issues posix_lookup to find out if file is mapped to BD + * bd_lookup -> posix_lookup -> bd_lookup_cbk +*/ +int32_t +bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ +        dict_t     *bd_xattr = NULL; +        bd_attr_t  *bdatt    = NULL; +        int         op_errno = EINVAL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); +        VALIDATE_OR_GOTO (loc->path, out); +        VALIDATE_OR_GOTO (this->private, out); + +        if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) { +                if (!xattr_req) { +                        bd_xattr = dict_new (); +                        BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out); +                        xattr_req = bd_xattr; +                } +                if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0) +                        goto out; +        } + +        STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this), +                    FIRST_CHILD (this)->fops->lookup, loc, xattr_req); + +        if (bd_xattr) +                dict_unref (bd_xattr); +        return 0; +out: +        BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + +        return 0; +} + +int +bd_forget (xlator_t *this, inode_t *inode) +{ +        int          ret   = -1; +        uint64_t     ctx   = 0; +        bd_attr_t   *bdatt = NULL; + +        ret = bd_inode_ctx_get (inode, this, &bdatt); +        if (!ret) { +                inode_ctx_del (inode, this, &ctx); +                FREE (bdatt); +        } +        return 0; +} + +int +bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
int op_ret, +                 int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ +        gf_dirent_t    *entry  = NULL; +        uint64_t        size   = 0; +        char           *type   = NULL; + +        if (op_ret < 0) +                goto out; + +        list_for_each_entry (entry, &entries->list, list) { +                if (entry->d_type != DT_REG) +                        continue; +                if (!bd_get_bd_info (frame, this, entry->dict, +                                     entry->d_stat.ia_gfid, &type, &size)) { +                        entry->d_stat.ia_size = size; +                        entry->d_stat.ia_blocks = size / 512; +                        FREE (type); +                } +        } + +out: +        BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata); +        return 0; +} + +/* + * bd_readdirp: In bd_readdirp_cbk if the file and BD_XATTR_SIZE is set + * ia_size is updated with the LV(BD_XATTR_SIZE) size + */ +int32_t +bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +             off_t off, dict_t *dict) +{ +        int          op_errno = EINVAL; +        bd_local_t  *local    = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (this->private, out); + +        if (!dict) { +                local = bd_local_init (frame, this); +                BD_VALIDATE_MEM_ALLOC (local, op_errno, out); +                local->dict = dict_new (); +                BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); +                dict = local->dict; +        } + +        if (dict_set_int8 (dict, BD_XATTR, 0)) { +                gf_log (this->name, GF_LOG_WARNING, +                        "failed to set key %s", BD_XATTR); +                goto out; +        } + +        STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict); + +   
     return 0; +out: +        BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict); +        return 0; +} + +int +bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +             int op_errno, struct iatt *buf, dict_t *xdata) +{ +        bd_local_t  *local = frame->local; +        bd_attr_t   *bdatt = NULL; + +        /* only regular files are part of BD object */ +        if (op_ret < 0 || buf->ia_type != IA_IFREG) +                goto out; + +        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + +        /* update buf with LV size */ +        if (!bd_inode_ctx_get (local->inode, this, &bdatt)) +                memcpy (buf, bdatt, sizeof (struct iatt)); + +out: +        BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); +        return 0; +} + +int +bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ +        int          op_errno = EINVAL; +        bd_local_t  *local    = NULL; +        bd_attr_t   *bdatt    = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); +        VALIDATE_OR_GOTO (loc->path, out); +        VALIDATE_OR_GOTO (this->private, out); + +        if (!bd_inode_ctx_get (loc->inode, this, &bdatt)) { +                BD_STACK_UNWIND (stat, frame, 0, 0, &bdatt->iatt, xdata); +                return 0; +        } + +        local = bd_local_init (frame, this); +        BD_VALIDATE_MEM_ALLOC (local, op_errno, out); +        local->inode = inode_ref (loc->inode); + +        STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this), +                   FIRST_CHILD(this)->fops->stat, loc, xdata); +        return 0; +out: +        BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata); +        return 0; +} + +int +bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +               int op_errno, struct statvfs *buff, dict_t *xdata) +{ +        uint64_t      size    = 0; +        uint64_t      
fr_size = 0; +        bd_priv_t    *priv    = NULL; +        vg_t          vg      = NULL; + +        if (op_ret < 0) +                goto out; + +        priv = this->private; + +        vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); +        if (!vg) { +                gf_log (this->name, GF_LOG_WARNING, "opening VG %s failed", +                        priv->vg); +                op_ret = -1; +                op_errno = EAGAIN; +                goto out; +        } +        size = lvm_vg_get_size (vg); +        fr_size = lvm_vg_get_free_size (vg); +        lvm_vg_close (vg); + +        buff->f_blocks += size / buff->f_frsize; +        buff->f_bfree += fr_size / buff->f_frsize; +        buff->f_bavail += fr_size / buff->f_frsize; + +out: +        BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata); +        return 0; +} + +/* + * bd_statfs: Mimics statfs by returning used/free extents in the VG + */ +int +bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (loc, out); + +        STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->statfs, loc, xdata); +        return 0; +out: +        BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL); +        return 0; +} + +int +bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +              int op_errno, struct iatt *buf, dict_t *xdata) +{ +        bd_attr_t  *bdatt = NULL; +        bd_local_t *local = frame->local; + +        /* only regular files are part of BD object */ +        if (op_ret < 0 || buf->ia_type != IA_IFREG) +                goto out; + +        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + +        /* update buf with LV size */ +        if (!bd_inode_ctx_get (local->inode, this, &bdatt)) +                memcpy (buf, &bdatt->iatt, 
sizeof (struct iatt)); + +out: +        BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); +        return 0; +} + +int +bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ +        int          op_errno = EINVAL; +        bd_local_t  *local    = NULL; +        bd_attr_t   *bdatt    = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (this->private, out); + +        /* if its already cached return it */ +        if (!bd_inode_ctx_get (fd->inode, this, &bdatt)) { +                BD_STACK_UNWIND (fstat, frame, 0, 0, &bdatt->iatt, xdata); +                return 0; +        } + +        local = bd_local_init (frame, this); +        BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + +        local->inode = inode_ref (fd->inode); + +        STACK_WIND (frame, bd_fstat_cbk, FIRST_CHILD(this), +                   FIRST_CHILD(this)->fops->fstat, fd, xdata); + +        return 0; +out: +        BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata); +        return 0; +} + +static inline void +bd_update_amtime (struct iatt *iatt, int flag) +{ +        struct timespec ts         = {0, }; + +        clock_gettime (CLOCK_REALTIME, &ts); +        if (flag & GF_SET_ATTR_ATIME) { +                iatt->ia_atime = ts.tv_sec; +                iatt->ia_atime_nsec = ts.tv_nsec; +        } +        if (flag & GF_SET_ATTR_MTIME) { +                iatt->ia_mtime = ts.tv_sec; +                iatt->ia_mtime_nsec = ts.tv_nsec; +        } +} + +/* + * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD + * file + */ +int +bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, +          off_t offset, uint32_t flags, dict_t *xdata) +{ +        int             ret        = -1; +        int             _fd        = -1; +        int32_t         op_ret     = -1; +        int32_t         op_errno   = 0; +        bd_fd_t        
*bd_fd      = NULL; +        struct iovec    vec        = {0, }; +        struct iobuf   *iobuf      = NULL; +        struct iobref  *iobref     = NULL; +        uint64_t        bd_size    = 0; +        bd_attr_t      *bdatt      = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (this->private, out); + +        ret = bd_fd_ctx_get (this, fd, &bd_fd); +        if (ret < 0 || !bd_fd) { +                STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->readv, +                            fd, size, offset, flags, xdata); +                return 0; +        } +        if (!size) { +                op_errno = EINVAL; +                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); +                goto out; +        } +        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); +        if (!iobuf) { +                op_errno = ENOMEM; +                goto out; +        } +        _fd = bd_fd->fd; +        op_ret = pread (_fd, iobuf->ptr, size, offset); +        if (op_ret == -1) { +                op_errno = errno; +                gf_log (this->name, GF_LOG_ERROR, +                                "read failed on fd=%p: %s", fd, +                                strerror (op_errno)); +                goto out; +        } + +        vec.iov_base = iobuf->ptr; +        vec.iov_len = op_ret; + +        iobref = iobref_new (); +        iobref_add (iobref, iobuf); + +        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { +                op_errno = EINVAL; +                op_ret = -1; +                goto out; +        } +        bd_size = bdatt->iatt.ia_size; +        if (!bd_size || (offset + vec.iov_len) >= bd_size) +                op_errno = ENOENT; + +        op_ret = vec.iov_len; +        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME); + +out: +        BD_STACK_UNWIND (readv, frame, 
op_ret, op_errno, +                         &vec, 1, &bdatt->iatt, iobref, NULL); + +        if (iobref) +                iobref_unref (iobref); +        if (iobuf) +                iobuf_unref (iobuf); + +        return 0; +} + +#ifdef BLKDISCARD +/* + * bd_discard: Sends BLKDISCARD ioctl to the block device + */ +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +            size_t len, dict_t *xdata) +{ +        int           ret      = -1; +        int           op_errno = EINVAL; +        bd_fd_t      *bd_fd    = NULL; +        uint64_t      param[2] = {0, }; +        bd_attr_t    *bdatt    = NULL; +        struct iatt   prebuf   = {0, }; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (fd, out); + +        /* posix */ +        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { +                STACK_WIND (frame, default_discard_cbk, FIRST_CHILD(this), +                           FIRST_CHILD(this)->fops->discard, +                           fd, offset, len, xdata); +                return 0; +        } + +        ret = bd_fd_ctx_get (this, fd, &bd_fd); +        if (ret < 0 || !bd_fd) { +                op_errno = EINVAL; +                goto out; +        } + +        param[0] = offset; +        param[1] = len; +        ret = ioctl (bd_fd->fd, BLKDISCARD, param); +        if (ret < 0) { +                if (errno == ENOTTY) +                        op_errno = ENOSYS; +                else +                        op_errno = errno; +                goto out; +        } +        memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf)); +        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + +        BD_STACK_UNWIND (discard, frame, ret, op_errno, &prebuf, +                         &bdatt->iatt, xdata); +        return 0; + +out: +        BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); +        return 0; +} +#else 
+ +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +            size_t len, dict_t *xdata) +{ +        BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL); +        return 0; +} +#endif + +/* + * Call back from posix_open for opening the backing posix file + * If it failed, close BD fd + */ +int +bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +             int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ +        bd_fd_t    *bd_fd = NULL; +        bd_attr_t  *bdatt = NULL; + +        if (!op_ret) +                goto out; + +        bd_inode_ctx_get (fd->inode, this, &bdatt); +        if (!bdatt) /* posix file */ +                goto out; + +        /* posix open failed */ +        if (bd_fd_ctx_get (this, fd, &bd_fd) < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "bd_fd is NULL from fd=%p", fd); +                goto out; +        } +        close (bd_fd->fd); +        GF_FREE (bd_fd); + +out: +        BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL); + +        return 0; +} + +/* + * bd_open: Opens BD file if given posix file is mapped to BD. Also opens + * posix file. 
+ * fd contains both posix and BD fd + */ +int32_t +bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, +         fd_t *fd, dict_t *xdata) +{ +        int32_t      ret     = EINVAL; +        bd_fd_t     *bd_fd   = NULL; +        bd_attr_t   *bdatt   = NULL; +        bd_gfid_t    gfid    = {0, }; +        char        *devpath = NULL; +        bd_priv_t   *priv    = this->private; +        int         _fd      = -1; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (loc, out); +        VALIDATE_OR_GOTO (fd, out); + +        /* not bd file */ +        if (fd->inode->ia_type != IA_IFREG || +            bd_inode_ctx_get (fd->inode, this, &bdatt)) +                goto posix; + +        uuid_utoa_r (fd->inode->gfid, gfid); +        asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); +        BD_VALIDATE_MEM_ALLOC (devpath, ret, out); + +        _fd = open (devpath, flags | O_LARGEFILE, 0); +        if (_fd < 0) { +                ret = errno; +                gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, +                        strerror (ret)); +                goto out; +        } +        bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); +        BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out); + +        bd_fd->fd = _fd; +        bd_fd->flag = flags | O_LARGEFILE; +        if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "failed to set the fd context fd=%p", fd); +                goto out; +        } + +        ret = 0; + +posix: + +        /* open posix equivalant of this file, fd needed for fd related +           operations like fsetxattr, ftruncate etc */ +        STACK_WIND (frame, bd_open_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + +        return 0; +out: +        BD_STACK_UNWIND (open, 
frame, -1, ret, fd, NULL); + +        FREE (devpath); +        if (ret) { +                close (_fd); +                GF_FREE (bd_fd); +        } + +        return 0; +} + +/* + * call back from posix_setattr after updating iatt to posix file. + */ +int +bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int op_ret, int op_errno, struct iatt *pre, +                      struct iatt *post, dict_t *xdata) +{ +        bd_local_t *local = frame->local; +        bd_attr_t  *bdatt = local->bdatt; + +        BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &bdatt->iatt, +                         &bdatt->iatt, NULL); +        return 0; +} + +int +bd_do_fsync (int fd, int datasync) +{ +        int   op_errno = 0; + +#ifdef HAVE_FDATASYNC +        if (datasync) { +                if (fdatasync (fd)) { +                        op_errno = errno; +                        gf_log (THIS->name, GF_LOG_ERROR, +                                "fdatasync on fd=%d failed: %s", +                                fd, strerror (errno)); +                } + +        } else +#endif +        { +                if (fsync (fd)) { +                        op_errno = errno; +                        gf_log (THIS->name, GF_LOG_ERROR, +                                "fsync on fd=%d failed: %s", +                                 fd, strerror (op_errno)); +                } +        } + +        return op_errno; +} + +/* + * bd_fsync: Syncs if BD fd, forwards the request to posix + * fsync -> posix_setattr -> posix_fsync +*/ +int32_t +bd_fsync (call_frame_t *frame, xlator_t *this, +          fd_t *fd, int32_t datasync, dict_t *xdata) +{ +        int         ret      = -1; +        int32_t     op_ret   = -1; +        int32_t     op_errno = 0; +        bd_fd_t    *bd_fd    = NULL; +        bd_priv_t  *priv     = NULL; +        bd_attr_t  *bdatt    = NULL; +        bd_local_t *local    = NULL; +        int         valid    = GF_SET_ATTR_ATIME | 
GF_SET_ATTR_MTIME; +        struct iatt prebuf   = {0, }; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        ret = bd_inode_ctx_get (fd->inode, this, &bdatt); +        ret = bd_fd_ctx_get (this, fd, &bd_fd); +        if (ret < 0 || !bd_fd || !bdatt) { +                STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this), +                            FIRST_CHILD (this)->fops->fsync, fd, datasync, +                            xdata); +                return 0; +        } + +        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + +        op_errno = bd_do_fsync (bd_fd->fd, datasync); +        if (op_errno) +                goto out; + +        /* For BD, Update the a|mtime during full fsync only */ +        if (!datasync) { +                local = bd_local_init (frame, this); +                /* In case of mem failure, should posix flush called ? 
*/ +                BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + +                local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); +                BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + +                local->bdatt->type = gf_strdup (bdatt->type); +                memcpy (&local->bdatt->iatt, &bdatt->iatt, sizeof (struct iatt)); +                bd_update_amtime (&local->bdatt->iatt, valid); +                uuid_copy (local->loc.gfid, fd->inode->gfid); +                STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this), +                            FIRST_CHILD (this)->fops->setattr, &local->loc, +                            &local->bdatt->iatt, +                            valid, NULL); +                return 0; +        } + +out: +        BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf, +                         &bdatt->iatt, NULL); +        return 0; +} + +int +bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int op_ret, int op_errno, struct iatt *pre, +                      struct iatt *post, dict_t *xdata) +{ +        BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); +        return 0; +} + +int +bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ +        int          ret    = -1; +        bd_fd_t     *bd_fd  = NULL; +        bd_priv_t   *priv   = NULL; +        bd_attr_t   *bdatt  = NULL; +        int          valid    = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; +        bd_local_t  *local    = NULL; +        int          op_errno = EINVAL; +        loc_t        loc      = {0, }; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (this->private, out); + +        priv = this->private; + +        ret = bd_inode_ctx_get (fd->inode, this, &bdatt); +        if (!bdatt) +                goto out; + +        ret = bd_fd_ctx_get (this, fd, &bd_fd); + 
       if (ret < 0 || !bd_fd || !bdatt) { +                gf_log (this->name, GF_LOG_WARNING, +                        "bdfd/bdatt is NULL from fd=%p", fd); +                goto out; +        } + +        local = bd_local_init (frame, this); +        BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + +        local->fd = fd_ref (fd); +        uuid_copy (loc.gfid, bdatt->iatt.ia_gfid); + +        /* Update the a|mtime during flush */ +        STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this), +                    FIRST_CHILD (this)->fops->setattr, &loc, &bdatt->iatt, +                    valid, NULL); + +        return 0; + +out: +        STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this), +                    FIRST_CHILD (this)->fops->flush, fd, xdata); + +        return 0; +} + +int32_t +bd_release (xlator_t *this, fd_t *fd) +{ +        int          ret      = -1; +        bd_fd_t     *bd_fd    = NULL; +        uint64_t     tmp_bfd  = 0; +        bd_attr_t   *bdatt    = NULL; +        bd_priv_t   *priv     = this->private; + +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (priv, out); + +        ret = bd_inode_ctx_get (fd->inode, this, &bdatt); +        if (ret || !bdatt) /* posix file */ +                goto out; + +        /* FIXME: Update amtime during release */ + +        ret = fd_ctx_del (fd, this, &tmp_bfd); +        if (ret < 0) { +                gf_log (this->name, GF_LOG_WARNING, +                        "bfd is NULL from fd=%p", fd); +                goto out; +        } +        bd_fd = (bd_fd_t *)(long)tmp_bfd; + +        close (bd_fd->fd); +        GF_FREE (bd_fd); +out: +        return 0; +} + +/* + * Call back for removexattr after removing BD_XATTR incase of + * bd create failure + */ +int +bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                         int op_ret, int op_errno, dict_t *xdata) +{ +        bd_local_t *local = frame->local; + +  
      if (local->fd) +                BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); +        else +                BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); +        return 0; + +} + +/* + * Call back after setting BD_XATTR. Creates BD. If BD creation is a failure + * invokes posix_removexattr to remove created BD_XATTR + */ +int +bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                  int op_ret, int op_errno, dict_t *xdata) +{ +        bd_local_t *local = frame->local; +        bd_attr_t  *bdatt = NULL; + +        if (op_ret < 0) +                goto next; + +        /* Create LV */ +        op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size, +                              local->bdatt->type, this->private); +        if (!op_errno) +                goto out; + +        /* LV creation failed, remove BD_XATTR */ +        if (local->fd) +                STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fremovexattr, +                            local->fd, BD_XATTR, NULL); +        else +                STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->removexattr, +                            &local->loc, BD_XATTR, NULL); + +        return 0; +out: + +        bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); +        if (!bdatt) { +                op_ret = -1; +                op_errno = ENOMEM; +                goto next; +        } + +        memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt)); +        bdatt->type = gf_strdup (local->bdatt->type); + +        bd_inode_ctx_set (local->inode, THIS, bdatt); + +next: +        if (local->fd) +                BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); +        else +                BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); +        return 0; + +} + +/* + * Call back from 
posix_stat + */ +int +bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                  int op_ret, int op_errno, struct iatt *iatt, +                  dict_t *xdata) +{ +        char       *param  = NULL; +        char       *type   = NULL; +        char       *s_size = NULL; +        char       *p      = NULL; +        char       *copy   = NULL; +        bd_local_t *local  = frame->local; +        bd_priv_t  *priv   = this->private; +        char       *bd     = NULL; +        uint64_t    size   = 0; + +        if (op_ret < 0) +                goto out; + +        if (!IA_ISREG (iatt->ia_type)) { +                op_errno = EOPNOTSUPP; +                goto out; +        } + +        param = copy = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char); +        BD_VALIDATE_MEM_ALLOC (param, op_errno, out); + +        strncpy (param, local->data->data, local->data->len); + +        type = strtok_r (param, ":", &p); +        if (!type) { +                op_errno = EINVAL; +                goto out; +        } + +        if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) { +                gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given", +                        type); +                op_errno = EINVAL; +                goto out; +        } + +        s_size = strtok_r (NULL, ":", &p); + +        /* If size not specified get default size */ +        if (!s_size) +                size = bd_get_default_extent (priv); +        else +                gf_string2bytesize (s_size, &size); + +        gf_asprintf (&bd, "%s:%ld", type, size); +        BD_VALIDATE_MEM_ALLOC (bd, op_errno, out); + +        local->dict = dict_new (); +        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + +        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); +        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + +        if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) { +                op_errno = EINVAL; +             
   goto out; +        } + +        local->bdatt->type = gf_strdup (type); +        memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt)); +        local->bdatt->iatt.ia_size = size; + +        if (local->fd) +                STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fsetxattr, +                            local->fd, local->dict, 0, NULL); +        else +                STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->setxattr, +                            &local->loc, local->dict, 0, NULL); + +        return 0; + +out: +        if (local->fd) +                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata); +        else +                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata); + +        GF_FREE (bd); +        GF_FREE (copy); +        return 0; +} + + +/* + * bd_setxattr: Used to create & map an LV to a posix file using + * BD_XATTR xattr + * bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + */ +int32_t +bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, +             int flags, dict_t *xdata) +{ +        int           op_errno  = 0; +        data_t       *data      = NULL; +        bd_attr_t    *bdatt     = NULL; +        bd_local_t   *local     = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (loc, out); + +        bd_inode_ctx_get (loc->inode, this, &bdatt); + +        data =  dict_get (dict, BD_XATTR); +        if (!data) { +                /* non bd file object */ +                STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD(this), +                                   FIRST_CHILD(this)->fops->setxattr, +                                   loc, 
dict, flags, xdata); +                return 0; +        } + +        if (bdatt) { +                gf_log (this->name, GF_LOG_WARNING, +                        "%s already mapped to BD", loc->path); +                op_errno = EEXIST; +                goto out; +        } +        local = bd_local_init (frame, this); +        BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + +        local->inode = inode_ref (loc->inode); +        loc_copy (&local->loc, loc); +        local->data = data; + +        STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD(this), +                   FIRST_CHILD(this)->fops->stat, loc, xdata); + +        return 0; + +out: +        BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata); +        return 0; +} + +/* + * bd_fsetxattr: Used to create/map an LV to a posix file using + * BD_XATTR xattr + * bd_fsetxattr ->  posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + * -> bd_fsetxattr_cbk + */ +int32_t +bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, +              int flags, dict_t *xdata) +{ +        int       op_errno = 0; +        data_t   *data     = NULL; +        bd_attr_t *bdatt   = NULL; +        bd_local_t *local  = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (this->private, out); +        VALIDATE_OR_GOTO (fd, out); + +        bd_inode_ctx_get (fd->inode, this, &bdatt); + +        data =  dict_get (dict, BD_XATTR); +        if (data) { +                if (bdatt) { +                        gf_log (this->name, GF_LOG_WARNING, +                                "fd %p already mapped to BD", fd); +                        op_errno = EEXIST; +                        goto out; +                } +                local = bd_local_init (frame, this); +                BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + +                local->inode = 
inode_ref (fd->inode); +                local->fd = fd_ref (fd); +                local->data = data; + +                STACK_WIND(frame, bd_setx_stat_cbk, FIRST_CHILD(this), +                           FIRST_CHILD(this)->fops->fstat, fd, xdata); +        } else { +                /* non bd file object */ +                STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fsetxattr, +                            fd, dict, flags, xdata); +        } + +        return 0; +out: + +        BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + +        return 0; +} + +int32_t +bd_removexattr (call_frame_t *frame, xlator_t *this, +                   loc_t *loc, const char *name, dict_t *xdata) +{ +        if (!strcmp (name, BD_XATTR)) +            goto out; + +        STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); +        return 0; +out: +        BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL); +        return 0; +} + +int32_t +bd_fremovexattr (call_frame_t *frame, xlator_t *this, +                    fd_t *fd, const char *name, dict_t *xdata) +{ +        if (!strcmp (name, BD_XATTR)) +            goto out; + +        STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + +        return 0; +out: +        BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL); +        return 0; +} + +int +bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                        int op_ret, int op_errno, dict_t *xdata) +{ +        bd_local_t *local = frame->local; + +        if (local->fd) +                BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); +        else +                BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + +        return 0; +} + +/* + * Call back 
for setxattr after setting BD_XATTR_SIZE. + */ +int +bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                       int op_ret, int op_errno, dict_t *xdata) +{ +        bd_local_t *local = frame->local; +        bd_attr_t   *bdatt = NULL; +        struct iatt prebuf = {0, }; +        char         *bd   = NULL; + +        if (op_ret < 0) +                goto out; + +        bd_inode_ctx_get (local->inode, this, &bdatt); +        if (!bdatt) +                goto revert_xattr; + +        op_errno = bd_resize (this->private, local->inode->gfid, +                              local->bdatt->iatt.ia_size); +        if (op_errno) +                goto revert_xattr; + +        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); +        /* LV resized, update new size in the cache */ +        bdatt->iatt.ia_size = local->bdatt->iatt.ia_size; + +        if (local->fd) +                BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt, +                                 NULL); +        else +                BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt, +                                 NULL); + +        return 0; + +revert_xattr: +        /* revert setxattr */ +        op_ret = dict_get_str (local->dict, BD_XATTR, &bd); +        GF_FREE (bd); +        gf_asprintf (&bd, "%s:%ld", bdatt->type, bdatt->iatt.ia_size); + +        if (local->fd) +                STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fsetxattr, +                            local->fd, local->dict, 0, NULL); +        else +                STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->setxattr, +                            &local->loc, local->dict, 0, NULL); + +        return 0; +out: +        if (local->fd) +                BD_STACK_UNWIND 
(ftruncate, frame, -1, EIO, NULL, NULL, NULL); +        else +                BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + +        return 0; +} + +/* + * call back from posix_[f]truncate_stat + * If offset > LV size, it resizes the LV and calls posix_setxattr + * to update new LV size in xattr else calls posix_setattr for updating + * the posix file so that truncate fop behaves properly + */ +int +bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                   int op_ret, int op_errno, struct iatt *buf, dict_t *xdata) +{ +        char       *bd    = NULL; +        bd_local_t *local = frame->local; +        bd_attr_t  *bdatt = NULL; + +        if (op_ret < 0) +                goto out; + +        local->dict  = dict_new (); +        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + +        bd_inode_ctx_get (local->inode, this, &bdatt); +        if (!bdatt) { +                op_errno = EINVAL; +                goto out; +        } + +        gf_asprintf (&bd, "%s:%ld", bdatt->type, local->bdatt->iatt.ia_size); +        if (dict_set_dynstr (local->dict, BD_XATTR, bd)) { +                op_errno = EINVAL; +                goto out; +        } + +        if (local->fd) +                STACK_WIND (frame, bd_trunc_setxattr_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fsetxattr, +                            local->fd, local->dict, 0, NULL); +        else +                STACK_WIND (frame, bd_trunc_setxattr_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->setxattr, +                            &local->loc, local->dict, 0, NULL); + +        return 0; +out: +        if (local->fd) +                BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, +                                 NULL); +        else +                BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, +                 
                NULL); +        GF_FREE (bd); +        return 0; +} + +void +bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc, +             off_t offset, bd_attr_t *bdatt) +{ +        bd_local_t *local    = NULL; +        struct iatt prebuf   = {0, }; +        int         op_errno = 0; +        int         op_ret   = -1; + +        /* If requested size is less than LV size, return success */ +        if (offset <= bdatt->iatt.ia_size) { +                memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); +                bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); +                op_ret = 0; +                goto out; +        } + +        local = bd_local_init (frame, this); +        BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + +        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); +        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + +        if (fd) { +                local->inode = inode_ref (fd->inode); +                local->fd = fd_ref (fd); +        } else { +                local->inode = inode_ref (loc->inode); +                loc_copy (&local->loc, loc); +        } + +        local->bdatt->iatt.ia_size = +                bd_adjust_size (this->private, offset); + +        STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this), +                   FIRST_CHILD(this)->fops->fstat, fd, NULL); + +        return; + +out: +        if (fd) +                BD_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, +                                 &prebuf, &bdatt->iatt, NULL); +        else +                BD_STACK_UNWIND (truncate, frame, op_ret, op_errno, +                                 &prebuf, &bdatt->iatt, NULL); +        return; +} + +/* + * bd_ftruncate: Resizes a LV if fd belongs to BD. 
+ */ +int32_t +bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +              dict_t *xdata) +{ +        int         op_errno = 0; +        bd_attr_t  *bdatt    = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { +                STACK_WIND (frame, default_ftruncate_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->ftruncate, fd, +                            offset, xdata); +                return 0; +        } + +        bd_do_trunc (frame, this, fd, NULL, offset, bdatt); +        return 0; +out: +        BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); +        return 0; +} + +/* + * bd_truncate: Resizes a LV if file maps to LV. + */ +int32_t +bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, +             dict_t *xdata) +{ +        int         op_errno = 0; +        bd_attr_t  *bdatt    = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { +                STACK_WIND (frame, default_truncate_cbk, +                            FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->truncate, loc, +                            offset, xdata); +                return 0; +        } + +        bd_do_trunc (frame, this, NULL, loc, offset, bdatt); +        return 0; + +out: +        BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); +        return 0; +} + +int32_t +__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset, +              uint64_t bd_size) +{ +        int        index           = 0; +        int        retval          = 0; +        off_t      internal_offset = 0; + +        if (!vector) +                return -EFAULT; + 
+        retval = pwritev (fd, vector, count, offset); +        if (retval == -1) { +                gf_log (THIS->name, GF_LOG_WARNING, +                        "base %p, length %ld, offset %ld, message %s", +                        vector[index].iov_base, vector[index].iov_len, +                        internal_offset, strerror (errno)); +                retval = -errno; +                goto err; +        } +/* + + +        internal_offset = offset; +        for (index = 0; index < count; index++) { +                if (internal_offset > bd_size) { +                        op_ret = -ENOSPC; +                        goto err; +                } +                if (internal_offset + vector[index].iov_len > bd_size) { +                        vector[index].iov_len = bd_size - internal_offset; +                        no_space = 1; +                } +                retval = pwritev (fd, vector[index].iov_base, +                                vector[index].iov_len, internal_offset); +                if (retval == -1) { +                        gf_log (THIS->name, GF_LOG_WARNING, +                                "base %p, length %ld, offset %ld, message %s", +                                vector[index].iov_base, vector[index].iov_len, +                                internal_offset, strerror (errno)); +                        op_ret = -errno; +                        goto err; +                } +                op_ret += retval; +                internal_offset += retval; +                if (no_space) +                        break; +        } +*/ +err: +        return retval; +} + +/* + * bd_writev: Writes to LV if its BD file or forwards the request to posix_write + * bd_writev -> posix_writev -> bd_writev_cbk + */ +int +bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, +           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, +           dict_t *xdict) +{ +        int32_t         op_ret    = -1; +        
int32_t         op_errno  = 0; +        int             _fd       = -1; +        bd_fd_t         *bd_fd    = NULL; +        int             ret       = -1; +        uint64_t        size      = 0; +        struct iatt     prebuf    = {0, }; +        bd_attr_t      *bdatt     = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); +        VALIDATE_OR_GOTO (vector, out); + +        ret = bd_fd_ctx_get (this, fd, &bd_fd); +        if (ret < 0 || !bd_fd) { /* posix fd */ +                STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->writev, fd, vector, count, +                            offset, flags, iobref, xdict); +                return 0; +        } + +        _fd = bd_fd->fd; + +        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { +                op_ret = -1; +                op_errno = EINVAL; +                goto out; +        } +        size = bdatt->iatt.ia_size; + +        op_ret = __bd_pwritev (_fd, vector, count, offset, size); +        if (op_ret < 0) { +                op_errno = -op_ret; +                op_ret = -1; +                gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64 +                                ", %s", offset, strerror (op_errno)); +                goto out; +        } + +        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); +        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); +out: + +        BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf, +                         &bdatt->iatt, NULL); +        return 0; +} + +int +bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +                int op_errno, struct iatt *prebuf, struct iatt *postbuf, +                dict_t *xdata) +{ +        bd_attr_t   *bdatt = NULL; +        int         *valid = cookie; +        bd_local_t  *local = frame->local; + +        if (op_ret < 0 || 
!valid || !local) +                goto out; + +        if (bd_inode_ctx_get (local->inode, this, &bdatt)) +                goto out; + +        if (*valid & GF_SET_ATTR_UID) +                bdatt->iatt.ia_uid = postbuf->ia_uid; +        else if (*valid & GF_SET_ATTR_GID) +                bdatt->iatt.ia_gid = postbuf->ia_gid; +        else if (*valid & GF_SET_ATTR_MODE) { +                bdatt->iatt.ia_type = postbuf->ia_type; +                bdatt->iatt.ia_prot = postbuf->ia_prot; +        } else if (*valid & GF_SET_ATTR_ATIME) { +                bdatt->iatt.ia_atime = postbuf->ia_atime; +                bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec; +        } else if (*valid & GF_SET_ATTR_MTIME) { +                bdatt->iatt.ia_mtime = postbuf->ia_mtime; +                bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec; +        } + +        bdatt->iatt.ia_ctime = postbuf->ia_ctime; +        bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec; + +        memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt)); +out: +        FREE (valid); +        BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, +                         postbuf, xdata); +        return 0; +} + +int +bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, +            int32_t valid, dict_t *xdata) +{ +        bd_local_t *local     = NULL; +        bd_attr_t  *bdatt     = NULL; +        int        *ck_valid  = NULL; +        int         op_errno  = 0; + +        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { +                STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), +                           FIRST_CHILD(this)->fops->setattr, +                           loc, stbuf, valid, xdata); +                return 0; +        } + +        local = bd_local_init (frame, this); +        BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + +        ck_valid = CALLOC (1, sizeof (valid)); +        BD_VALIDATE_MEM_ALLOC (ck_valid, op_errno, out); + +        
local->inode = inode_ref (loc->inode); +        *ck_valid = valid; + +        STACK_WIND_COOKIE (frame, bd_setattr_cbk, ck_valid, FIRST_CHILD(this), +                           FIRST_CHILD(this)->fops->setattr, +                           loc, stbuf, valid, xdata); + +        return 0; +out: +        BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata); +        return 0; +} + +int +bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +             int op_ret, int op_errno, inode_t *inode, struct iatt *buf, +             struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ +        bd_attr_t *bdatt = NULL; + +        if (op_ret < 0) +                goto out; + +        if (bd_inode_ctx_get (inode, this, &bdatt)) +                goto out; + +        bdatt->iatt.ia_ctime = buf->ia_ctime; +        bdatt->iatt.ia_ctime_nsec = buf->ia_ctime_nsec; +        bdatt->iatt.ia_nlink = buf->ia_nlink; +        memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: +        BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, +                         preparent, postparent, NULL); +        return 0; +} + +int +bd_link (call_frame_t *frame, xlator_t *this, +         loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ +        STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); +        return 0; +} + +int +bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc, +                          fd_t *fd, const char *name, dict_t *xdata) +{ +        dict_t *xattr   = NULL; +        int op_ret      = -1; +        int op_errno    = ENOMEM;; +        bd_priv_t *priv = this->private; + +        xattr = dict_new (); +        if (!xattr) +                goto out; + +        if (!strcmp (name, VOL_TYPE)) +                op_ret = dict_set_int64 (xattr, (char *)name, 1); +        else +                op_ret = dict_set_int64 (xattr, (char *)name, priv->caps); 
+ +out: +        if (loc) +                BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, +                                 xdata); +        else +                BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, +                                 xdata); + +        op_ret = dict_reset (xattr); +        dict_unref (xattr); + +        return 0; +} + +int +bd_fgetxattr (call_frame_t *frame, xlator_t *this, +              fd_t *fd, const char *name, dict_t *xdata) +{ +        if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS))) +                bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata); +        else +                STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->fgetxattr, +                            fd, name, xdata); +        return 0; +} + +int +bd_getxattr (call_frame_t *frame, xlator_t *this, +             loc_t *loc, const char *name, dict_t *xdata) +{ +        if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS))) +                bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata); +        else +                STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->getxattr, +                            loc, name, xdata); + +        return 0; +} + +int +bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +                      int op_ret, int op_errno, inode_t *inode, +                      struct iatt *buf, dict_t *xattr, +                      struct iatt *postparent) +{ +        bd_gfid_t     gfid  = {0, }; +        bd_local_t   *local = frame->local; + +        if (buf->ia_nlink > 1) +                goto posix; + +        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + +        uuid_utoa_r (inode->gfid, gfid); +        if (bd_delete_lv (this->private, gfid, &op_errno) < 0) { +                if (op_errno != ENOENT) +          
              goto out; +        } + +posix: +        /* remove posix */ +        STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->unlink, +                    &local->loc, 0, NULL); + +        return 0; +out: +        BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); +        return 0; +} + +int +bd_unlink (call_frame_t *frame, xlator_t *this, +           loc_t *loc, int xflag, dict_t *xdata) +{ +        int          op_errno = 0; +        bd_attr_t   *bdatt     = NULL; +        bd_local_t  *local     = NULL; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (loc, out); + +        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { +                STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), +                            FIRST_CHILD(this)->fops->unlink, +                            loc, xflag, xdata); +                return 0; +        } + +        local = bd_local_init (frame, this); +        BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + +        loc_copy (&local->loc, loc); + +        STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this), +                    FIRST_CHILD(this)->fops->lookup, loc, NULL); +        return 0; +out: +        BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); +        return 0; +} + +int32_t +bd_priv (xlator_t *this) +{ +        return 0; +} + +int32_t +bd_inode (xlator_t *this) +{ +        return 0; +} + +int32_t +bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +              int32_t len, dict_t *xdata) +{ +        int             op_ret          = -1; +        int             op_errno        = 0; +        int             ret             = 0; +        int             _fd             = -1; +        char           *alloc_buf       = NULL; +        char           *buf             = NULL; +        int32_t         weak_checksum   = 0; +        bd_fd_t    
    *bd_fd           = NULL; +        unsigned char   strong_checksum[MD5_DIGEST_LENGTH] = {0}; + +        VALIDATE_OR_GOTO (frame, out); +        VALIDATE_OR_GOTO (this, out); +        VALIDATE_OR_GOTO (fd, out); + +        ret = bd_fd_ctx_get (this, fd, &bd_fd); +        if (ret < 0 || !bd_fd) { +                STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this), +                            FIRST_CHILD (this)->fops->rchecksum, fd, offset, +                            len, xdata); +                return 0; +        } + +        memset (strong_checksum, 0, MD5_DIGEST_LENGTH); + +        alloc_buf = page_aligned_alloc (len, &buf); +        if (!alloc_buf) { +                op_errno = ENOMEM; +                goto out; +        } + +        _fd = bd_fd->fd; + +        LOCK (&fd->lock); +        { +                ret = pread (_fd, buf, len, offset); +                if (ret < 0) { +                        gf_log (this->name, GF_LOG_WARNING, +                                "pread of %d bytes returned %d (%s)", +                                len, ret, strerror (errno)); +                        op_errno = errno; +                } +        } +        UNLOCK (&fd->lock); + +        if (ret < 0) +                goto out; + +        weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, +                                                (size_t) len); +        gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, +                                  (unsigned char *) strong_checksum); + +        op_ret = 0; +out: +        BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno, +                         weak_checksum, strong_checksum, NULL); + +        GF_FREE (alloc_buf); + +        return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, +        int32_t event, +        void *data, +        ...) 
+{ +        switch (event) +        { +        case GF_EVENT_PARENT_UP: +        { +                /* Tell the parent that bd xlator is up */ +                default_notify (this, GF_EVENT_CHILD_UP, data); +        } +        break; +        default: +                break; +        } +        return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ +        int     ret = -1; + +        if (!this) +                return ret; + +        ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1); + +        if (ret != 0) +                gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" +                       "failed"); +        return ret; +} + +/** + * bd xlator init - Validate configured VG + */ +int +init (xlator_t *this) +{ +        int         ret         = 0; +        char       *vg_data     = NULL; +        char       *device      = NULL; +        bd_priv_t  *_private    = NULL; + +        if (!this->children) { +                gf_log (this->name, GF_LOG_CRITICAL, +                        "FATAL: storage/bd needs posix as subvolume"); +                return -1; +        } + +        if (!this->parents) { +                gf_log (this->name, GF_LOG_WARNING, +                        "Volume is dangling. 
Please check the volume file."); +        } + +        GF_OPTION_INIT ("export", vg_data, str, error); +        GF_OPTION_INIT ("device", device, str, error); + +        /* Now we support only LV device */ +        if (strcasecmp (device, BACKEND_VG)) { +                gf_log (this->name, GF_LOG_CRITICAL, +                        "FATAL: unknown %s backend %s", BD_XLATOR, device); +                return -1; +        } + +        this->local_pool = mem_pool_new (bd_local_t, 64); +        if (!this->local_pool) { +                gf_log (this->name, GF_LOG_CRITICAL, +                        "FATAL: Failed to create bd memory pool"); +                return -1; +        } + +        ret = 0; +        _private = GF_CALLOC (1, sizeof (*_private), gf_bd_private); +        if (!_private) +                goto error; + +        this->private = _private; +        _private->vg = gf_strdup (vg_data); +        if (!_private->vg) +                goto error; + +        _private->handle = lvm_init (NULL); +        if (!_private->handle) { +                gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed"); +                goto error; +        } +        _private->caps = BD_CAPS_BD; +        if (bd_scan_vg (this, _private)) +                goto error; + +        return 0; +error: +        GF_FREE (_private->vg); +        if (_private->handle) +                lvm_quit (_private->handle); +        mem_pool_destroy (this->local_pool); +        GF_FREE (_private); +        return -1; +} + +void +fini (xlator_t *this) +{ +        bd_priv_t *priv = this->private; +        mem_pool_destroy (this->local_pool); +        this->local_pool = NULL; +        if (!priv) +                return; +        lvm_quit (priv->handle); +        GF_FREE (priv->vg); +        this->private = NULL; +        GF_FREE (priv); +        return; +} + +struct xlator_dumpops dumpops = { +        .priv    = bd_priv, +        .inode   = bd_inode, +}; + +struct xlator_fops fops = { +        .readdirp    = 
bd_readdirp, +        .lookup      = bd_lookup, +        .stat        = bd_stat, +        .statfs      = bd_statfs, +        .open        = bd_open, +        .fstat       = bd_fstat, +        .rchecksum   = bd_rchecksum, +        .readv       = bd_readv, +        .fsync       = bd_fsync, +        .setxattr    = bd_setxattr, +        .fsetxattr   = bd_fsetxattr, +        .removexattr = bd_removexattr, +        .fremovexattr=bd_fremovexattr, +        .truncate    = bd_truncate, +        .ftruncate   = bd_ftruncate, +        .writev      = bd_writev, +        .getxattr    = bd_getxattr, +        .fgetxattr   = bd_fgetxattr, +        .unlink      = bd_unlink, +        .link        = bd_link, +        .flush       = bd_flush, +        .setattr     = bd_setattr, +        .discard     = bd_discard, +}; + +struct xlator_cbks cbks = { +        .release     = bd_release, +        .forget      = bd_forget, +}; + +struct volume_options options[] = { +        { .key = {"export"}, +          .type = GF_OPTION_TYPE_STR}, +        { .key = {"device"}, +          .type = GF_OPTION_TYPE_STR, +          .default_value = BACKEND_VG}, +        { .key = {NULL} } +}; diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h new file mode 100644 index 00000000000..4d8b8954524 --- /dev/null +++ b/xlators/storage/bd/src/bd.h @@ -0,0 +1,140 @@ +/* +  BD translator - Exports Block devices on server side as regular +  files to client + +  Copyright IBM, Corp. 2012 + +  This file is part of GlusterFS. + +  Author: +  M. Mohan Kumar <mohan@in.ibm.com> + +  This file is licensed to you under your choice of the GNU Lesser +  General Public License, version 3 or any later version (LGPLv3 or +  later), or the GNU General Public License, version 2 (GPLv2), in all +  cases as published by the Free Software Foundation. 
+*/ + +#ifndef _BD_H +#define _BD_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "mem-types.h" + +#define BD_XLATOR "block device mapper xlator" +#define BACKEND_VG "vg" +#define GF_XATTR "user.glusterfs" +#define BD_XATTR GF_XATTR ".bd" + +#define BD_LV "lv" +#define BD_THIN "thin" + +#define LVM_RESIZE "/sbin/lvresize" +#define LVM_CREATE "/sbin/lvcreate" + +#define VOL_TYPE "volume.type" +#define VOL_CAPS "volume.caps" + +#define ALIGN_SIZE 4096 + +#define BD_CAPS_BD               0x01 +#define BD_CAPS_THIN             0x02 + +#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label)                \ +        if (!buff) {                                                \ +                op_errno = ENOMEM;                                  \ +                gf_log (this->name, GF_LOG_ERROR, "out of memory"); \ +                goto label;                                         \ +        } + +#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label) \ +        if (!local) {                                     \ +                op_errno = EINVAL;                        \ +                goto label;                               \ +        } + +#define BD_STACK_UNWIND(typ, frame, args ...) 
do {      \ +        bd_local_t *__local = frame->local;             \ +        xlator_t   *__this = frame->this;               \ +                                                        \ +        frame->local = NULL;                            \ +        STACK_UNWIND_STRICT (typ, frame, args);         \ +        if (__local)                                    \ +                bd_local_free (__this, __local);        \ +        } while (0) + +typedef char bd_gfid_t[GF_UUID_BUF_SIZE]; + +enum gf_bd_mem_types_ { +        gf_bd_private  = gf_common_mt_end + 1, +        gf_bd_attr, +        gf_bd_fd, +        gf_bd_mt_end +}; + +/** + * bd_fd - internal structure + */ +typedef struct bd_fd { +        int             fd; +        int32_t         flag; +} bd_fd_t; + +typedef struct bd_priv { +        lvm_t             handle; +        char              *vg; +        char              *pool; +        int                caps; +} bd_priv_t; + + +typedef enum bd_type { +        BD_TYPE_NONE, +        BD_TYPE_LV, +} bd_type_t; + +typedef struct { +        struct iatt  iatt; +        char        *type; +} bd_attr_t; + +typedef struct { +        dict_t      *dict; +        bd_attr_t   *bdatt; +        inode_t     *inode; +        loc_t        loc; +        fd_t        *fd; +        data_t      *data; /* for setxattr */ +} bd_local_t; + +typedef struct { +        char            *lv; +        struct list_head list; +} bd_del_entry; + +/* Prototypes */ +int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx); +int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx); +int bd_scan_vg (xlator_t *this, bd_priv_t *priv); +bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this); +void bd_local_free (xlator_t *this, bd_local_t *local); +int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd); +char *page_aligned_alloc (size_t size, char **aligned_buf); +int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, +                          
uint64_t *lv_size, uuid_t uuid); +uint64_t bd_get_default_extent (bd_priv_t *priv); +uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size); +int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv); +int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size); +int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno); +int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); +int bd_clone (bd_local_t *local, bd_priv_t *priv); + +int bd_merge (bd_priv_t *priv, uuid_t gfid); +int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); +#endif  | 
