diff options
author | M. Mohan Kumar <mohan@in.ibm.com> | 2013-11-13 22:44:42 +0530 |
---|---|---|
committer | Anand Avati <avati@redhat.com> | 2013-11-13 11:38:42 -0800 |
commit | 48c40e1a42efe1b59126406084821947d139dd0e (patch) | |
tree | 74959ecda9b9bd56c85e0e32991c11c06b022296 | |
parent | 15a8ecd9b3eedf80881bd3dba81f16b7d2cb7c97 (diff) |
bd: posix/multi-brick support to BD xlator
Current BD xlator (block backend) has a few limitations such as
* Creation of directories not supported
* Supports only single brick
* Does not use extended attributes (and client gfid) like posix xlator
* Creation of special files (symbolic links, device nodes etc) not
supported
Basic limitation of not allowing directory creation is blocking
oVirt/VDSM to consume BD xlator as part of Gluster domain since VDSM
creates multi-level directories when GlusterFS is used as storage
backend for storing VM images.
To overcome these limitations a new BD xlator with following
improvements is suggested.
* New hybrid BD xlator that handles both regular files and block device
files
* The volume will have both POSIX and BD bricks. Regular files are
created on POSIX bricks, block devices are created on the BD brick (VG)
* BD xlator leverages exiting POSIX xlator for most POSIX calls and
hence sits above the POSIX xlator
* Block device file is differentiated from regular file by an extended
attribute
* The xattr 'user.glusterfs.bd' (BD_XATTR) plays a role in mapping a
posix file to Logical Volume (LV).
* When a client sends a request to set BD_XATTR on a posix file, a new
LV is created and mapped to posix file. So every block device will
have a representative file in POSIX brick with 'user.glusterfs.bd'
(BD_XATTR) set.
* Here after all operations on this file results in LV related
operations.
For example opening a file that has BD_XATTR set results in opening
the LV block device, reading results in reading the corresponding LV
block device.
When BD xlator gets request to set BD_XATTR via setxattr call, it
creates a LV and information about this LV is placed in the xattr of the
posix file. xattr "user.glusterfs.bd" used to identify that posix file
is mapped to BD.
Usage:
Server side:
[root@host1 ~]# gluster volume create bdvol host1:/storage/vg1_info?vg1 host2:/storage/vg2_info?vg2
It creates a distributed gluster volume 'bdvol' with Volume Group vg1
using posix brick /storage/vg1_info in host1 and Volume Group vg2 using
/storage/vg2_info in host2.
[root@host1 ~]# gluster volume start bdvol
Client side:
[root@node ~]# mount -t glusterfs host1:/bdvol /media
[root@node ~]# touch /media/posix
It creates regular posix file 'posix' in either host1:/vg1 or host2:/vg2 brick
[root@node ~]# mkdir /media/image
[root@node ~]# touch /media/image/lv1
It also creates regular posix file 'lv1' in either host1:/vg1 or
host2:/vg2 brick
[root@node ~]# setfattr -n "user.glusterfs.bd" -v "lv" /media/image/lv1
[root@node ~]#
Above setxattr results in creating a new LV in corresponding brick's VG
and it sets 'user.glusterfs.bd' with value 'lv:<default-extent-size'
[root@node ~]# truncate -s5G /media/image/lv1
It results in resizig LV 'lv1'to 5G
New BD xlator code is placed in xlators/storage/bd directory.
Also add volume-uuid to the VG so that same VG can't be used for other
bricks/volumes. After deleting a gluster volume, one has to manually
remove the associated tag using vgchange <vg-name> --deltag
<trusted.glusterfs.volume-id:<volume-id>>
Changes from previous version V5:
* Removed support for delayed deleting of LVs
Changes from previous version V4:
* Consolidated the patches
* Removed usage of BD_XATTR_SIZE and consolidated it in BD_XATTR.
Changes from previous version V3:
* Added support in FUSE to support full/linked clone
* Added support to merge snapshots and provide information about origin
* bd_map xlator removed
* iatt structure used in inode_ctx. iatt is cached and updated during
fsync/flush
* aio support
* Type and capabilities of volume are exported through getxattr
Changes from version 2:
* Used inode_context for caching BD size and to check if loc/fd is BD or
not.
* Added GlusterFS server offloaded copy and snapshot through setfattr
FOP. As part of this libgfapi is modified.
* BD xlator supports stripe
* During unlinking if a LV file is already opened, its added to delete
list and bd_del_thread tries to delete from this list when a last
reference to that file is closed.
Changes from previous version:
* gfid is used as name of LV
* ? is used to specify VG name for creating BD volume in volume
create, add-brick. gluster volume create volname host:/path?vg
* open-behind issue is fixed
* A replicate brick can be added dynamically and LVs from source brick
are replicated to destination brick
* A distribute brick can be added dynamically and rebalance operation
distributes existing LVs/files to the new brick
* Thin provisioning support added.
* bd_map xlator support retained
* setfattr -n user.glusterfs.bd -v "lv" creates a regular LV and
setfattr -n user.glusterfs.bd -v "thin" creates thin LV
* Capability and backend information added to gluster volume info (and
--xml) so
that management tools can exploit BD xlator.
* tracing support for bd xlator added
TODO:
* Add support to display snapshots for a given LV
* Display posix filename for list-origin instead of gfid
Change-Id: I00d32dfbab3b7c806e0841515c86c3aa519332f2
BUG: 1028672
Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com>
Reviewed-on: http://review.gluster.org/4809
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
-rw-r--r-- | cli/src/cli-cmd-volume.c | 7 | ||||
-rw-r--r-- | cli/src/cli-rpc-ops.c | 42 | ||||
-rw-r--r-- | cli/src/cli-xml-output.c | 59 | ||||
-rw-r--r-- | configure.ac | 40 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/Makefile.am | 3 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-brick-ops.c | 34 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-handler.c | 47 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-op-sm.h | 3 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-store.c | 20 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-store.h | 2 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-utils.c | 81 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 21 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.h | 1 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 149 | ||||
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd.h | 7 | ||||
-rw-r--r-- | xlators/storage/Makefile.am | 4 | ||||
-rw-r--r-- | xlators/storage/bd/Makefile.am | 3 | ||||
-rw-r--r-- | xlators/storage/bd/src/Makefile.am | 20 | ||||
-rw-r--r-- | xlators/storage/bd/src/bd-helper.c | 562 | ||||
-rw-r--r-- | xlators/storage/bd/src/bd.c | 2047 | ||||
-rw-r--r-- | xlators/storage/bd/src/bd.h | 140 |
21 files changed, 3289 insertions, 3 deletions
diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index dcb317f54fc..100be0b7337 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -1829,7 +1829,12 @@ struct cli_cmd volume_cmds[] = { "list information of all volumes"}, { "volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] " - "[transport <tcp|rdma|tcp,rdma>] <NEW-BRICK> ... [force]", + "[transport <tcp|rdma|tcp,rdma>] <NEW-BRICK>" +#ifdef HAVE_BD_XLATOR + "?<vg_name>" +#endif + "... [force]", + cli_cmd_volume_create_cbk, "create a new volume of specified type with mentioned bricks"}, diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 0ff997dc55e..80814501541 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -496,6 +496,8 @@ gf_cli_get_volume_cbk (struct rpc_req *req, struct iovec *iov, char key[1024] = {0}; char err_str[2048] = {0}; gf_cli_rsp rsp = {0}; + char *caps = NULL; + int k __attribute__((unused)) = 0; if (-1 == req->rpc_status) goto out; @@ -658,6 +660,40 @@ xml_output: cli_out ("Volume ID: %s", volume_id_str); cli_out ("Status: %s", cli_vol_status_str[status]); +#ifdef HAVE_BD_XLATOR + k = 0; + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.xlator%d", i, k); + ret = dict_get_str (dict, key, &caps); + if (ret) + goto next; + do { + j = 0; + cli_out ("Xlator %d: %s", k + 1, caps); + do { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), + "volume%d.xlator%d.caps%d", + i, k, j++); + ret = dict_get_str (dict, key, &caps); + if (ret) + break; + cli_out ("Capability %d: %s", j, caps); + } while (1); + + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), + "volume%d.xlator%d", i, ++k); + ret = dict_get_str (dict, key, &caps); + if (ret) + break; + } while (1); + +next: +#else + caps = 0; /* Avoid compiler warnings when BD not enabled */ +#endif + if (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) { cli_out ("Number of Bricks: %d x %d x %d = %d", (brick_count / dist_count), @@ -693,6 +729,12 @@ xml_output: goto out; cli_out ("Brick%d: %s", j, brick); +#ifdef HAVE_BD_XLATOR + snprintf (key, 256, "volume%d.vg%d", i, j); + ret = dict_get_str (dict, key, &caps); + if (!ret) + cli_out ("Brick%d VG: %s", j, caps); +#endif j++; } diff --git a/cli/src/cli-xml-output.c b/cli/src/cli-xml-output.c index cc021a34d64..0f837fc7428 100644 --- a/cli/src/cli-xml-output.c +++ b/cli/src/cli-xml-output.c @@ -2497,7 +2497,8 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict) char key[1024] = {0,}; int i = 0; int j = 1; - + char *caps = NULL; + int k __attribute__((unused)) = 0; ret = dict_get_int32 (dict, "count", &count); if (ret) @@ -2613,6 +2614,62 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict) "%d", transport); XML_RET_CHECK_AND_GOTO (ret, out); +#ifdef HAVE_BD_XLATOR + /* <xlators> */ + ret = xmlTextWriterStartElement (local->writer, + (xmlChar *)"xlators"); + XML_RET_CHECK_AND_GOTO (ret, out); + + for (k = 0; ; k++) { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key),"volume%d.xlator%d", i, k); + ret = dict_get_str (dict, key, &caps); + if (ret) + break; + + /* <xlator> */ + ret = xmlTextWriterStartElement (local->writer, + (xmlChar *)"xlator"); + XML_RET_CHECK_AND_GOTO (ret, out); + + ret = xmlTextWriterWriteFormatElement + (local->writer, (xmlChar *)"name", "%s", caps); + XML_RET_CHECK_AND_GOTO (ret, out); + + /* <capabilities> */ + ret = xmlTextWriterStartElement (local->writer, + (xmlChar *) + "capabilities"); + XML_RET_CHECK_AND_GOTO (ret, out); + + j = 0; + for (j = 0; ;j++) { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), + "volume%d.xlator%d.caps%d", i, k, j); + ret = dict_get_str (dict, key, &caps); + if (ret) + break; + ret = xmlTextWriterWriteFormatElement + (local->writer, (xmlChar *)"capability", + "%s", caps); + XML_RET_CHECK_AND_GOTO (ret, out); + } + /* </capabilities> */ + ret = xmlTextWriterEndElement (local->writer); + XML_RET_CHECK_AND_GOTO (ret, out); + /* </xlator> */ + ret = xmlTextWriterEndElement (local->writer); + XML_RET_CHECK_AND_GOTO (ret, out); + } + ret = xmlTextWriterFullEndElement (local->writer); + XML_RET_CHECK_AND_GOTO (ret, out); + /* </xlators> */ +#else + caps = 0; /* Avoid compiler warnings when BD not enabled */ +#endif + j = 1; + /* <bricks> */ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"bricks"); diff --git a/configure.ac b/configure.ac index 9d676cd15a9..0cecafb46dc 100644 --- a/configure.ac +++ b/configure.ac @@ -53,6 +53,8 @@ AC_CONFIG_FILES([Makefile xlators/storage/Makefile xlators/storage/posix/Makefile xlators/storage/posix/src/Makefile + xlators/storage/bd/Makefile + xlators/storage/bd/src/Makefile xlators/cluster/Makefile xlators/cluster/afr/Makefile xlators/cluster/afr/src/Makefile @@ -301,6 +303,43 @@ if test "x$enable_fuse_client" != "xno"; then BUILD_FUSE_CLIENT="yes" fi +AC_ARG_ENABLE([bd-xlator], + AC_HELP_STRING([--enable-bd-xlator], [Build BD xlator])) + +if test "x$enable_bd_xlator" != "xno"; then + AC_CHECK_LIB([lvm2app], + [lvm_init,lvm_lv_from_name], + [HAVE_BD_LIB="yes"], + [HAVE_BD_LIB="no"]) + +if test "x$HAVE_BD_LIB" = "xyes"; then + # lvm_lv_from_name() has been made public with lvm2-2.02.79 + AC_CHECK_DECLS( + [lvm_lv_from_name], + [NEED_LVM_LV_FROM_NAME_DECL="no"], + [NEED_LVM_LV_FROM_NAME_DECL="yes"], + [[#include <lvm2app.h>]]) + fi +fi + +if test "x$enable_bd_xlator" = "xyes" -a "x$HAVE_BD_LIB" = "xno"; then + echo "BD xlator requested but required lvm2 development library not found." + exit 1 +fi + +BUILD_BD_XLATOR=no +if test "x${enable-bd-xlator}" != "xno" -a "x${HAVE_BD_LIB}" = "xyes"; then + BUILD_BD_XLATOR=yes + AC_DEFINE(HAVE_BD_XLATOR, 1, [define if lvm2app library found and bd xlator + enabled]) + if test "x$NEED_LVM_LV_FROM_NAME_DECL" = "xyes"; then + AC_DEFINE(NEED_LVM_LV_FROM_NAME_DECL, 1, [defined if lvm_lv_from_name() + was not found in the lvm2app.h header, but can be linked]) + fi +fi + +AM_CONDITIONAL([ENABLE_BD_XLATOR], [test x$BUILD_BD_XLATOR = xyes]) + AC_SUBST(FUSE_CLIENT_SUBDIR) # end FUSE section @@ -821,6 +860,7 @@ echo "georeplication : $BUILD_SYNCDAEMON" echo "Linux-AIO : $BUILD_LIBAIO" echo "Enable Debug : $BUILD_DEBUG" echo "systemtap : $BUILD_SYSTEMTAP" +echo "Block Device xlator : $BUILD_BD_XLATOR" echo "glupy : $BUILD_GLUPY" echo "Use syslog : $USE_SYSLOG" echo "XML output : $BUILD_XML_OUTPUT" diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am index 17767d7ca92..a6f49ae01b1 100644 --- a/xlators/mgmt/glusterd/src/Makefile.am +++ b/xlators/mgmt/glusterd/src/Makefile.am @@ -2,6 +2,9 @@ xlator_LTLIBRARIES = glusterd.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/mgmt glusterd_la_CPPFLAGS = $(AM_CPPFLAGS) "-DFILTERDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/filter\"" glusterd_la_LDFLAGS = -module -avoid-version +if ENABLE_BD_XLATOR +glusterd_la_LDFLAGS += -llvm2app +endif glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \ glusterd-op-sm.c glusterd-utils.c glusterd-rpc-ops.c \ glusterd-store.c glusterd-handshake.c glusterd-pmap.c \ diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index e24edb2d594..cc425353516 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -1025,6 +1025,8 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, glusterd_brickinfo_t *brickinfo = NULL; glusterd_gsync_status_temp_t param = {0, }; gf_boolean_t restart_needed = 0; + char msg[1024] __attribute__((unused)) = {0, }; + int caps = 0; GF_ASSERT (volinfo); @@ -1105,12 +1107,30 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, if (count) brick = strtok_r (brick_list+1, " \n", &saveptr); +#ifdef HAVE_BD_XLATOR + if (brickinfo->vg[0]) + caps = CAPS_BD | CAPS_THIN; +#endif while (i <= count) { ret = glusterd_volume_brickinfo_get_by_brick (brick, volinfo, &brickinfo); if (ret) goto out; +#ifdef HAVE_BD_XLATOR + /* Check for VG/thin pool if its BD volume */ + if (brickinfo->vg[0]) { + ret = glusterd_is_valid_vg (brickinfo, 0, msg); + if (ret) { + gf_log (THIS->name, GF_LOG_CRITICAL, "%s", msg); + goto out; + } + /* if anyone of the brick does not have thin support, + disable it for entire volume */ + caps &= brickinfo->caps; + } else + caps = 0; +#endif if (uuid_is_null (brickinfo->uuid)) { ret = glusterd_resolve_brick (brickinfo); @@ -1147,7 +1167,7 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count, dict_foreach (volinfo->gsync_slaves, _glusterd_restart_gsync_session, ¶m); } - + volinfo->caps = caps; out: GF_FREE (free_ptr1); GF_FREE (free_ptr2); @@ -1321,6 +1341,18 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr) } if (!uuid_compare (brickinfo->uuid, MY_UUID)) { +#ifdef HAVE_BD_XLATOR + if (brickinfo->vg[0]) { + ret = glusterd_is_valid_vg (brickinfo, 1, msg); + if (ret) { + gf_log (THIS->name, GF_LOG_ERROR, "%s", + msg); + *op_errstr = gf_strdup (msg); + goto out; + } + } +#endif + ret = glusterd_validate_and_create_brickpath (brickinfo, volinfo->volume_id, op_errstr, is_force); diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index e545fc2120d..181b8fcf1a9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -50,6 +50,10 @@ #include "globals.h" #include "glusterd-syncop.h" +#ifdef HAVE_BD_XLATOR +#include <lvm2app.h> +#endif + int glusterd_big_locked_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event, void *data, rpc_clnt_notify_t notify_fn) @@ -395,6 +399,39 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; +#ifdef HAVE_BD_XLATOR + if (volinfo->caps) { + snprintf (key, 256, "volume%d.xlator0", count); + buf = GF_MALLOC (256, gf_common_mt_char); + if (!buf) { + ret = ENOMEM; + goto out; + } + if (volinfo->caps & CAPS_BD) + snprintf (buf, 256, "BD"); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) { + GF_FREE (buf); + goto out; + } + + if (volinfo->caps & CAPS_THIN) { + snprintf (key, 256, "volume%d.xlator0.caps0", count); + buf = GF_MALLOC (256, gf_common_mt_char); + if (!buf) { + ret = ENOMEM; + goto out; + } + snprintf (buf, 256, "thin"); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) { + GF_FREE (buf); + goto out; + } + } + } +#endif + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { char brick[1024] = {0,}; char brick_uuid[64] = {0,}; @@ -414,6 +451,16 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; +#ifdef HAVE_BD_XLATOR + if (volinfo->caps & CAPS_BD) { + snprintf (key, 256, "volume%d.vg%d", count, i); + snprintf (brick, 1024, "%s", brickinfo->vg); + buf = gf_strdup (brick); + ret = dict_set_dynstr (volumes, key, buf); + if (ret) + goto out; + } +#endif i++; } diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.h b/xlators/mgmt/glusterd/src/glusterd-op-sm.h index 5fe05069de8..498c869a025 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.h +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.h @@ -286,4 +286,7 @@ glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag); int glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict, dict_t *op_ctx); +int +glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg); + #endif diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index 8b658ae3037..5902589f4b6 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -241,6 +241,11 @@ glusterd_store_brickinfo_write (int fd, glusterd_brickinfo_t *brickinfo) if (ret) goto out; + if (!brickinfo->vg[0]) + goto out; + + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_VGNAME, + brickinfo->vg); out: gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret); return ret; @@ -581,6 +586,13 @@ glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo) buf); if (ret) goto out; + if (volinfo->caps) { + snprintf (buf, sizeof (buf), "%d", volinfo->caps); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_CAPS, + buf); + if (ret) + goto out; + } out: if (ret) @@ -1538,6 +1550,11 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo) } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED, strlen (GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) { gf_string2int (value, &brickinfo->decommissioned); + } else if (!strncmp (key, + GLUSTERD_STORE_KEY_BRICK_VGNAME, + strlen (GLUSTERD_STORE_KEY_BRICK_VGNAME))) { + strncpy (brickinfo->vg, value, + sizeof (brickinfo->vg)); } else { gf_log ("", GF_LOG_ERROR, "Unknown key: %s", key); @@ -1856,6 +1873,9 @@ glusterd_store_retrieve_volume (char *volname) } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION, strlen (GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION))) { volinfo->client_op_version = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CAPS, + strlen (GLUSTERD_STORE_KEY_VOL_CAPS))) { + volinfo->caps = atoi (value); } else { if (is_key_glusterd_hooks_friendly (key)) { diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index facb964fa72..ce1f766b1de 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -64,11 +64,13 @@ typedef enum glusterd_store_ver_ac_{ #define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port" #define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port" #define GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED "decommissioned" +#define GLUSTERD_STORE_KEY_BRICK_VGNAME "vg" #define GLUSTERD_STORE_KEY_PEER_UUID "uuid" #define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname" #define GLUSTERD_STORE_KEY_PEER_STATE "state" +#define GLUSTERD_STORE_KEY_VOL_CAPS "caps" #define glusterd_for_each_entry(entry, dir) \ do {\ diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index d4f33f2cefc..f0445cf0b72 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -49,6 +49,11 @@ #include <unistd.h> #include <fnmatch.h> #include <sys/statvfs.h> +#include <ifaddrs.h> +#ifdef HAVE_BD_XLATOR +#include <lvm2app.h> +#endif + #ifdef GF_LINUX_HOST_OS #include <mntent.h> @@ -622,6 +627,7 @@ glusterd_brickinfo_new_from_brick (char *brick, char *path = NULL; char *tmp_host = NULL; char *tmp_path = NULL; + char *vg = NULL; GF_ASSERT (brick); GF_ASSERT (brickinfo); @@ -640,6 +646,17 @@ glusterd_brickinfo_new_from_brick (char *brick, if (ret) goto out; +#ifdef HAVE_BD_XLATOR + vg = strchr (path, '?'); + /* ? is used as a delimiter for vg */ + if (vg) { + strncpy (new_brickinfo->vg, vg + 1, PATH_MAX - 1); + *vg = '\0'; + } + new_brickinfo->caps = CAPS_BD; +#else + vg = NULL; /* Avoid compiler warnings when BD not enabled */ +#endif ret = gf_canonicalize_path (path); if (ret) goto out; @@ -743,6 +760,62 @@ out: return available; } +#ifdef HAVE_BD_XLATOR +/* + * Sets the tag of the format "trusted.glusterfs.volume-id:<uuid>" in + * the brick VG. It is used to avoid using same VG for another brick. + * @volume-id - gfid, @brick - brick info, @msg - Error message returned + * to the caller + */ +int +glusterd_bd_set_vg_tag (unsigned char *volume_id, glusterd_brickinfo_t *brick, + char *msg, int msg_size) +{ + lvm_t handle = NULL; + vg_t vg = NULL; + char *uuid = NULL; + int ret = -1; + + gf_asprintf (&uuid, "%s:%s", GF_XATTR_VOL_ID_KEY, + uuid_utoa (volume_id)); + if (!uuid) { + snprintf (msg, sizeof(*msg), "Could not allocate memory " + "for tag"); + return -1; + } + + handle = lvm_init (NULL); + if (!handle) { + snprintf (msg, sizeof(*msg), "lvm_init failed"); + goto out; + } + + vg = lvm_vg_open (handle, brick->vg, "w", 0); + if (!vg) { + snprintf (msg, sizeof(*msg), "Could not open VG %s", + brick->vg); + goto out; + } + + if (lvm_vg_add_tag (vg, uuid) < 0) { + snprintf (msg, sizeof(*msg), "Could not set tag %s for " + "VG %s", uuid, brick->vg); + goto out; + } + lvm_vg_write (vg); + ret = 0; +out: + GF_FREE (uuid); + + if (vg) + lvm_vg_close (vg); + if (handle) + lvm_quit (handle); + + return ret; +} +#endif + int glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo, uuid_t volume_id, char **op_errstr, @@ -825,6 +898,14 @@ glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo, } } +#ifdef HAVE_BD_XLATOR + if (brickinfo->vg[0]) { + ret = glusterd_bd_set_vg_tag (volume_id, brickinfo, msg, + sizeof(msg)); + if (ret) + goto out; + } +#endif ret = glusterd_check_and_set_brick_xattr (brickinfo->hostname, brickinfo->path, volume_id, op_errstr, is_force); diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 6bf14bc3daa..51fba4da343 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -594,6 +594,8 @@ get_server_xlator (char *xlator) subvol = GF_XLATOR_MARKER; if (strcmp (xlator, "io-stats") == 0) subvol = GF_XLATOR_IO_STATS; + if (strcmp (xlator, "bd") == 0) + subvol = GF_XLATOR_BD; return subvol; } @@ -1456,7 +1458,26 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo, "posix"); if (ret) return -1; +#ifdef HAVE_BD_XLATOR + if (*brickinfo->vg != '\0') { + /* Now add BD v2 xlator if volume is BD type */ + xl = volgen_graph_add (graph, "storage/bd", volname); + if (!xl) + return -1; + + ret = xlator_set_option (xl, "device", "vg"); + if (ret) + return -1; + ret = xlator_set_option (xl, "export", brickinfo->vg); + if (ret) + return -1; + + ret = check_and_add_debug_xl (graph, set_dict, volname, "bd"); + if (ret) + return -1; + } +#endif xl = volgen_graph_add (graph, "features/changelog", volname); if (!xl) diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h index 0c1a76c123d..31bfe980dee 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.h +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h @@ -75,6 +75,7 @@ typedef enum { GF_XLATOR_INDEX, GF_XLATOR_MARKER, GF_XLATOR_IO_STATS, + GF_XLATOR_BD, GF_XLATOR_NONE, } glusterd_server_xlator_t; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 034004dbd9e..a2bd7334c93 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -12,6 +12,10 @@ #include "config.h" #endif +#ifdef HAVE_BD_XLATOR +#include <lvm2app.h> +#endif + #include "common-utils.h" #include "syscall.h" #include "cli1-xdr.h" @@ -26,6 +30,7 @@ #define glusterd_op_start_volume_args_get(dict, volname, flags) \ glusterd_op_stop_volume_args_get (dict, volname, flags) + int __glusterd_handle_create_volume (rpcsvc_request_t *req) { @@ -599,6 +604,101 @@ glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req) __glusterd_handle_cli_statedump_volume); } +#ifdef HAVE_BD_XLATOR +/* + * Validates if given VG in the brick exists or not. Also checks if VG has + * GF_XATTR_VOL_ID_KEY tag set to avoid using same VG for multiple bricks. + * Tag is checked only during glusterd_op_stage_create_volume. Tag is set during + * glusterd_validate_and_create_brickpath(). + * @brick - brick info, @check_tag - check for VG tag or not + * @msg - Error message to return to caller + */ +int +glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg) +{ + lvm_t handle = NULL; + vg_t vg = NULL; + char *vg_name = NULL; + int retval = 0; + char *p = NULL; + char *ptr = NULL; + struct dm_list *dm_lvlist = NULL; + struct dm_list *dm_seglist = NULL; + struct lvm_lv_list *lv_list = NULL; + struct lvm_property_value prop = {0, }; + struct lvm_lvseg_list *seglist = NULL; + struct dm_list *taglist = NULL; + struct lvm_str_list *strl = NULL; + + handle = lvm_init (NULL); + if (!handle) { + sprintf (msg, "lvm_init failed, could not validate vg"); + return -1; + } + if (*brick->vg == '\0') { /* BD xlator has vg in brick->path */ + p = gf_strdup (brick->path); + vg_name = strtok_r (p, "/", &ptr); + } else + vg_name = brick->vg; + + vg = lvm_vg_open (handle, vg_name, "r", 0); + if (!vg) { + sprintf (msg, "no such vg: %s", vg_name); + retval = -1; + goto out; + } + if (!check_tag) + goto next; + + taglist = lvm_vg_get_tags (vg); + if (!taglist) + goto next; + + dm_list_iterate_items (strl, taglist) { + if (!strncmp(strl->str, GF_XATTR_VOL_ID_KEY, + strlen (GF_XATTR_VOL_ID_KEY))) { + sprintf (msg, "VG %s is already part of" + " a brick", vg_name); + retval = -1; + goto out; + } + } +next: + + brick->caps = CAPS_BD; + + dm_lvlist = lvm_vg_list_lvs (vg); + if (!dm_lvlist) + goto out; + + dm_list_iterate_items (lv_list, dm_lvlist) { + dm_seglist = lvm_lv_list_lvsegs (lv_list->lv); + dm_list_iterate_items (seglist, dm_seglist) { + prop = lvm_lvseg_get_property (seglist->lvseg, + "segtype"); + if (!prop.is_valid || !prop.value.string) + continue; + if (!strcmp (prop.value.string, "thin-pool")) { + brick->caps |= CAPS_THIN; + gf_log (THIS->name, GF_LOG_INFO, "Thin Pool " + "\"%s\" will be used for thin LVs", + lvm_lv_get_name (lv_list->lv)); + break; + } + } + } + + retval = 0; +out: + if (vg) + lvm_vg_close (vg); + lvm_quit (handle); + if (p) + GF_FREE (p); + return retval; +} +#endif + /* op-sm */ int glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr) @@ -712,6 +812,11 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr) } if (!uuid_compare (brick_info->uuid, MY_UUID)) { + if (brick_info->vg[0]) { + ret = glusterd_is_valid_vg (brick_info, 1, msg); + if (ret) + goto out; + } ret = glusterd_validate_and_create_brickpath (brick_info, volume_uuid, op_errstr, is_force); @@ -809,6 +914,7 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr) uuid_t volume_id = {0,}; char volid[50] = {0,}; char xattr_volid[50] = {0,}; + int caps = 0; this = THIS; GF_ASSERT (this); @@ -847,6 +953,7 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr) } } + list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { ret = glusterd_resolve_brick (brickinfo); if (ret) { @@ -902,8 +1009,24 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr) ret = -1; goto out; } +#ifdef HAVE_BD_XLATOR + if (brickinfo->vg[0]) + caps = CAPS_BD | CAPS_THIN; + + /* Check for VG/thin pool if its BD volume */ + if (brickinfo->vg[0]) { + ret = glusterd_is_valid_vg (brickinfo, 0, msg); + if (ret) + goto out; + /* if anyone of the brick does not have thin support, + disable it for entire volume */ + caps &= brickinfo->caps; + } else + caps = 0; +#endif } + volinfo->caps = caps; ret = 0; out: if (ret && (msg[0] != '\0')) { @@ -1315,6 +1438,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) char *str = NULL; char *username = NULL; char *password = NULL; + int caps = 0; + char msg[1024] __attribute__((unused)) = {0, }; this = THIS; GF_ASSERT (this); @@ -1477,6 +1602,7 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) if (count) brick = strtok_r (brick_list+1, " \n", &saveptr); + caps = CAPS_BD | CAPS_THIN; while ( i <= count) { ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo); @@ -1489,6 +1615,27 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) brickinfo->hostname, brickinfo->path); goto out; } + +#ifdef HAVE_BD_XLATOR + if (!uuid_compare (brickinfo->uuid, MY_UUID)) { + if (brickinfo->vg[0]) { + ret = glusterd_is_valid_vg (brickinfo, 0, msg); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s", + msg); + goto out; + } + + /* if anyone of the brick does not have thin + support, disable it for entire volume */ + caps &= brickinfo->caps; + + + } else + caps = 0; + } +#endif + list_add_tail (&brickinfo->brick_list, &volinfo->bricks); brick = strtok_r (NULL, " \n", &saveptr); i++; @@ -1496,6 +1643,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) gd_update_volume_op_versions (volinfo); + volinfo->caps = caps; + ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT); if (ret) { glusterd_store_delete_volume (volinfo); diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index 8d8f8d4e07e..6423d5a8154 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -176,6 +176,8 @@ struct glusterd_brickinfo { gf_brick_status_t status; struct rpc_clnt *rpc; int decommissioned; + char vg[PATH_MAX]; /* FIXME: Use max size for length of vg */ + int caps; /* Capability */ }; typedef struct glusterd_brickinfo glusterd_brickinfo_t; @@ -231,6 +233,10 @@ struct _auth { typedef struct _auth auth_t; +/* Capabilities of xlator */ +#define CAPS_BD 0x00000001 +#define CAPS_THIN 0x00000010 + struct glusterd_rebalance_ { gf_defrag_status_t defrag_status; uint64_t rebalance_files; @@ -300,6 +306,7 @@ struct glusterd_volinfo_ { xlator_t *xl; gf_boolean_t memory_accounting; + int caps; /* Capability */ int op_version; int client_op_version; diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am index 5e3ed0eb93b..c08e8e41bca 100644 --- a/xlators/storage/Makefile.am +++ b/xlators/storage/Makefile.am @@ -1,3 +1,7 @@ SUBDIRS = posix +if ENABLE_BD_XLATOR +SUBDIRS += bd +endif + CLEANFILES = diff --git a/xlators/storage/bd/Makefile.am b/xlators/storage/bd/Makefile.am new file mode 100644 index 00000000000..a985f42a877 --- /dev/null +++ b/xlators/storage/bd/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/storage/bd/src/Makefile.am b/xlators/storage/bd/src/Makefile.am new file mode 100644 index 00000000000..210b7453af8 --- /dev/null +++ b/xlators/storage/bd/src/Makefile.am @@ -0,0 +1,20 @@ +if ENABLE_BD_XLATOR +xlator_LTLIBRARIES = bd.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage + +bd_la_LDFLAGS = -module -avoid-version +LIBBD = -llvm2app -lrt +bd_la_SOURCES = bd.c bd-helper.c +bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) + +noinst_HEADERS = bd.h + +AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src + +AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) + +CLEANFILES = + +endif diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c new file mode 100644 index 00000000000..2c1b77a9b3e --- /dev/null +++ b/xlators/storage/bd/src/bd-helper.c @@ -0,0 +1,562 @@ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> + +#include "bd.h" +#include "run.h" + +int +bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; +} + +int +bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + ret = inode_ctx_get (inode, this, &ctx_int); + if (ret) + return ret; + if (ctx) + *ctx = (bd_attr_t *) ctx_int; +out: + return ret; +} + +void +bd_local_free (xlator_t *this, bd_local_t *local) +{ + if (!local) + return; + if (local->fd) + fd_unref (local->fd); + else if (local->loc.path) + loc_wipe (&local->loc); + if (local->dict) + dict_unref (local->dict); + if (local->inode) + inode_unref (local->inode); + if (local->bdatt) { + GF_FREE (local->bdatt->type); + GF_FREE (local->bdatt); + } + mem_put (local); + local = NULL; +} + +bd_local_t * +bd_local_init (call_frame_t *frame, xlator_t *this) +{ + frame->local = mem_get0 (this->local_pool); + if (!frame->local) + return NULL; + + return frame->local; +} + +/* + * VG are set with the tag in GF_XATTR_VOL_ID_KEY:<uuid> format. + * This function validates this tag agains volume-uuid. Also goes + * through LV list to find out if a thin-pool is configured or not. + */ +int bd_scan_vg (xlator_t *this, bd_priv_t *priv) +{ + vg_t brick = NULL; + data_t *tmp_data = NULL; + struct dm_list *tags = NULL; + int op_ret = -1; + uuid_t dict_uuid = {0, }; + uuid_t vg_uuid = {0, }; + gf_boolean_t uuid = _gf_false; + lvm_str_list_t *strl = NULL; + struct dm_list *lv_dm_list = NULL; + lv_list_t *lv_list = NULL; + struct dm_list *dm_seglist = NULL; + lvseg_list_t *seglist = NULL; + lvm_property_value_t prop = {0, }; + gf_boolean_t thin = _gf_false; + const char *lv_name = NULL; + + brick = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!brick) { + gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found", + priv->vg); + return ENOENT; + } + + lv_dm_list = lvm_vg_list_lvs (brick); + if (!lv_dm_list) + goto check; + + dm_list_iterate_items (lv_list, lv_dm_list) { + dm_seglist = lvm_lv_list_lvsegs (lv_list->lv); + if (!dm_seglist) + continue; + dm_list_iterate_items (seglist, dm_seglist) { + prop = lvm_lvseg_get_property (seglist->lvseg, + "segtype"); + if (!prop.is_valid || !prop.value.string) + continue; + if (!strcmp (prop.value.string, "thin-pool")) { + thin = _gf_true; + lv_name = lvm_lv_get_name (lv_list->lv); + priv->pool = gf_strdup (lv_name); + gf_log (THIS->name, GF_LOG_INFO, "Thin Pool " + "\"%s\" will be used for thin LVs", + lv_name); + break; + } + } + } + +check: + /* If there is no volume-id set in dict, we cant validate */ + tmp_data = dict_get (this->options, "volume-id"); + if (!tmp_data) { + op_ret = 0; + goto out; + } + + op_ret = uuid_parse (tmp_data->data, dict_uuid); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in volume file", + tmp_data->data); + op_ret = -1; + goto out; + } + + tags = lvm_vg_get_tags (brick); + if (!tags) { /* no tags in the VG */ + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + op_ret = -1; + goto out; + } + dm_list_iterate_items (strl, tags) { + if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY, + strlen (GF_XATTR_VOL_ID_KEY))) { + uuid = _gf_true; + break; + } + } + /* UUID tag is not set in VG */ + if (!uuid) { + gf_log (this->name, GF_LOG_ERROR, + "Extended attribute trusted.glusterfs." + "volume-id is absent"); + op_ret = -1; + goto out; + } + + op_ret = uuid_parse (strl->str + strlen (GF_XATTR_VOL_ID_KEY) + 1, + vg_uuid); + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "wrong volume-id (%s) set in VG", strl->str); + op_ret = -1; + goto out; + } + if (uuid_compare (dict_uuid, vg_uuid)) { + gf_log (this->name, GF_LOG_ERROR, + "mismatching volume-id (%s) received. " + "already is a part of volume %s ", + tmp_data->data, vg_uuid); + op_ret = -1; + goto out; + } + + op_ret = 0; + +out: + lvm_vg_close (brick); + + if (!thin) + gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in " + "VG %s\n", priv->vg); + else + priv->caps |= BD_CAPS_THIN; + + return op_ret; +} + +/* FIXME: Move this code to common place, so posix and bd xlator can use */ +char * +page_aligned_alloc (size_t size, char **aligned_buf) +{ + char *alloc_buf = NULL; + char *buf = NULL; + + alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char); + if (!alloc_buf) + return NULL; + /* page aligned buffer */ + buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE); + *aligned_buf = buf; + + return alloc_buf; +} + +static int +__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p) +{ + int ret = -1; + int _fd = -1; + char *devpath = NULL; + bd_fd_t *bdfd = NULL; + uint64_t tmp_bdfd = 0; + bd_priv_t *priv = this->private; + bd_gfid_t gfid = {0, }; + bd_attr_t *bdatt = NULL; + + /* not bd file */ + if (fd->inode->ia_type != IA_IFREG || + bd_inode_ctx_get (fd->inode, this, &bdatt)) + return 0; + + ret = __fd_ctx_get (fd, this, &tmp_bdfd); + if (ret == 0) { + bdfd = (void *)(long) tmp_bdfd; + *bdfd_p = bdfd; + return 0; + } + + uuid_utoa_r (fd->inode->gfid, gfid); + asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + if (!devpath) + goto out; + + _fd = open (devpath, O_RDWR | O_LARGEFILE, 0); + if (_fd < 0) { + ret = errno; + gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, + strerror (ret)); + goto out; + } + bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); + BD_VALIDATE_MEM_ALLOC (bdfd, ret, out); + + bdfd->fd = _fd; + bdfd->flag = O_RDWR | O_LARGEFILE; + if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context fd=%p", fd); + goto out; + } + + *bdfd_p = bdfd; + + ret = 0; +out: + FREE (devpath); + if (ret) { + close (_fd); + GF_FREE (bdfd); + } + return ret; +} + +int +bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd) +{ + int ret; + + /* FIXME: Is it ok to fd->lock here ? */ + LOCK (&fd->lock); + { + ret = __bd_fd_ctx_get (this, fd, bdfd); + } + UNLOCK (&fd->lock); + + return ret; +} + +/* + * Validates if LV exists for given inode or not. + * Returns 0 if LV exists and size also matches. + * If LV does not exist -1 returned + * If LV size mismatches, returnes 1 also lv_size is updated with actual + * size + */ +int +bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, + uint64_t *lv_size, uuid_t uuid) +{ + char *path = NULL; + int ret = -1; + bd_gfid_t gfid = {0, }; + bd_priv_t *priv = this->private; + struct stat stbuf = {0, }; + uint64_t size = 0; + vg_t vg = NULL; + lv_t lv = NULL; + char *bytes = NULL; + + bytes = strrchr (bd, ':'); + if (bytes) { + *bytes = '\0'; + bytes++; + gf_string2bytesize (bytes, &size); + } + + if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) { + gf_log (this->name, GF_LOG_WARNING, + "invalid xattr %s", bd); + return -1; + } + *type = gf_strdup (bd); + + /* + * Check if LV really exist, there could be a failure + * after setxattr and successful LV creation + */ + uuid_utoa_r (uuid, gfid); + gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid); + if (!path) { + gf_log (this->name, GF_LOG_WARNING, + "insufficient memory"); + return 0; + } + + /* Destination file does not exist */ + if (stat (path, &stbuf)) { + gf_log (this->name, GF_LOG_WARNING, + "lstat failed for path %s", path); + return -1; + } + + vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); + if (!vg) { + gf_log (this->name, GF_LOG_WARNING, + "VG %s does not exist?", priv->vg); + ret = -1; + goto out; + } + + lv = lvm_lv_from_name (vg, gfid); + if (!lv) { + gf_log (this->name, GF_LOG_WARNING, + "LV %s does not exist", gfid); + ret = -1; + goto out; + } + + *lv_size = lvm_lv_get_size (lv); + if (size == *lv_size) { + ret = 0; + goto out; + } + + ret = 1; + +out: + if (vg) + lvm_vg_close (vg); + + GF_FREE (path); + return ret; +} + +static int +create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent) +{ + int ret = -1; + runner_t runner = {0, }; + char *path = NULL; + struct stat stat = {0, }; + + runinit (&runner); + runner_add_args (&runner, LVM_CREATE, NULL); + runner_add_args (&runner, "--thin", NULL); + runner_argprintf (&runner, "%s/%s", vg, pool); + runner_add_args (&runner, "--name", NULL); + runner_argprintf (&runner, "%s", lv); + runner_add_args (&runner, "--virtualsize", NULL); + runner_argprintf (&runner, "%ldB", extent); + runner_start (&runner); + runner_end (&runner); + + gf_asprintf (&path, "/dev/%s/%s", vg, lv); + if (!path) { + ret = ENOMEM; + goto out; + } + if (lstat (path, &stat) < 0) + ret = EAGAIN; + else + ret = 0; +out: + GF_FREE (path); + return ret; +} + +int +bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv) +{ + int ret = 0; + vg_t vg = NULL; + bd_gfid_t gfid = {0, }; + + uuid_utoa_r (uuid, gfid); + + if (!strcmp (type, BD_THIN)) + return create_thin_lv (priv->vg, priv->pool, gfid, + size); + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return ENOENT; + } + + if (!lvm_vg_create_lv_linear (vg, gfid, size)) { + gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear " + "failed"); + ret = errno; + } + + lvm_vg_close (vg); + + return ret; +} + +int32_t +bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size) +{ + uint64_t new_size = 0; + runner_t runner = {0, }; + bd_gfid_t gfid = {0, }; + int ret = 0; + vg_t vg = NULL; + lv_t lv = NULL; + + uuid_utoa_r (uuid, gfid); + + runinit (&runner); + + runner_add_args (&runner, LVM_RESIZE, NULL); + runner_argprintf (&runner, "%s/%s", priv->vg, gfid); + runner_argprintf (&runner, "-L%ldb", size); + runner_add_args (&runner, "-f", NULL); + + runner_start (&runner); + runner_end (&runner); + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return EAGAIN; + } + + lv = lvm_lv_from_name (vg, gfid); + if (!lv) { + gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found", gfid); + ret = EIO; + goto out; + } + new_size = lvm_lv_get_size (lv); + + if (new_size != size) { + gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %ld does " + "not match requested size %ld", new_size, size); + ret = EIO; + } + +out: + lvm_vg_close (vg); + return ret; +} + +uint64_t +bd_get_default_extent (bd_priv_t *priv) +{ + vg_t vg = NULL; + uint64_t size = 0; + + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + return 0; + } + + size = lvm_vg_get_extent_size (vg); + + lvm_vg_close (vg); + + return size; +} + +/* + * Adjusts the user specified size to VG specific extent size + */ +uint64_t +bd_adjust_size (bd_priv_t *priv, uint64_t size) +{ + uint64_t extent = 0; + uint64_t nr_ex = 0; + + extent = bd_get_default_extent (priv); + if (!extent) + return 0; + + nr_ex = size / extent; + if (size % extent) + nr_ex++; + + size = extent * nr_ex; + + return size; +} + +int +bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno) +{ + vg_t vg = NULL; + lv_t lv = NULL; + int ret = -1; + + *op_errno = 0; + vg = lvm_vg_open (priv->handle, priv->vg, "w", 0); + if (!vg) { + gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + *op_errno = ENOENT; + return -1; + } + lv = lvm_lv_from_name (vg, lv_name); + if (!lv) { + gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name); + *op_errno = ENOENT; + goto out; + } + ret = lvm_vg_remove_lv (lv); + if (ret < 0) { + gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed", + lv_name); + *op_errno = errno; + goto out; + } +out: + lvm_vg_close (vg); + + return ret; +} diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c new file mode 100644 index 00000000000..5fa15c542c0 --- /dev/null +++ b/xlators/storage/bd/src/bd.c @@ -0,0 +1,2047 @@ +/* + BD translator V2 - Exports Block devices on server side as regular + files to client + + Now only exporting Logical volumes supported. + + Copyright IBM, Corp. 2013 + + This file is part of GlusterFS. + + Author: + M. Mohan Kumar <mohan@in.ibm.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <lvm2app.h> +#include <openssl/md5.h> +#include <time.h> +#include <linux/fs.h> +#include <sys/ioctl.h> + +#include "bd.h" +#include "defaults.h" +#include "glusterfs3-xdr.h" +#include "run.h" +#include "protocol-common.h" +#include "checksum.h" + +/* + * Call back function for setxattr and removexattr. + * does not do anything. FIXME: How to handle remove/setxattr failure + */ +int +bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + STACK_DESTROY (frame->root); + return 0; +} + +/* + * returns 0 if a file is mapped to BD or not. + */ +int +bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid, + char **type, uint64_t *size) +{ + char *bd_xattr = NULL; + char *bd = NULL; + int ret = -1; + loc_t loc = {0, }; + dict_t *dict = NULL; + char *p = NULL; + call_frame_t *bd_frame = NULL; + + if (!xattr) + return 1; + + if (dict_get_str (xattr, BD_XATTR, &p)) + return 1; + + bd_xattr = gf_strdup (p); + + memcpy (loc.gfid, gfid, sizeof (uuid_t)); + + bd_frame = copy_frame (frame); + BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out); + + ret = bd_validate_bd_xattr (this, bd_xattr, type, size, gfid); + if (ret < 0) {/* LV does not exist */ + STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->removexattr, &loc, + BD_XATTR, NULL); + + gf_log (this->name, GF_LOG_WARNING, + "Mapped LV not available for posix file <gfid:%s>, " + "deleting mapping", uuid_utoa (gfid)); + } else if (ret == 1) { + /* BD_XATTR size and LV size mismatch. Update BD_XATTR */ + gf_asprintf (&bd, "%s:%ld", *type, *size); + + dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (dict, ret, out); + + ret = dict_set_dynstr (dict, BD_XATTR, bd); + if (ret) + goto out; + + STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0, + NULL); + } + +out: + dict_del (xattr, BD_XATTR); + GF_FREE (bd_xattr); + GF_FREE (bd); + return ret; +} + +/* + * bd_lookup_cbk: Call back from posix_lookup. + */ +int32_t +bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + int ret = -1; + bd_attr_t *bdatt = NULL; + uint64_t size = 0; + char *type = BD_TYPE_NONE; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + /* iatt already cached */ + if (!bd_inode_ctx_get (inode, this, &bdatt)) + goto next; + + if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid, &type, &size)) + goto out; + + /* BD file, update buf */ + bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + if (!bdatt) { + op_errno = ENOMEM; + goto out; + } + memcpy (&bdatt->iatt, buf, sizeof (struct iatt)); + bdatt->type = type; + + /* Cache LV size in inode_ctx */ + ret = bd_inode_ctx_set (inode, this, bdatt); + if (ret < 0) { + GF_FREE (bdatt); + op_errno = EINVAL; + goto out; + } + + bdatt->iatt.ia_size = size; + bdatt->iatt.ia_blocks = size / 512; + +next: + dict_del (xattr, GF_CONTENT_KEY); + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, + xattr, postparent); + return 0; +} + +/* + * bd_lookup: Issues posix_lookup to find out if file is mapped to BD + * bd_lookup -> posix_lookup -> bd_lookup_cbk +*/ +int32_t +bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + dict_t *bd_xattr = NULL; + bd_attr_t *bdatt = NULL; + int op_errno = EINVAL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (this->private, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) { + if (!xattr_req) { + bd_xattr = dict_new (); + BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out); + xattr_req = bd_xattr; + } + if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0) + goto out; + } + + STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, loc, xattr_req); + + if (bd_xattr) + dict_unref (bd_xattr); + return 0; +out: + BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + + return 0; +} + +int +bd_forget (xlator_t *this, inode_t *inode) +{ + int ret = -1; + uint64_t ctx = 0; + bd_attr_t *bdatt = NULL; + + ret = bd_inode_ctx_get (inode, this, &bdatt); + if (!ret) { + inode_ctx_del (inode, this, &ctx); + FREE (bdatt); + } + return 0; +} + +int +bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, gf_dirent_t *entries, dict_t *xdata) +{ + gf_dirent_t *entry = NULL; + uint64_t size = 0; + char *type = NULL; + + if (op_ret < 0) + goto out; + + list_for_each_entry (entry, &entries->list, list) { + if (entry->d_type != DT_REG) + continue; + if (!bd_get_bd_info (frame, this, entry->dict, + entry->d_stat.ia_gfid, &type, &size)) { + entry->d_stat.ia_size = size; + entry->d_stat.ia_blocks = size / 512; + FREE (type); + } + } + +out: + BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata); + return 0; +} + +/* + * bd_readdirp: In bd_readdirp_cbk if the file and BD_XATTR_SIZE is set + * ia_size is updated with the LV(BD_XATTR_SIZE) size + */ +int32_t +bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t off, dict_t *dict) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + if (!dict) { + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + dict = local->dict; + } + + if (dict_set_int8 (dict, BD_XATTR, 0)) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set key %s", BD_XATTR); + goto out; + } + + STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict); + + return 0; +out: + BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict); + return 0; +} + +int +bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *buf, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + /* update buf with LV size */ + if (!bd_inode_ctx_get (local->inode, this, &bdatt)) + memcpy (buf, bdatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (this->private, out); + + if (!bd_inode_ctx_get (loc->inode, this, &bdatt)) { + BD_STACK_UNWIND (stat, frame, 0, 0, &bdatt->iatt, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + local->inode = inode_ref (loc->inode); + + STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + return 0; +out: + BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata); + return 0; +} + +int +bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *buff, dict_t *xdata) +{ + uint64_t size = 0; + uint64_t fr_size = 0; + bd_priv_t *priv = NULL; + vg_t vg = NULL; + + if (op_ret < 0) + goto out; + + priv = this->private; + + vg = lvm_vg_open (priv->handle, priv->vg, "r", 0); + if (!vg) { + gf_log (this->name, GF_LOG_WARNING, "opening VG %s failed", + priv->vg); + op_ret = -1; + op_errno = EAGAIN; + goto out; + } + size = lvm_vg_get_size (vg); + fr_size = lvm_vg_get_free_size (vg); + lvm_vg_close (vg); + + buff->f_blocks += size / buff->f_frsize; + buff->f_bfree += fr_size / buff->f_frsize; + buff->f_bavail += fr_size / buff->f_frsize; + +out: + BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata); + return 0; +} + +/* + * bd_statfs: Mimics statfs by returning used/free extents in the VG + */ +int +bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, loc, xdata); + return 0; +out: + BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL); + return 0; +} + +int +bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *buf, dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + bd_local_t *local = frame->local; + + /* only regular files are part of BD object */ + if (op_ret < 0 || buf->ia_type != IA_IFREG) + goto out; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + /* update buf with LV size */ + if (!bd_inode_ctx_get (local->inode, this, &bdatt)) + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + return 0; +} + +int +bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int op_errno = EINVAL; + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + /* if its already cached return it */ + if (!bd_inode_ctx_get (fd->inode, this, &bdatt)) { + BD_STACK_UNWIND (fstat, frame, 0, 0, &bdatt->iatt, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (fd->inode); + + STACK_WIND (frame, bd_fstat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + + return 0; +out: + BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata); + return 0; +} + +static inline void +bd_update_amtime (struct iatt *iatt, int flag) +{ + struct timespec ts = {0, }; + + clock_gettime (CLOCK_REALTIME, &ts); + if (flag & GF_SET_ATTR_ATIME) { + iatt->ia_atime = ts.tv_sec; + iatt->ia_atime_nsec = ts.tv_nsec; + } + if (flag & GF_SET_ATTR_MTIME) { + iatt->ia_mtime = ts.tv_sec; + iatt->ia_mtime_nsec = ts.tv_nsec; + } +} + +/* + * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD + * file + */ +int +bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) +{ + int ret = -1; + int _fd = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + bd_fd_t *bd_fd = NULL; + struct iovec vec = {0, }; + struct iobuf *iobuf = NULL; + struct iobref *iobref = NULL; + uint64_t bd_size = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, size, offset, flags, xdata); + return 0; + } + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size); + goto out; + } + iobuf = iobuf_get2 (this->ctx->iobuf_pool, size); + if (!iobuf) { + op_errno = ENOMEM; + goto out; + } + _fd = bd_fd->fd; + op_ret = pread (_fd, iobuf->ptr, size, offset); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "read failed on fd=%p: %s", fd, + strerror (op_errno)); + goto out; + } + + vec.iov_base = iobuf->ptr; + vec.iov_len = op_ret; + + iobref = iobref_new (); + iobref_add (iobref, iobuf); + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + op_errno = EINVAL; + op_ret = -1; + goto out; + } + bd_size = bdatt->iatt.ia_size; + if (!bd_size || (offset + vec.iov_len) >= bd_size) + op_errno = ENOENT; + + op_ret = vec.iov_len; + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME); + +out: + BD_STACK_UNWIND (readv, frame, op_ret, op_errno, + &vec, 1, &bdatt->iatt, iobref, NULL); + + if (iobref) + iobref_unref (iobref); + if (iobuf) + iobuf_unref (iobuf); + + return 0; +} + +#ifdef BLKDISCARD +/* + * bd_discard: Sends BLKDISCARD ioctl to the block device + */ +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + int ret = -1; + int op_errno = EINVAL; + bd_fd_t *bd_fd = NULL; + uint64_t param[2] = {0, }; + bd_attr_t *bdatt = NULL; + struct iatt prebuf = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + /* posix */ + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + STACK_WIND (frame, default_discard_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->discard, + fd, offset, len, xdata); + return 0; + } + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + op_errno = EINVAL; + goto out; + } + + param[0] = offset; + param[1] = len; + ret = ioctl (bd_fd->fd, BLKDISCARD, param); + if (ret < 0) { + if (errno == ENOTTY) + op_errno = ENOSYS; + else + op_errno = errno; + goto out; + } + memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + + BD_STACK_UNWIND (discard, frame, ret, op_errno, &prebuf, + &bdatt->iatt, xdata); + return 0; + +out: + BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} +#else + +int +bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL); + return 0; +} +#endif + +/* + * Call back from posix_open for opening the backing posix file + * If it failed, close BD fd + */ +int +bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd, dict_t *xdata) +{ + bd_fd_t *bd_fd = NULL; + bd_attr_t *bdatt = NULL; + + if (!op_ret) + goto out; + + bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!bdatt) /* posix file */ + goto out; + + /* posix open failed */ + if (bd_fd_ctx_get (this, fd, &bd_fd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "bd_fd is NULL from fd=%p", fd); + goto out; + } + close (bd_fd->fd); + GF_FREE (bd_fd); + +out: + BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL); + + return 0; +} + +/* + * bd_open: Opens BD file if given posix file is mapped to BD. Also opens + * posix file. + * fd contains both posix and BD fd + */ +int32_t +bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + int32_t ret = EINVAL; + bd_fd_t *bd_fd = NULL; + bd_attr_t *bdatt = NULL; + bd_gfid_t gfid = {0, }; + char *devpath = NULL; + bd_priv_t *priv = this->private; + int _fd = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (fd, out); + + /* not bd file */ + if (fd->inode->ia_type != IA_IFREG || + bd_inode_ctx_get (fd->inode, this, &bdatt)) + goto posix; + + uuid_utoa_r (fd->inode->gfid, gfid); + asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid); + BD_VALIDATE_MEM_ALLOC (devpath, ret, out); + + _fd = open (devpath, flags | O_LARGEFILE, 0); + if (_fd < 0) { + ret = errno; + gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath, + strerror (ret)); + goto out; + } + bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd); + BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out); + + bd_fd->fd = _fd; + bd_fd->flag = flags | O_LARGEFILE; + if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) { + gf_log (this->name, GF_LOG_WARNING, + "failed to set the fd context fd=%p", fd); + goto out; + } + + ret = 0; + +posix: + + /* open posix equivalant of this file, fd needed for fd related + operations like fsetxattr, ftruncate etc */ + STACK_WIND (frame, bd_open_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + + return 0; +out: + BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL); + + FREE (devpath); + if (ret) { + close (_fd); + GF_FREE (bd_fd); + } + + return 0; +} + +/* + * call back from posix_setattr after updating iatt to posix file. + */ +int +bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = local->bdatt; + + BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &bdatt->iatt, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_do_fsync (int fd, int datasync) +{ + int op_errno = 0; + +#ifdef HAVE_FDATASYNC + if (datasync) { + if (fdatasync (fd)) { + op_errno = errno; + gf_log (THIS->name, GF_LOG_ERROR, + "fdatasync on fd=%d failed: %s", + fd, strerror (errno)); + } + + } else +#endif + { + if (fsync (fd)) { + op_errno = errno; + gf_log (THIS->name, GF_LOG_ERROR, + "fsync on fd=%d failed: %s", + fd, strerror (op_errno)); + } + } + + return op_errno; +} + +/* + * bd_fsync: Syncs if BD fd, forwards the request to posix + * fsync -> posix_setattr -> posix_fsync +*/ +int32_t +bd_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t datasync, dict_t *xdata) +{ + int ret = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + struct iatt prebuf = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd || !bdatt) { + STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, fd, datasync, + xdata); + return 0; + } + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + + op_errno = bd_do_fsync (bd_fd->fd, datasync); + if (op_errno) + goto out; + + /* For BD, Update the a|mtime during full fsync only */ + if (!datasync) { + local = bd_local_init (frame, this); + /* In case of mem failure, should posix flush called ? */ + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + local->bdatt->type = gf_strdup (bdatt->type); + memcpy (&local->bdatt->iatt, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&local->bdatt->iatt, valid); + uuid_copy (local->loc.gfid, fd->inode->gfid); + STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, &local->loc, + &local->bdatt->iatt, + valid, NULL); + return 0; + } + +out: + BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata); + return 0; +} + +int +bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + bd_priv_t *priv = NULL; + bd_attr_t *bdatt = NULL; + int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME; + bd_local_t *local = NULL; + int op_errno = EINVAL; + loc_t loc = {0, }; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + if (!bdatt) + goto out; + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd || !bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "bdfd/bdatt is NULL from fd=%p", fd); + goto out; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->fd = fd_ref (fd); + uuid_copy (loc.gfid, bdatt->iatt.ia_gfid); + + /* Update the a|mtime during flush */ + STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->setattr, &loc, &bdatt->iatt, + valid, NULL); + + return 0; + +out: + STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, fd, xdata); + + return 0; +} + +int32_t +bd_release (xlator_t *this, fd_t *fd) +{ + int ret = -1; + bd_fd_t *bd_fd = NULL; + uint64_t tmp_bfd = 0; + bd_attr_t *bdatt = NULL; + bd_priv_t *priv = this->private; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (priv, out); + + ret = bd_inode_ctx_get (fd->inode, this, &bdatt); + if (ret || !bdatt) /* posix file */ + goto out; + + /* FIXME: Update amtime during release */ + + ret = fd_ctx_del (fd, this, &tmp_bfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "bfd is NULL from fd=%p", fd); + goto out; + } + bd_fd = (bd_fd_t *)(long)tmp_bfd; + + close (bd_fd->fd); + GF_FREE (bd_fd); +out: + return 0; +} + +/* + * Call back for removexattr after removing BD_XATTR incase of + * bd create failure + */ +int +bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); + else + BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata); + return 0; + +} + +/* + * Call back after setting BD_XATTR. Creates BD. If BD creation is a failure + * invokes posix_removexattr to remove created BD_XATTR + */ +int +bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto next; + + /* Create LV */ + op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size, + local->bdatt->type, this->private); + if (!op_errno) + goto out; + + /* LV creation failed, remove BD_XATTR */ + if (local->fd) + STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, + local->fd, BD_XATTR, NULL); + else + STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + &local->loc, BD_XATTR, NULL); + + return 0; +out: + + bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + if (!bdatt) { + op_ret = -1; + op_errno = ENOMEM; + goto next; + } + + memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt)); + bdatt->type = gf_strdup (local->bdatt->type); + + bd_inode_ctx_set (local->inode, THIS, bdatt); + +next: + if (local->fd) + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + else + BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata); + return 0; + +} + +/* + * Call back from posix_stat + */ +int +bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *iatt, + dict_t *xdata) +{ + char *param = NULL; + char *type = NULL; + char *s_size = NULL; + char *p = NULL; + char *copy = NULL; + bd_local_t *local = frame->local; + bd_priv_t *priv = this->private; + char *bd = NULL; + uint64_t size = 0; + + if (op_ret < 0) + goto out; + + if (!IA_ISREG (iatt->ia_type)) { + op_errno = EOPNOTSUPP; + goto out; + } + + param = copy = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char); + BD_VALIDATE_MEM_ALLOC (param, op_errno, out); + + strncpy (param, local->data->data, local->data->len); + + type = strtok_r (param, ":", &p); + if (!type) { + op_errno = EINVAL; + goto out; + } + + if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) { + gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given", + type); + op_errno = EINVAL; + goto out; + } + + s_size = strtok_r (NULL, ":", &p); + + /* If size not specified get default size */ + if (!s_size) + size = bd_get_default_extent (priv); + else + gf_string2bytesize (s_size, &size); + + gf_asprintf (&bd, "%s:%ld", type, size); + BD_VALIDATE_MEM_ALLOC (bd, op_errno, out); + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) { + op_errno = EINVAL; + goto out; + } + + local->bdatt->type = gf_strdup (type); + memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt)); + local->bdatt->iatt.ia_size = size; + + if (local->fd) + STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; + +out: + if (local->fd) + BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata); + else + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata); + + GF_FREE (bd); + GF_FREE (copy); + return 0; +} + + +/* + * bd_setxattr: Used to create & map an LV to a posix file using + * BD_XATTR xattr + * bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + */ +int32_t +bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_errno = 0; + data_t *data = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + bd_inode_ctx_get (loc->inode, this, &bdatt); + + data = dict_get (dict, BD_XATTR); + if (!data) { + /* non bd file object */ + STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags, xdata); + return 0; + } + + if (bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "%s already mapped to BD", loc->path); + op_errno = EEXIST; + goto out; + } + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (loc->inode); + loc_copy (&local->loc, loc); + local->data = data; + + STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, xdata); + + return 0; + +out: + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata); + return 0; +} + +/* + * bd_fsetxattr: Used to create/map an LV to a posix file using + * BD_XATTR xattr + * bd_fsetxattr -> posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr -> + * bd_setx_setx_cbk -> create_lv + * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk + * -> bd_fsetxattr_cbk + */ +int32_t +bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int flags, dict_t *xdata) +{ + int op_errno = 0; + data_t *data = NULL; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + bd_inode_ctx_get (fd->inode, this, &bdatt); + + data = dict_get (dict, BD_XATTR); + if (data) { + if (bdatt) { + gf_log (this->name, GF_LOG_WARNING, + "fd %p already mapped to BD", fd); + op_errno = EEXIST; + goto out; + } + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->inode = inode_ref (fd->inode); + local->fd = fd_ref (fd); + local->data = data; + + STACK_WIND(frame, bd_setx_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, xdata); + } else { + /* non bd file object */ + STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + fd, dict, flags, xdata); + } + + return 0; +out: + + BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + + return 0; +} + +int32_t +bd_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + if (!strcmp (name, BD_XATTR)) + goto out; + + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); + return 0; +out: + BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL); + return 0; +} + +int32_t +bd_fremovexattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + if (!strcmp (name, BD_XATTR)) + goto out; + + STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); + + return 0; +out: + BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL); + return 0; +} + +int +bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + + return 0; +} + +/* + * Call back for setxattr after setting BD_XATTR_SIZE. + */ +int +bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) +{ + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + struct iatt prebuf = {0, }; + char *bd = NULL; + + if (op_ret < 0) + goto out; + + bd_inode_ctx_get (local->inode, this, &bdatt); + if (!bdatt) + goto revert_xattr; + + op_errno = bd_resize (this->private, local->inode->gfid, + local->bdatt->iatt.ia_size); + if (op_errno) + goto revert_xattr; + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + /* LV resized, update new size in the cache */ + bdatt->iatt.ia_size = local->bdatt->iatt.ia_size; + + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt, + NULL); + else + BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt, + NULL); + + return 0; + +revert_xattr: + /* revert setxattr */ + op_ret = dict_get_str (local->dict, BD_XATTR, &bd); + GF_FREE (bd); + gf_asprintf (&bd, "%s:%ld", bdatt->type, bdatt->iatt.ia_size); + + if (local->fd) + STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_trunc_setxattr_setx_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL); + + return 0; +} + +/* + * call back from posix_[f]truncate_stat + * If offset > LV size, it resizes the LV and calls posix_setxattr + * to update new LV size in xattr else calls posix_setattr for updating + * the posix file so that truncate fop behaves properly + */ +int +bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, dict_t *xdata) +{ + char *bd = NULL; + bd_local_t *local = frame->local; + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto out; + + local->dict = dict_new (); + BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out); + + bd_inode_ctx_get (local->inode, this, &bdatt); + if (!bdatt) { + op_errno = EINVAL; + goto out; + } + + gf_asprintf (&bd, "%s:%ld", bdatt->type, local->bdatt->iatt.ia_size); + if (dict_set_dynstr (local->dict, BD_XATTR, bd)) { + op_errno = EINVAL; + goto out; + } + + if (local->fd) + STACK_WIND (frame, bd_trunc_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsetxattr, + local->fd, local->dict, 0, NULL); + else + STACK_WIND (frame, bd_trunc_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + &local->loc, local->dict, 0, NULL); + + return 0; +out: + if (local->fd) + BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, + NULL); + else + BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, + NULL); + GF_FREE (bd); + return 0; +} + +void +bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc, + off_t offset, bd_attr_t *bdatt) +{ + bd_local_t *local = NULL; + struct iatt prebuf = {0, }; + int op_errno = 0; + int op_ret = -1; + + /* If requested size is less than LV size, return success */ + if (offset <= bdatt->iatt.ia_size) { + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); + op_ret = 0; + goto out; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr); + BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out); + + if (fd) { + local->inode = inode_ref (fd->inode); + local->fd = fd_ref (fd); + } else { + local->inode = inode_ref (loc->inode); + loc_copy (&local->loc, loc); + } + + local->bdatt->iatt.ia_size = + bd_adjust_size (this->private, offset); + + STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, NULL); + + return; + +out: + if (fd) + BD_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, + &prebuf, &bdatt->iatt, NULL); + else + BD_STACK_UNWIND (truncate, frame, op_ret, op_errno, + &prebuf, &bdatt->iatt, NULL); + return; +} + +/* + * bd_ftruncate: Resizes a LV if fd belongs to BD. + */ +int32_t +bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + STACK_WIND (frame, default_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, fd, + offset, xdata); + return 0; + } + + bd_do_trunc (frame, this, fd, NULL, offset, bdatt); + return 0; +out: + BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +/* + * bd_truncate: Resizes a LV if file maps to LV. + */ +int32_t +bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND (frame, default_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, loc, + offset, xdata); + return 0; + } + + bd_do_trunc (frame, this, NULL, loc, offset, bdatt); + return 0; + +out: + BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset, + uint64_t bd_size) +{ + int index = 0; + int retval = 0; + off_t internal_offset = 0; + + if (!vector) + return -EFAULT; + + retval = pwritev (fd, vector, count, offset); + if (retval == -1) { + gf_log (THIS->name, GF_LOG_WARNING, + "base %p, length %ld, offset %ld, message %s", + vector[index].iov_base, vector[index].iov_len, + internal_offset, strerror (errno)); + retval = -errno; + goto err; + } +/* + + + internal_offset = offset; + for (index = 0; index < count; index++) { + if (internal_offset > bd_size) { + op_ret = -ENOSPC; + goto err; + } + if (internal_offset + vector[index].iov_len > bd_size) { + vector[index].iov_len = bd_size - internal_offset; + no_space = 1; + } + retval = pwritev (fd, vector[index].iov_base, + vector[index].iov_len, internal_offset); + if (retval == -1) { + gf_log (THIS->name, GF_LOG_WARNING, + "base %p, length %ld, offset %ld, message %s", + vector[index].iov_base, vector[index].iov_len, + internal_offset, strerror (errno)); + op_ret = -errno; + goto err; + } + op_ret += retval; + internal_offset += retval; + if (no_space) + break; + } +*/ +err: + return retval; +} + +/* + * bd_writev: Writes to LV if its BD file or forwards the request to posix_write + * bd_writev -> posix_writev -> bd_writev_cbk + */ +int +bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdict) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + bd_fd_t *bd_fd = NULL; + int ret = -1; + uint64_t size = 0; + struct iatt prebuf = {0, }; + bd_attr_t *bdatt = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (vector, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { /* posix fd */ + STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, fd, vector, count, + offset, flags, iobref, xdict); + return 0; + } + + _fd = bd_fd->fd; + + if (bd_inode_ctx_get (fd->inode, this, &bdatt)) { + op_ret = -1; + op_errno = EINVAL; + goto out; + } + size = bdatt->iatt.ia_size; + + op_ret = __bd_pwritev (_fd, vector, count, offset, size); + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64 + ", %s", offset, strerror (op_errno)); + goto out; + } + + memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt)); + bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME); +out: + + BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf, + &bdatt->iatt, NULL); + return 0; +} + +int +bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct iatt *prebuf, struct iatt *postbuf, + dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + int *valid = cookie; + bd_local_t *local = frame->local; + + if (op_ret < 0 || !valid || !local) + goto out; + + if (bd_inode_ctx_get (local->inode, this, &bdatt)) + goto out; + + if (*valid & GF_SET_ATTR_UID) + bdatt->iatt.ia_uid = postbuf->ia_uid; + else if (*valid & GF_SET_ATTR_GID) + bdatt->iatt.ia_gid = postbuf->ia_gid; + else if (*valid & GF_SET_ATTR_MODE) { + bdatt->iatt.ia_type = postbuf->ia_type; + bdatt->iatt.ia_prot = postbuf->ia_prot; + } else if (*valid & GF_SET_ATTR_ATIME) { + bdatt->iatt.ia_atime = postbuf->ia_atime; + bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec; + } else if (*valid & GF_SET_ATTR_MTIME) { + bdatt->iatt.ia_mtime = postbuf->ia_mtime; + bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec; + } + + bdatt->iatt.ia_ctime = postbuf->ia_ctime; + bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec; + + memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt)); +out: + FREE (valid); + BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + +int +bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) +{ + bd_local_t *local = NULL; + bd_attr_t *bdatt = NULL; + int *ck_valid = NULL; + int op_errno = 0; + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + ck_valid = CALLOC (1, sizeof (valid)); + BD_VALIDATE_MEM_ALLOC (ck_valid, op_errno, out); + + local->inode = inode_ref (loc->inode); + *ck_valid = valid; + + STACK_WIND_COOKIE (frame, bd_setattr_cbk, ck_valid, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, + loc, stbuf, valid, xdata); + + return 0; +out: + BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata); + return 0; +} + +int +bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) +{ + bd_attr_t *bdatt = NULL; + + if (op_ret < 0) + goto out; + + if (bd_inode_ctx_get (inode, this, &bdatt)) + goto out; + + bdatt->iatt.ia_ctime = buf->ia_ctime; + bdatt->iatt.ia_ctime_nsec = buf->ia_ctime_nsec; + bdatt->iatt.ia_nlink = buf->ia_nlink; + memcpy (buf, &bdatt->iatt, sizeof (struct iatt)); + +out: + BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf, + preparent, postparent, NULL); + return 0; +} + +int +bd_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc, dict_t *xdata) +{ + STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; +} + +int +bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc, + fd_t *fd, const char *name, dict_t *xdata) +{ + dict_t *xattr = NULL; + int op_ret = -1; + int op_errno = ENOMEM;; + bd_priv_t *priv = this->private; + + xattr = dict_new (); + if (!xattr) + goto out; + + if (!strcmp (name, VOL_TYPE)) + op_ret = dict_set_int64 (xattr, (char *)name, 1); + else + op_ret = dict_set_int64 (xattr, (char *)name, priv->caps); + +out: + if (loc) + BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, + xdata); + else + BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, + xdata); + + op_ret = dict_reset (xattr); + dict_unref (xattr); + + return 0; +} + +int +bd_fgetxattr (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *name, dict_t *xdata) +{ + if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS))) + bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata); + else + STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, + fd, name, xdata); + return 0; +} + +int +bd_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name, dict_t *xdata) +{ + if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS))) + bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata); + else + STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name, xdata); + + return 0; +} + +int +bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) +{ + bd_gfid_t gfid = {0, }; + bd_local_t *local = frame->local; + + if (buf->ia_nlink > 1) + goto posix; + + BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out); + + uuid_utoa_r (inode->gfid, gfid); + if (bd_delete_lv (this->private, gfid, &op_errno) < 0) { + if (op_errno != ENOENT) + goto out; + } + +posix: + /* remove posix */ + STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + &local->loc, 0, NULL); + + return 0; +out: + BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +bd_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, int xflag, dict_t *xdata) +{ + int op_errno = 0; + bd_attr_t *bdatt = NULL; + bd_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + if (bd_inode_ctx_get (loc->inode, this, &bdatt)) { + STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc, xflag, xdata); + return 0; + } + + local = bd_local_init (frame, this); + BD_VALIDATE_MEM_ALLOC (local, op_errno, out); + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, NULL); + return 0; +out: + BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +bd_priv (xlator_t *this) +{ + return 0; +} + +int32_t +bd_inode (xlator_t *this) +{ + return 0; +} + +int32_t +bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + int32_t len, dict_t *xdata) +{ + int op_ret = -1; + int op_errno = 0; + int ret = 0; + int _fd = -1; + char *alloc_buf = NULL; + char *buf = NULL; + int32_t weak_checksum = 0; + bd_fd_t *bd_fd = NULL; + unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0}; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = bd_fd_ctx_get (this, fd, &bd_fd); + if (ret < 0 || !bd_fd) { + STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->rchecksum, fd, offset, + len, xdata); + return 0; + } + + memset (strong_checksum, 0, MD5_DIGEST_LENGTH); + + alloc_buf = page_aligned_alloc (len, &buf); + if (!alloc_buf) { + op_errno = ENOMEM; + goto out; + } + + _fd = bd_fd->fd; + + LOCK (&fd->lock); + { + ret = pread (_fd, buf, len, offset); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "pread of %d bytes returned %d (%s)", + len, ret, strerror (errno)); + op_errno = errno; + } + } + UNLOCK (&fd->lock); + + if (ret < 0) + goto out; + + weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf, + (size_t) len); + gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len, + (unsigned char *) strong_checksum); + + op_ret = 0; +out: + BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno, + weak_checksum, strong_checksum, NULL); + + GF_FREE (alloc_buf); + + return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) +{ + switch (event) + { + case GF_EVENT_PARENT_UP: + { + /* Tell the parent that bd xlator is up */ + default_notify (this, GF_EVENT_CHILD_UP, data); + } + break; + default: + break; + } + return 0; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + if (!this) + return ret; + + ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1); + + if (ret != 0) + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; +} + +/** + * bd xlator init - Validate configured VG + */ +int +init (xlator_t *this) +{ + int ret = 0; + char *vg_data = NULL; + char *device = NULL; + bd_priv_t *_private = NULL; + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: storage/bd needs posix as subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "Volume is dangling. Please check the volume file."); + } + + GF_OPTION_INIT ("export", vg_data, str, error); + GF_OPTION_INIT ("device", device, str, error); + + /* Now we support only LV device */ + if (strcasecmp (device, BACKEND_VG)) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: unknown %s backend %s", BD_XLATOR, device); + return -1; + } + + this->local_pool = mem_pool_new (bd_local_t, 64); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_CRITICAL, + "FATAL: Failed to create bd memory pool"); + return -1; + } + + ret = 0; + _private = GF_CALLOC (1, sizeof (*_private), gf_bd_private); + if (!_private) + goto error; + + this->private = _private; + _private->vg = gf_strdup (vg_data); + if (!_private->vg) + goto error; + + _private->handle = lvm_init (NULL); + if (!_private->handle) { + gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed"); + goto error; + } + _private->caps = BD_CAPS_BD; + if (bd_scan_vg (this, _private)) + goto error; + + return 0; +error: + GF_FREE (_private->vg); + if (_private->handle) + lvm_quit (_private->handle); + mem_pool_destroy (this->local_pool); + GF_FREE (_private); + return -1; +} + +void +fini (xlator_t *this) +{ + bd_priv_t *priv = this->private; + mem_pool_destroy (this->local_pool); + this->local_pool = NULL; + if (!priv) + return; + lvm_quit (priv->handle); + GF_FREE (priv->vg); + this->private = NULL; + GF_FREE (priv); + return; +} + +struct xlator_dumpops dumpops = { + .priv = bd_priv, + .inode = bd_inode, +}; + +struct xlator_fops fops = { + .readdirp = bd_readdirp, + .lookup = bd_lookup, + .stat = bd_stat, + .statfs = bd_statfs, + .open = bd_open, + .fstat = bd_fstat, + .rchecksum = bd_rchecksum, + .readv = bd_readv, + .fsync = bd_fsync, + .setxattr = bd_setxattr, + .fsetxattr = bd_fsetxattr, + .removexattr = bd_removexattr, + .fremovexattr=bd_fremovexattr, + .truncate = bd_truncate, + .ftruncate = bd_ftruncate, + .writev = bd_writev, + .getxattr = bd_getxattr, + .fgetxattr = bd_fgetxattr, + .unlink = bd_unlink, + .link = bd_link, + .flush = bd_flush, + .setattr = bd_setattr, + .discard = bd_discard, +}; + +struct xlator_cbks cbks = { + .release = bd_release, + .forget = bd_forget, +}; + +struct volume_options options[] = { + { .key = {"export"}, + .type = GF_OPTION_TYPE_STR}, + { .key = {"device"}, + .type = GF_OPTION_TYPE_STR, + .default_value = BACKEND_VG}, + { .key = {NULL} } +}; diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h new file mode 100644 index 00000000000..4d8b8954524 --- /dev/null +++ b/xlators/storage/bd/src/bd.h @@ -0,0 +1,140 @@ +/* + BD translator - Exports Block devices on server side as regular + files to client + + Copyright IBM, Corp. 2012 + + This file is part of GlusterFS. + + Author: + M. Mohan Kumar <mohan@in.ibm.com> + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + +#ifndef _BD_H +#define _BD_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "mem-types.h" + +#define BD_XLATOR "block device mapper xlator" +#define BACKEND_VG "vg" +#define GF_XATTR "user.glusterfs" +#define BD_XATTR GF_XATTR ".bd" + +#define BD_LV "lv" +#define BD_THIN "thin" + +#define LVM_RESIZE "/sbin/lvresize" +#define LVM_CREATE "/sbin/lvcreate" + +#define VOL_TYPE "volume.type" +#define VOL_CAPS "volume.caps" + +#define ALIGN_SIZE 4096 + +#define BD_CAPS_BD 0x01 +#define BD_CAPS_THIN 0x02 + +#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \ + if (!buff) { \ + op_errno = ENOMEM; \ + gf_log (this->name, GF_LOG_ERROR, "out of memory"); \ + goto label; \ + } + +#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label) \ + if (!local) { \ + op_errno = EINVAL; \ + goto label; \ + } + +#define BD_STACK_UNWIND(typ, frame, args ...) do { \ + bd_local_t *__local = frame->local; \ + xlator_t *__this = frame->this; \ + \ + frame->local = NULL; \ + STACK_UNWIND_STRICT (typ, frame, args); \ + if (__local) \ + bd_local_free (__this, __local); \ + } while (0) + +typedef char bd_gfid_t[GF_UUID_BUF_SIZE]; + +enum gf_bd_mem_types_ { + gf_bd_private = gf_common_mt_end + 1, + gf_bd_attr, + gf_bd_fd, + gf_bd_mt_end +}; + +/** + * bd_fd - internal structure + */ +typedef struct bd_fd { + int fd; + int32_t flag; +} bd_fd_t; + +typedef struct bd_priv { + lvm_t handle; + char *vg; + char *pool; + int caps; +} bd_priv_t; + + +typedef enum bd_type { + BD_TYPE_NONE, + BD_TYPE_LV, +} bd_type_t; + +typedef struct { + struct iatt iatt; + char *type; +} bd_attr_t; + +typedef struct { + dict_t *dict; + bd_attr_t *bdatt; + inode_t *inode; + loc_t loc; + fd_t *fd; + data_t *data; /* for setxattr */ +} bd_local_t; + +typedef struct { + char *lv; + struct list_head list; +} bd_del_entry; + +/* Prototypes */ +int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx); +int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx); +int bd_scan_vg (xlator_t *this, bd_priv_t *priv); +bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this); +void bd_local_free (xlator_t *this, bd_local_t *local); +int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd); +char *page_aligned_alloc (size_t size, char **aligned_buf); +int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type, + uint64_t *lv_size, uuid_t uuid); +uint64_t bd_get_default_extent (bd_priv_t *priv); +uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size); +int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv); +int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size); +int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno); +int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv); +int bd_clone (bd_local_t *local, bd_priv_t *priv); + +int bd_merge (bd_priv_t *priv, uuid_t gfid); +int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict); +#endif |