From 48c40e1a42efe1b59126406084821947d139dd0e Mon Sep 17 00:00:00 2001 From: "M. Mohan Kumar" Date: Wed, 13 Nov 2013 22:44:42 +0530 Subject: bd: posix/multi-brick support to BD xlator Current BD xlator (block backend) has a few limitations such as * Creation of directories not supported * Supports only single brick * Does not use extended attributes (and client gfid) like posix xlator * Creation of special files (symbolic links, device nodes etc) not supported The basic limitation of not allowing directory creation is blocking oVirt/VDSM from consuming BD xlator as part of a Gluster domain, since VDSM creates multi-level directories when GlusterFS is used as the storage backend for storing VM images. To overcome these limitations a new BD xlator with the following improvements is suggested. * New hybrid BD xlator that handles both regular files and block device files * The volume will have both POSIX and BD bricks. Regular files are created on POSIX bricks, block devices are created on the BD brick (VG) * BD xlator leverages the existing POSIX xlator for most POSIX calls and hence sits above the POSIX xlator * A block device file is differentiated from a regular file by an extended attribute * The xattr 'user.glusterfs.bd' (BD_XATTR) plays a role in mapping a posix file to a Logical Volume (LV). * When a client sends a request to set BD_XATTR on a posix file, a new LV is created and mapped to the posix file. So every block device will have a representative file in the POSIX brick with 'user.glusterfs.bd' (BD_XATTR) set. * Hereafter, all operations on this file result in LV-related operations. For example, opening a file that has BD_XATTR set results in opening the LV block device, and reading results in reading the corresponding LV block device. When BD xlator gets a request to set BD_XATTR via a setxattr call, it creates an LV and information about this LV is placed in the xattr of the posix file. The xattr "user.glusterfs.bd" is used to identify that a posix file is mapped to BD. 
Usage: Server side: [root@host1 ~]# gluster volume create bdvol host1:/storage/vg1_info?vg1 host2:/storage/vg2_info?vg2 It creates a distributed gluster volume 'bdvol' with Volume Group vg1 using posix brick /storage/vg1_info in host1 and Volume Group vg2 using /storage/vg2_info in host2. [root@host1 ~]# gluster volume start bdvol Client side: [root@node ~]# mount -t glusterfs host1:/bdvol /media [root@node ~]# touch /media/posix It creates a regular posix file 'posix' in either the host1:/vg1 or the host2:/vg2 brick [root@node ~]# mkdir /media/image [root@node ~]# touch /media/image/lv1 It also creates a regular posix file 'lv1' in either the host1:/vg1 or the host2:/vg2 brick [root@node ~]# setfattr -n "user.glusterfs.bd" -v "lv" /media/image/lv1 [root@node ~]# The above setxattr results in creating a new LV in the corresponding brick's VG and it sets 'user.glusterfs.bd' with value 'lv: --deltag > Changes from previous version V5: * Removed support for delayed deleting of LVs Changes from previous version V4: * Consolidated the patches * Removed usage of BD_XATTR_SIZE and consolidated it in BD_XATTR. Changes from previous version V3: * Added support in FUSE to support full/linked clone * Added support to merge snapshots and provide information about origin * bd_map xlator removed * iatt structure used in inode_ctx. iatt is cached and updated during fsync/flush * aio support * Type and capabilities of volume are exported through getxattr Changes from version 2: * Used inode_context for caching BD size and to check if loc/fd is BD or not. * Added GlusterFS server offloaded copy and snapshot through setfattr FOP. As part of this libgfapi is modified. * BD xlator supports stripe * During unlinking, if an LV file is already open, it is added to the delete list and bd_del_thread tries to delete from this list when the last reference to that file is closed. Changes from previous version: * gfid is used as name of LV * ? is used to specify VG name for creating BD volume in volume create, add-brick. 
gluster volume create volname host:/path?vg * open-behind issue is fixed * A replicate brick can be added dynamically and LVs from source brick are replicated to destination brick * A distribute brick can be added dynamically and rebalance operation distributes existing LVs/files to the new brick * Thin provisioning support added. * bd_map xlator support retained * setfattr -n user.glusterfs.bd -v "lv" creates a regular LV and setfattr -n user.glusterfs.bd -v "thin" creates thin LV * Capability and backend information added to gluster volume info (and --xml) so that management tools can exploit BD xlator. * tracing support for bd xlator added TODO: * Add support to display snapshots for a given LV * Display posix filename for list-origin instead of gfid Change-Id: I00d32dfbab3b7c806e0841515c86c3aa519332f2 BUG: 1028672 Signed-off-by: M. Mohan Kumar Reviewed-on: http://review.gluster.org/4809 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- cli/src/cli-cmd-volume.c | 7 +++++- cli/src/cli-rpc-ops.c | 42 ++++++++++++++++++++++++++++++++++ cli/src/cli-xml-output.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 106 insertions(+), 2 deletions(-) (limited to 'cli') diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index dcb317f54..100be0b73 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -1829,7 +1829,12 @@ struct cli_cmd volume_cmds[] = { "list information of all volumes"}, { "volume create [stripe ] [replica ] " - "[transport ] ... [force]", + "[transport ] " +#ifdef HAVE_BD_XLATOR + "?" +#endif + "... 
[force]", + cli_cmd_volume_create_cbk, "create a new volume of specified type with mentioned bricks"}, diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 0ff997dc5..808145015 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -496,6 +496,8 @@ gf_cli_get_volume_cbk (struct rpc_req *req, struct iovec *iov, char key[1024] = {0}; char err_str[2048] = {0}; gf_cli_rsp rsp = {0}; + char *caps = NULL; + int k __attribute__((unused)) = 0; if (-1 == req->rpc_status) goto out; @@ -658,6 +660,40 @@ xml_output: cli_out ("Volume ID: %s", volume_id_str); cli_out ("Status: %s", cli_vol_status_str[status]); +#ifdef HAVE_BD_XLATOR + k = 0; + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.xlator%d", i, k); + ret = dict_get_str (dict, key, &caps); + if (ret) + goto next; + do { + j = 0; + cli_out ("Xlator %d: %s", k + 1, caps); + do { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), + "volume%d.xlator%d.caps%d", + i, k, j++); + ret = dict_get_str (dict, key, &caps); + if (ret) + break; + cli_out ("Capability %d: %s", j, caps); + } while (1); + + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), + "volume%d.xlator%d", i, ++k); + ret = dict_get_str (dict, key, &caps); + if (ret) + break; + } while (1); + +next: +#else + caps = 0; /* Avoid compiler warnings when BD not enabled */ +#endif + if (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) { cli_out ("Number of Bricks: %d x %d x %d = %d", (brick_count / dist_count), @@ -693,6 +729,12 @@ xml_output: goto out; cli_out ("Brick%d: %s", j, brick); +#ifdef HAVE_BD_XLATOR + snprintf (key, 256, "volume%d.vg%d", i, j); + ret = dict_get_str (dict, key, &caps); + if (!ret) + cli_out ("Brick%d VG: %s", j, caps); +#endif j++; } diff --git a/cli/src/cli-xml-output.c b/cli/src/cli-xml-output.c index cc021a34d..0f837fc74 100644 --- a/cli/src/cli-xml-output.c +++ b/cli/src/cli-xml-output.c @@ -2497,7 +2497,8 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict) char 
key[1024] = {0,}; int i = 0; int j = 1; - + char *caps = NULL; + int k __attribute__((unused)) = 0; ret = dict_get_int32 (dict, "count", &count); if (ret) @@ -2613,6 +2614,62 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict) "%d", transport); XML_RET_CHECK_AND_GOTO (ret, out); +#ifdef HAVE_BD_XLATOR + /* */ + ret = xmlTextWriterStartElement (local->writer, + (xmlChar *)"xlators"); + XML_RET_CHECK_AND_GOTO (ret, out); + + for (k = 0; ; k++) { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key),"volume%d.xlator%d", i, k); + ret = dict_get_str (dict, key, &caps); + if (ret) + break; + + /* */ + ret = xmlTextWriterStartElement (local->writer, + (xmlChar *)"xlator"); + XML_RET_CHECK_AND_GOTO (ret, out); + + ret = xmlTextWriterWriteFormatElement + (local->writer, (xmlChar *)"name", "%s", caps); + XML_RET_CHECK_AND_GOTO (ret, out); + + /* */ + ret = xmlTextWriterStartElement (local->writer, + (xmlChar *) + "capabilities"); + XML_RET_CHECK_AND_GOTO (ret, out); + + j = 0; + for (j = 0; ;j++) { + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), + "volume%d.xlator%d.caps%d", i, k, j); + ret = dict_get_str (dict, key, &caps); + if (ret) + break; + ret = xmlTextWriterWriteFormatElement + (local->writer, (xmlChar *)"capability", + "%s", caps); + XML_RET_CHECK_AND_GOTO (ret, out); + } + /* */ + ret = xmlTextWriterEndElement (local->writer); + XML_RET_CHECK_AND_GOTO (ret, out); + /* */ + ret = xmlTextWriterEndElement (local->writer); + XML_RET_CHECK_AND_GOTO (ret, out); + } + ret = xmlTextWriterFullEndElement (local->writer); + XML_RET_CHECK_AND_GOTO (ret, out); + /* */ +#else + caps = 0; /* Avoid compiler warnings when BD not enabled */ +#endif + j = 1; + /* */ ret = xmlTextWriterStartElement (local->writer, (xmlChar *)"bricks"); -- cgit