From 48c40e1a42efe1b59126406084821947d139dd0e Mon Sep 17 00:00:00 2001
From: "M. Mohan Kumar"
Date: Wed, 13 Nov 2013 22:44:42 +0530
Subject: bd: posix/multi-brick support to BD xlator

The current BD xlator (block backend) has a few limitations:

* Creation of directories is not supported
* Only a single brick is supported
* Extended attributes (and the client gfid) are not used, unlike the
  posix xlator
* Creation of special files (symbolic links, device nodes etc.) is not
  supported

The basic limitation of not allowing directory creation blocks
oVirt/VDSM from consuming the BD xlator as part of a Gluster domain,
since VDSM creates multi-level directories when GlusterFS is used as
the storage backend for storing VM images.

To overcome these limitations, a new BD xlator with the following
improvements is suggested:

* A new hybrid BD xlator that handles both regular files and block
  device files
* The volume has both POSIX and BD bricks. Regular files are created
  on POSIX bricks; block devices are created on the BD brick (VG)
* The BD xlator leverages the existing POSIX xlator for most POSIX
  calls and hence sits above the POSIX xlator
* A block device file is differentiated from a regular file by an
  extended attribute
* The xattr 'user.glusterfs.bd' (BD_XATTR) maps a posix file to a
  Logical Volume (LV)
* When a client sends a request to set BD_XATTR on a posix file, a new
  LV is created and mapped to that posix file. So every block device
  has a representative file in the POSIX brick with 'user.glusterfs.bd'
  (BD_XATTR) set
* Thereafter, all operations on this file result in LV operations. For
  example, opening a file that has BD_XATTR set opens the LV block
  device, and reading the file reads the corresponding LV block device

When the BD xlator gets a request to set BD_XATTR via a setxattr call,
it creates an LV and places information about this LV in the xattr of
the posix file. The xattr "user.glusterfs.bd" thus identifies a posix
file that is mapped to a BD.

Usage:
Server side:
[root@host1 ~]# gluster volume create bdvol host1:/storage/vg1_info?vg1 host2:/storage/vg2_info?vg2
This creates a distributed gluster volume 'bdvol' with Volume Group vg1
using posix brick /storage/vg1_info in host1 and Volume Group vg2 using
/storage/vg2_info in host2.
[root@host1 ~]# gluster volume start bdvol

Client side:
[root@node ~]# mount -t glusterfs host1:/bdvol /media
[root@node ~]# touch /media/posix
This creates a regular posix file 'posix' in either the host1:/vg1 or
the host2:/vg2 brick.
[root@node ~]# mkdir /media/image
[root@node ~]# touch /media/image/lv1
This also creates a regular posix file 'lv1' in either the host1:/vg1
or the host2:/vg2 brick.
[root@node ~]# setfattr -n "user.glusterfs.bd" -v "lv" /media/image/lv1
[root@node ~]#
The setxattr above creates a new LV in the corresponding brick's VG and
sets 'user.glusterfs.bd' on the file to record the mapping.
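The changelog below also adds thin provisioning. By analogy with the
'lv' example above, a thin LV would be requested as follows (a sketch;
per the validation code in this patch, the brick's VG must already
contain a thin pool):

[root@node ~]# touch /media/image/thin-lv1
[root@node ~]# setfattr -n "user.glusterfs.bd" -v "thin" /media/image/thin-lv1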
Changes from previous version V5:
* Removed support for delayed deleting of LVs

Changes from previous version V4:
* Consolidated the patches
* Removed usage of BD_XATTR_SIZE and consolidated it in BD_XATTR

Changes from previous version V3:
* Added support in FUSE to support full/linked clone
* Added support to merge snapshots and provide information about origin
* bd_map xlator removed
* iatt structure used in inode_ctx. iatt is cached and updated during
  fsync/flush
* aio support
* Type and capabilities of volume are exported through getxattr

Changes from version 2:
* Used inode_context for caching BD size and to check whether a loc/fd
  is BD or not
* Added GlusterFS server offloaded copy and snapshot through setfattr
  FOP. As part of this, libgfapi is modified
* BD xlator supports stripe
* During unlinking, if an LV file is already opened, it is added to a
  delete list and bd_del_thread tries to delete from this list when the
  last reference to that file is closed

Changes from previous version:
* gfid is used as the name of the LV
* ? is used to specify the VG name when creating a BD volume in volume
  create and add-brick: gluster volume create volname host:/path?vg
* open-behind issue is fixed
* A replicate brick can be added dynamically and LVs from the source
  brick are replicated to the destination brick
* A distribute brick can be added dynamically and the rebalance
  operation distributes existing LVs/files to the new brick
* Thin provisioning support added
* bd_map xlator support retained
* setfattr -n user.glusterfs.bd -v "lv" creates a regular LV and
  setfattr -n user.glusterfs.bd -v "thin" creates a thin LV
* Capability and backend information added to gluster volume info (and
  --xml) so that management tools can exploit the BD xlator
* tracing support for the bd xlator added

TODO:
* Add support to display snapshots for a given LV
* Display the posix filename for list-origin instead of the gfid

Change-Id: I00d32dfbab3b7c806e0841515c86c3aa519332f2
BUG: 1028672
Signed-off-by: M. Mohan Kumar
Reviewed-on: http://review.gluster.org/4809
Tested-by: Gluster Build System
Reviewed-by: Anand Avati
---
 xlators/mgmt/glusterd/src/Makefile.am           |   3 +
 xlators/mgmt/glusterd/src/glusterd-brick-ops.c  |  34 +++++-
 xlators/mgmt/glusterd/src/glusterd-handler.c    |  47 ++++++++
 xlators/mgmt/glusterd/src/glusterd-op-sm.h      |   3 +
 xlators/mgmt/glusterd/src/glusterd-store.c      |  20 ++++
 xlators/mgmt/glusterd/src/glusterd-store.h      |   2 +
 xlators/mgmt/glusterd/src/glusterd-utils.c      |  81 +++++++++++++
 xlators/mgmt/glusterd/src/glusterd-volgen.c     |  21 ++++
 xlators/mgmt/glusterd/src/glusterd-volgen.h     |   1 +
 xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 149 ++++++++++++++++++++++++
 xlators/mgmt/glusterd/src/glusterd.h            |   7 ++
 11 files changed, 367 insertions(+), 1 deletion(-)
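A pattern that recurs throughout the hunks below: each of the
volume-create, add-brick and volume-start paths computes the volume's
capabilities by intersecting per-brick capabilities. A minimal
standalone sketch of that logic (a hypothetical helper written for
illustration, not code from this patch):

#include <stdio.h>

#define CAPS_BD   0x00000001  /* brick is backed by a VG */
#define CAPS_THIN 0x00000010  /* that VG contains a thin pool */

/* Start optimistic, then intersect with each brick's caps: a brick
 * without thin support clears CAPS_THIN for the whole volume, and a
 * plain posix brick (caps == 0) clears everything. */
static int
volume_caps (const int *brick_caps, int nbricks)
{
        int caps = CAPS_BD | CAPS_THIN;
        int i;

        for (i = 0; i < nbricks; i++)
                caps &= brick_caps[i];
        return caps;
}

int
main (void)
{
        int bricks[] = { CAPS_BD | CAPS_THIN, CAPS_BD };

        /* prints caps = 0x1: thin is disabled volume-wide */
        printf ("caps = 0x%x\n", volume_caps (bricks, 2));
        return 0;
}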
diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am
index 17767d7ca..a6f49ae01 100644
--- a/xlators/mgmt/glusterd/src/Makefile.am
+++ b/xlators/mgmt/glusterd/src/Makefile.am
@@ -2,6 +2,9 @@ xlator_LTLIBRARIES = glusterd.la
 xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/mgmt
 glusterd_la_CPPFLAGS = $(AM_CPPFLAGS) "-DFILTERDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/filter\""
 glusterd_la_LDFLAGS = -module -avoid-version
+if ENABLE_BD_XLATOR
+glusterd_la_LDFLAGS += -llvm2app
+endif
 glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \
 	glusterd-op-sm.c glusterd-utils.c glusterd-rpc-ops.c \
 	glusterd-store.c glusterd-handshake.c glusterd-pmap.c \
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
index e24edb2d5..cc4253535 100644
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -1025,6 +1025,8 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
         glusterd_brickinfo_t *brickinfo = NULL;
         glusterd_gsync_status_temp_t param = {0, };
         gf_boolean_t restart_needed = 0;
+        char msg[1024] __attribute__((unused)) = {0, };
+        int caps = 0;
 
         GF_ASSERT (volinfo);
@@ -1105,12 +1107,30 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
         if (count)
                 brick = strtok_r (brick_list+1, " \n", &saveptr);
 
+#ifdef HAVE_BD_XLATOR
+        if (brickinfo->vg[0])
+                caps = CAPS_BD | CAPS_THIN;
+#endif
         while (i <= count) {
                 ret = glusterd_volume_brickinfo_get_by_brick (brick, volinfo,
                                                               &brickinfo);
                 if (ret)
                         goto out;
+#ifdef HAVE_BD_XLATOR
+                /* Check for a VG/thin pool if it is a BD volume */
+                if (brickinfo->vg[0]) {
+                        ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+                        if (ret) {
+                                gf_log (THIS->name, GF_LOG_CRITICAL, "%s", msg);
+                                goto out;
+                        }
+                        /* if any one of the bricks does not have thin
+                           support, disable it for the entire volume */
+                        caps &= brickinfo->caps;
+                } else
+                        caps = 0;
+#endif
 
                 if (uuid_is_null (brickinfo->uuid)) {
                         ret = glusterd_resolve_brick (brickinfo);
@@ -1147,7 +1167,7 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
                 dict_foreach (volinfo->gsync_slaves,
                               _glusterd_restart_gsync_session, &param);
         }
-
+        volinfo->caps = caps;
 out:
         GF_FREE (free_ptr1);
         GF_FREE (free_ptr2);
@@ -1321,6 +1341,18 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
         }
 
         if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+#ifdef HAVE_BD_XLATOR
+                if (brickinfo->vg[0]) {
+                        ret = glusterd_is_valid_vg (brickinfo, 1, msg);
+                        if (ret) {
+                                gf_log (THIS->name, GF_LOG_ERROR, "%s",
+                                        msg);
+                                *op_errstr = gf_strdup (msg);
+                                goto out;
+                        }
+                }
+#endif
+
                 ret = glusterd_validate_and_create_brickpath (brickinfo,
                                                 volinfo->volume_id,
                                                 op_errstr, is_force);
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index e545fc212..181b8fcf1 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -50,6 +50,10 @@
 #include "globals.h"
 #include "glusterd-syncop.h"
 
+#ifdef HAVE_BD_XLATOR
+#include <lvm2app.h>
+#endif
+
 int
 glusterd_big_locked_notify (struct rpc_clnt *rpc, void *mydata,
                             rpc_clnt_event_t event, void *data,
                             rpc_clnt_notify_t notify_fn)
@@ -395,6 +399,39 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
         if (ret)
                 goto out;
 
+#ifdef HAVE_BD_XLATOR
+        if (volinfo->caps) {
+                snprintf (key, 256, "volume%d.xlator0", count);
+                buf = GF_MALLOC (256, gf_common_mt_char);
+                if (!buf) {
+                        ret = ENOMEM;
+                        goto out;
+                }
+                if (volinfo->caps & CAPS_BD)
+                        snprintf (buf, 256, "BD");
+                ret = dict_set_dynstr (volumes, key, buf);
+                if (ret) {
+                        GF_FREE (buf);
+                        goto out;
+                }
+
+                if (volinfo->caps & CAPS_THIN) {
+                        snprintf (key, 256, "volume%d.xlator0.caps0", count);
+                        buf = GF_MALLOC (256, gf_common_mt_char);
+                        if (!buf) {
+                                ret = ENOMEM;
+                                goto out;
+                        }
+                        snprintf (buf, 256, "thin");
+                        ret = dict_set_dynstr (volumes, key, buf);
+                        if (ret) {
+                                GF_FREE (buf);
+                                goto out;
+                        }
+                }
+        }
+#endif
+
         list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
                 char brick[1024] = {0,};
                 char brick_uuid[64] = {0,};
@@ -414,6 +451,16 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
                 if (ret)
                         goto out;
 
+#ifdef HAVE_BD_XLATOR
+                if (volinfo->caps & CAPS_BD) {
+                        snprintf (key, 256, "volume%d.vg%d", count, i);
+                        snprintf (brick, 1024, "%s", brickinfo->vg);
+                        buf = gf_strdup (brick);
+                        ret = dict_set_dynstr (volumes, key, buf);
+                        if (ret)
+                                goto out;
+                }
+#endif
                 i++;
         }
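These volume%d.xlator0, volume%d.xlator0.caps0 and volume%d.vg%d dict
keys are consumed by the CLI when it renders volume information. A
hedged sketch of what "gluster volume info" might print for a one-brick
BD volume (the exact field labels live on the CLI side, which is not
part of this hunk):

[root@host1 ~]# gluster volume info bdvol
Volume Name: bdvol
Type: Distribute
Xlator 1: BD
Capability 1: thin
Brick1: host1:/storage/vg1_info
Brick1 VG: vg1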
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.h b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
index 5fe05069d..498c869a0 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.h
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
@@ -286,4 +286,7 @@ glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag);
 int
 glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict,
                                  dict_t *op_ctx);
+int
+glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg);
+
 #endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c
index 8b658ae30..5902589f4 100644
--- a/xlators/mgmt/glusterd/src/glusterd-store.c
+++ b/xlators/mgmt/glusterd/src/glusterd-store.c
@@ -241,6 +241,11 @@ glusterd_store_brickinfo_write (int fd, glusterd_brickinfo_t *brickinfo)
         if (ret)
                 goto out;
 
+        if (!brickinfo->vg[0])
+                goto out;
+
+        ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_VGNAME,
+                                   brickinfo->vg);
 out:
         gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
         return ret;
@@ -581,6 +586,13 @@ glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo)
                                     buf);
         if (ret)
                 goto out;
+        if (volinfo->caps) {
+                snprintf (buf, sizeof (buf), "%d", volinfo->caps);
+                ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_CAPS,
+                                           buf);
+                if (ret)
+                        goto out;
+        }
 
 out:
         if (ret)
@@ -1538,6 +1550,11 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
                         } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
                                     strlen (GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) {
                                 gf_string2int (value, &brickinfo->decommissioned);
+                        } else if (!strncmp (key,
+                                    GLUSTERD_STORE_KEY_BRICK_VGNAME,
+                                    strlen (GLUSTERD_STORE_KEY_BRICK_VGNAME))) {
+                                strncpy (brickinfo->vg, value,
+                                         sizeof (brickinfo->vg));
                         } else {
                                 gf_log ("", GF_LOG_ERROR, "Unknown key: %s",
                                         key);
@@ -1856,6 +1873,9 @@ glusterd_store_retrieve_volume (char *volname)
                 } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION,
                             strlen (GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION))) {
                         volinfo->client_op_version = atoi (value);
+                } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CAPS,
+                            strlen (GLUSTERD_STORE_KEY_VOL_CAPS))) {
+                        volinfo->caps = atoi (value);
                 } else {
 
                         if (is_key_glusterd_hooks_friendly (key)) {
diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h
index facb964fa..ce1f766b1 100644
--- a/xlators/mgmt/glusterd/src/glusterd-store.h
+++ b/xlators/mgmt/glusterd/src/glusterd-store.h
@@ -64,11 +64,13 @@ typedef enum glusterd_store_ver_ac_{
 #define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port"
 #define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port"
 #define GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED "decommissioned"
+#define GLUSTERD_STORE_KEY_BRICK_VGNAME "vg"
 
 #define GLUSTERD_STORE_KEY_PEER_UUID "uuid"
 #define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname"
 #define GLUSTERD_STORE_KEY_PEER_STATE "state"
 
+#define GLUSTERD_STORE_KEY_VOL_CAPS "caps"
 
 #define glusterd_for_each_entry(entry, dir) \
         do {\
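gf_store_save_value persists key=value lines in glusterd's on-disk
store under /var/lib/glusterd. Assuming a brick backed by VG vg1 in a
volume with both capabilities (CAPS_BD | CAPS_THIN = 0x11, stored via
"%d" as 17), the two new keys would appear roughly as follows (file
placement illustrative):

vg=vg1      <- brickinfo file, GLUSTERD_STORE_KEY_BRICK_VGNAME
caps=17     <- volume info file, GLUSTERD_STORE_KEY_VOL_CAPS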
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index d4f33f2ce..f0445cf0b 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -49,6 +49,11 @@
 #include
 #include
 #include
+#include
+#ifdef HAVE_BD_XLATOR
+#include <lvm2app.h>
+#endif
+
 
 #ifdef GF_LINUX_HOST_OS
 #include
@@ -622,6 +627,7 @@ glusterd_brickinfo_new_from_brick (char *brick,
         char *path = NULL;
         char *tmp_host = NULL;
         char *tmp_path = NULL;
+        char *vg = NULL;
 
         GF_ASSERT (brick);
         GF_ASSERT (brickinfo);
@@ -640,6 +646,17 @@ glusterd_brickinfo_new_from_brick (char *brick,
         if (ret)
                 goto out;
 
+#ifdef HAVE_BD_XLATOR
+        vg = strchr (path, '?'); /* ? is used as a delimiter for vg */
+        if (vg) {
+                strncpy (new_brickinfo->vg, vg + 1, PATH_MAX - 1);
+                *vg = '\0';
+        }
+        new_brickinfo->caps = CAPS_BD;
+#else
+        vg = NULL; /* Avoid compiler warnings when BD is not enabled */
+#endif
         ret = gf_canonicalize_path (path);
         if (ret)
                 goto out;
@@ -743,6 +760,62 @@ out:
         return available;
 }
 
+#ifdef HAVE_BD_XLATOR
+/*
+ * Sets a tag of the form "trusted.glusterfs.volume-id:<volume-id>" on
+ * the brick VG. It is used to avoid using the same VG for another brick.
+ * @volume-id - gfid, @brick - brick info, @msg - error message returned
+ * to the caller
+ */
+int
+glusterd_bd_set_vg_tag (unsigned char *volume_id, glusterd_brickinfo_t *brick,
+                        char *msg, int msg_size)
+{
+        lvm_t handle = NULL;
+        vg_t vg = NULL;
+        char *uuid = NULL;
+        int ret = -1;
+
+        gf_asprintf (&uuid, "%s:%s", GF_XATTR_VOL_ID_KEY,
+                     uuid_utoa (volume_id));
+        if (!uuid) {
+                snprintf (msg, msg_size, "Could not allocate memory "
+                          "for tag");
+                return -1;
+        }
+
+        handle = lvm_init (NULL);
+        if (!handle) {
+                snprintf (msg, msg_size, "lvm_init failed");
+                goto out;
+        }
+
+        vg = lvm_vg_open (handle, brick->vg, "w", 0);
+        if (!vg) {
+                snprintf (msg, msg_size, "Could not open VG %s",
+                          brick->vg);
+                goto out;
+        }
+
+        if (lvm_vg_add_tag (vg, uuid) < 0) {
+                snprintf (msg, msg_size, "Could not set tag %s for "
+                          "VG %s", uuid, brick->vg);
+                goto out;
+        }
+        lvm_vg_write (vg);
+        ret = 0;
+out:
+        GF_FREE (uuid);
+
+        if (vg)
+                lvm_vg_close (vg);
+        if (handle)
+                lvm_quit (handle);
+
+        return ret;
+}
+#endif
+
 int
 glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo,
                                         uuid_t volume_id, char **op_errstr,
@@ -825,6 +898,14 @@ glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo,
                 }
         }
 
+#ifdef HAVE_BD_XLATOR
+        if (brickinfo->vg[0]) {
+                ret = glusterd_bd_set_vg_tag (volume_id, brickinfo, msg,
+                                              sizeof(msg));
+                if (ret)
+                        goto out;
+        }
+#endif
         ret = glusterd_check_and_set_brick_xattr (brickinfo->hostname,
                                                   brickinfo->path, volume_id,
                                                   op_errstr, is_force);
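Once glusterd_bd_set_vg_tag has run, the tag is visible with the stock
LVM tools; a quick way to verify on the brick host (plain lvm2 CLI, not
part of this patch; output shape approximate):

[root@host1 ~]# vgs -o vg_name,vg_tags vg1
  VG   VG Tags
  vg1  trusted.glusterfs.volume-id:<volume-uuid>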
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 6bf14bc3d..51fba4da3 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -594,6 +594,8 @@ get_server_xlator (char *xlator)
                 subvol = GF_XLATOR_MARKER;
         if (strcmp (xlator, "io-stats") == 0)
                 subvol = GF_XLATOR_IO_STATS;
+        if (strcmp (xlator, "bd") == 0)
+                subvol = GF_XLATOR_BD;
 
         return subvol;
 }
@@ -1456,7 +1458,26 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
                                        "posix");
         if (ret)
                 return -1;
 
+#ifdef HAVE_BD_XLATOR
+        if (*brickinfo->vg != '\0') {
+                /* Now add the BD v2 xlator if the volume is of BD type */
+                xl = volgen_graph_add (graph, "storage/bd", volname);
+                if (!xl)
+                        return -1;
+
+                ret = xlator_set_option (xl, "device", "vg");
+                if (ret)
+                        return -1;
+                ret = xlator_set_option (xl, "export", brickinfo->vg);
+                if (ret)
+                        return -1;
+
+                ret = check_and_add_debug_xl (graph, set_dict, volname, "bd");
+                if (ret)
+                        return -1;
+        }
+#endif
+
         xl = volgen_graph_add (graph, "features/changelog", volname);
         if (!xl)
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h
index 0c1a76c12..31bfe980d 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.h
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h
@@ -75,6 +75,7 @@ typedef enum {
         GF_XLATOR_INDEX,
         GF_XLATOR_MARKER,
         GF_XLATOR_IO_STATS,
+        GF_XLATOR_BD,
         GF_XLATOR_NONE,
 } glusterd_server_xlator_t;
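For a BD brick on VG vg1 in volume bdvol, the server_graph_builder
change above would emit a brick-volfile stanza along these lines
(volume names follow volgen's conventions and are illustrative here):

volume bdvol-bd
    type storage/bd
    option device vg
    option export vg1
    subvolumes bdvol-posix
end-volume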
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index 034004dbd..a2bd7334c 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -12,6 +12,10 @@
 #include "config.h"
 #endif
 
+#ifdef HAVE_BD_XLATOR
+#include <lvm2app.h>
+#endif
+
 #include "common-utils.h"
 #include "syscall.h"
 #include "cli1-xdr.h"
@@ -26,6 +30,7 @@
 #define glusterd_op_start_volume_args_get(dict, volname, flags) \
         glusterd_op_stop_volume_args_get (dict, volname, flags)
 
+
 int
 __glusterd_handle_create_volume (rpcsvc_request_t *req)
 {
@@ -599,6 +604,101 @@ glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
                                             __glusterd_handle_cli_statedump_volume);
 }
 
+#ifdef HAVE_BD_XLATOR
+/*
+ * Validates whether the given VG in the brick exists. Also checks
+ * whether the VG has the GF_XATTR_VOL_ID_KEY tag set, to avoid using
+ * the same VG for multiple bricks. The tag is checked only during
+ * glusterd_op_stage_create_volume; it is set in
+ * glusterd_validate_and_create_brickpath().
+ * @brick - brick info, @check_tag - whether to check for the VG tag,
+ * @msg - error message returned to the caller
+ */
+int
+glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg)
+{
+        lvm_t handle = NULL;
+        vg_t vg = NULL;
+        char *vg_name = NULL;
+        int retval = 0;
+        char *p = NULL;
+        char *ptr = NULL;
+        struct dm_list *dm_lvlist = NULL;
+        struct dm_list *dm_seglist = NULL;
+        struct lvm_lv_list *lv_list = NULL;
+        struct lvm_property_value prop = {0, };
+        struct lvm_lvseg_list *seglist = NULL;
+        struct dm_list *taglist = NULL;
+        struct lvm_str_list *strl = NULL;
+
+        handle = lvm_init (NULL);
+        if (!handle) {
+                sprintf (msg, "lvm_init failed, could not validate vg");
+                return -1;
+        }
+        if (*brick->vg == '\0') { /* BD xlator has vg in brick->path */
+                p = gf_strdup (brick->path);
+                vg_name = strtok_r (p, "/", &ptr);
+        } else
+                vg_name = brick->vg;
+
+        vg = lvm_vg_open (handle, vg_name, "r", 0);
+        if (!vg) {
+                sprintf (msg, "no such vg: %s", vg_name);
+                retval = -1;
+                goto out;
+        }
+        if (!check_tag)
+                goto next;
+
+        taglist = lvm_vg_get_tags (vg);
+        if (!taglist)
+                goto next;
+
+        dm_list_iterate_items (strl, taglist) {
+                if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY,
+                              strlen (GF_XATTR_VOL_ID_KEY))) {
+                        sprintf (msg, "VG %s is already part of"
+                                 " a brick", vg_name);
+                        retval = -1;
+                        goto out;
+                }
+        }
+next:
+        brick->caps = CAPS_BD;
+
+        dm_lvlist = lvm_vg_list_lvs (vg);
+        if (!dm_lvlist)
+                goto out;
+
+        dm_list_iterate_items (lv_list, dm_lvlist) {
+                dm_seglist = lvm_lv_list_lvsegs (lv_list->lv);
+                dm_list_iterate_items (seglist, dm_seglist) {
+                        prop = lvm_lvseg_get_property (seglist->lvseg,
+                                                       "segtype");
+                        if (!prop.is_valid || !prop.value.string)
+                                continue;
+                        if (!strcmp (prop.value.string, "thin-pool")) {
+                                brick->caps |= CAPS_THIN;
+                                gf_log (THIS->name, GF_LOG_INFO, "Thin Pool "
+                                        "\"%s\" will be used for thin LVs",
+                                        lvm_lv_get_name (lv_list->lv));
+                                break;
+                        }
+                }
+        }
+
+        retval = 0;
+out:
+        if (vg)
+                lvm_vg_close (vg);
+        lvm_quit (handle);
+        if (p)
+                GF_FREE (p);
+        return retval;
+}
+#endif
+
 /* op-sm */
 int
 glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
@@ -712,6 +812,11 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
         }
 
         if (!uuid_compare (brick_info->uuid, MY_UUID)) {
+                if (brick_info->vg[0]) {
+                        ret = glusterd_is_valid_vg (brick_info, 1, msg);
+                        if (ret)
+                                goto out;
+                }
 
                 ret = glusterd_validate_and_create_brickpath (brick_info,
                                                 volume_uuid, op_errstr,
                                                 is_force);
@@ -809,6 +914,7 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
         uuid_t volume_id = {0,};
         char volid[50] = {0,};
         char xattr_volid[50] = {0,};
+        int caps = 0;
 
         this = THIS;
         GF_ASSERT (this);
@@ -847,6 +953,7 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
                 }
         }
 
+
         list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
                 ret = glusterd_resolve_brick (brickinfo);
                 if (ret) {
@@ -902,8 +1009,24 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
                         ret = -1;
                         goto out;
                 }
+#ifdef HAVE_BD_XLATOR
+                if (brickinfo->vg[0])
+                        caps = CAPS_BD | CAPS_THIN;
+
+                /* Check for a VG/thin pool if it is a BD volume */
+                if (brickinfo->vg[0]) {
+                        ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+                        if (ret)
+                                goto out;
+                        /* if any one of the bricks does not have thin
+                           support, disable it for the entire volume */
+                        caps &= brickinfo->caps;
+                } else
+                        caps = 0;
+#endif
         }
 
+        volinfo->caps = caps;
         ret = 0;
 out:
         if (ret && (msg[0] != '\0')) {
@@ -1315,6 +1438,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
         char *str = NULL;
         char *username = NULL;
         char *password = NULL;
+        int caps = 0;
+        char msg[1024] __attribute__((unused)) = {0, };
 
         this = THIS;
         GF_ASSERT (this);
@@ -1477,6 +1602,7 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
         if (count)
                 brick = strtok_r (brick_list+1, " \n", &saveptr);
 
+        caps = CAPS_BD | CAPS_THIN;
         while ( i <= count) {
                 ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
@@ -1489,6 +1615,27 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
                                 brickinfo->hostname, brickinfo->path);
                         goto out;
                 }
+
+#ifdef HAVE_BD_XLATOR
+                if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+                        if (brickinfo->vg[0]) {
+                                ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+                                if (ret) {
+                                        gf_log (this->name, GF_LOG_ERROR, "%s",
+                                                msg);
+                                        goto out;
+                                }
+
+                                /* if any one of the bricks does not have thin
+                                   support, disable it for the entire volume */
+                                caps &= brickinfo->caps;
+
+                        } else
+                                caps = 0;
+                }
+#endif
+
                 list_add_tail (&brickinfo->brick_list, &volinfo->bricks);
                 brick = strtok_r (NULL, " \n", &saveptr);
                 i++;
@@ -1496,6 +1643,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
 
         gd_update_volume_op_versions (volinfo);
 
+        volinfo->caps = caps;
+
         ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
         if (ret) {
                 glusterd_store_delete_volume (volinfo);
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index 8d8f8d4e0..6423d5a81 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -176,6 +176,8 @@ struct glusterd_brickinfo {
         gf_brick_status_t status;
         struct rpc_clnt *rpc;
         int decommissioned;
+        char vg[PATH_MAX]; /* FIXME: Use max size for length of vg */
+        int caps; /* Capability */
 };
 
 typedef struct glusterd_brickinfo glusterd_brickinfo_t;
@@ -231,6 +233,10 @@ struct _auth {
 
 typedef struct _auth auth_t;
 
+/* Capabilities of the xlator */
+#define CAPS_BD   0x00000001
+#define CAPS_THIN 0x00000010
+
 struct glusterd_rebalance_ {
         gf_defrag_status_t defrag_status;
         uint64_t rebalance_files;
@@ -300,6 +306,7 @@ struct glusterd_volinfo_ {
         xlator_t *xl;
 
         gf_boolean_t memory_accounting;
+        int caps; /* Capability */
 
         int op_version;
         int client_op_version;
-- 
cgit