diff options
author | Xavier Hernandez <xhernandez@datalab.es> | 2014-05-15 10:35:14 +0200 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-07-11 10:34:24 -0700 |
commit | 1392da3e237d8ea080573909015916e3544a6d2c (patch) | |
tree | 89f7f37e65b5d526c18e043cc7dbb51c9e19a50e | |
parent | ad112305a1c7452b13c92238b40ded80361838f3 (diff) |
cli/glusterd: Added support for dispersed volumes
Two new options have been added to the 'create' command of the cli
interface:
disperse [<count>] redundancy <count>
Both are optional. A dispersed volume is created by specifying at
least one of them. If 'disperse' is missing, or it is present but
'<count>' is not, the number of bricks enumerated in the command
line is taken as the disperse count.
If 'redundancy' is missing, the lowest optimal value is assumed. A
configuration is considered optimal (for most workloads) when the
disperse count - redundancy count is a power of 2. If the resulting
redundancy is 1, the volume is created normally, but if it's greater
than 1, a warning is shown to the user and he/she must answer yes/no
to continue volume creation. If there isn't any optimal value for
the given number of bricks, a warning is also shown and, if the user
accepts, a redundancy of 1 is used.
If 'redundancy' is specified and the resulting volume is not optimal,
another warning is shown to the user.
A distributed-disperse volume can be created using a number of bricks
that is a multiple of the disperse count.
Change-Id: Iab93efbe78e905cdb91f54f3741599f7ea6645e4
BUG: 1118629
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/7782
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
24 files changed, 1054 insertions, 37 deletions
diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c index 1a39be8d121..4a00b8485d3 100644 --- a/cli/src/cli-cmd-parser.c +++ b/cli/src/cli-cmd-parser.c @@ -177,7 +177,86 @@ out: } int32_t -cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options) +cli_cmd_create_disperse_check(struct cli_state * state, int * disperse, + int * redundancy, int count) +{ + int i = 0; + int tmp = 0; + gf_answer_t answer = GF_ANSWER_NO; + char question[128]; + + const char * question1 = "There isn't an optimal redundancy value " + "for this configuration. Do you want to " + "create the volume with redundancy 1 ?"; + + const char * question2 = "The optimal redundancy for this " + "configuration is %d. Do you want to create " + "the volume with this value ?"; + + const char * question3 = "This configuration is not optimal on most " + "workloads. Do you want to use it ?"; + + if (*disperse <= 0) { + if (count < 3) { + cli_err ("number of bricks must be greater " + "than 2"); + + return -1; + } + *disperse = count; + } + + if (*redundancy == 0) { + tmp = *disperse - 1; + for (i = tmp / 2; + (i > 0) && ((tmp & -tmp) != tmp); + i--, tmp--); + + if (i == 0) { + answer = cli_cmd_get_confirmation(state, question1); + if (answer == GF_ANSWER_NO) + return -1; + + *redundancy = 1; + } + else + { + *redundancy = *disperse - tmp; + if (*redundancy > 1) { + sprintf(question, question2, *redundancy); + answer = cli_cmd_get_confirmation(state, + question); + if (answer == GF_ANSWER_NO) + return -1; + } + } + + tmp = 0; + } + else { + tmp = *disperse - *redundancy; + } + + if (*redundancy > (*disperse - 1) / 2) { + cli_err ("redundancy must be less than %d for a " + "disperse %d volume", + (*disperse + 1) / 2, *disperse); + + return -1; + } + + if ((tmp & -tmp) != tmp) { + answer = cli_cmd_get_confirmation(state, question3); + if (answer == GF_ANSWER_NO) + return -1; + } + + return 0; +} + +int32_t +cli_cmd_volume_create_parse (struct cli_state *state, const char 
**words, + int wordcount, dict_t **options) { dict_t *dict = NULL; char *volname = NULL; @@ -191,7 +270,8 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options int32_t index = 0; char *bricks = NULL; int32_t brick_count = 0; - char *opwords[] = { "replica", "stripe", "transport", NULL }; + char *opwords[] = { "replica", "stripe", "transport", "disperse", + "redundancy", NULL }; char *invalid_volnames[] = {"volume", "type", "subvolumes", "option", "end-volume", "all", "volume_not_in_ring", @@ -200,9 +280,12 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options "snap-max-soft-limit", "auto-delete", NULL}; char *w = NULL; + char *ptr = NULL; int op_count = 0; int32_t replica_count = 1; int32_t stripe_count = 1; + int32_t disperse_count = -1; + int32_t redundancy_count = 0; gf_boolean_t is_force = _gf_false; int wc = wordcount; @@ -279,6 +362,10 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options case GF_CLUSTER_TYPE_STRIPE: type = GF_CLUSTER_TYPE_STRIPE_REPLICATE; break; + case GF_CLUSTER_TYPE_DISPERSE: + cli_err ("replicated-dispersed volume is not " + "supported"); + goto out; } if (wordcount < (index+2)) { @@ -310,6 +397,10 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options case GF_CLUSTER_TYPE_REPLICATE: type = GF_CLUSTER_TYPE_STRIPE_REPLICATE; break; + case GF_CLUSTER_TYPE_DISPERSE: + cli_err ("striped-dispersed volume is not " + "supported"); + goto out; } if (wordcount < (index + 2)) { ret = -1; @@ -348,6 +439,90 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options goto out; } index += 2; + + } else if ((strcmp (w, "disperse")) == 0) { + switch (type) { + case GF_CLUSTER_TYPE_DISPERSE: + if (disperse_count >= 0) { + cli_err ("disperse option given " + "twice"); + goto out; + } + break; + case GF_CLUSTER_TYPE_NONE: + type = GF_CLUSTER_TYPE_DISPERSE; + break; + case GF_CLUSTER_TYPE_STRIPE_REPLICATE: + cli_err 
("striped-replicated-dispersed volume " + "is not supported"); + goto out; + case GF_CLUSTER_TYPE_STRIPE: + cli_err ("striped-dispersed volume is not " + "supported"); + goto out; + case GF_CLUSTER_TYPE_REPLICATE: + cli_err ("replicated-dispersed volume is not " + "supported"); + goto out; + } + + if (wordcount >= (index+2)) { + disperse_count = strtol (words[index + 1], + &ptr, 0); + if (*ptr != 0) + disperse_count = 0; + else { + if (disperse_count < 3) { + cli_err ("disperse count must " + "be greater than 2"); + ret = -1; + goto out; + } + index++; + } + } + + index++; + + } else if ((strcmp (w, "redundancy")) == 0) { + switch (type) { + case GF_CLUSTER_TYPE_NONE: + type = GF_CLUSTER_TYPE_DISPERSE; + break; + case GF_CLUSTER_TYPE_DISPERSE: + if (redundancy_count > 0) { + cli_err ("redundancy option given " + "twice"); + goto out; + } + break; + case GF_CLUSTER_TYPE_STRIPE_REPLICATE: + cli_err ("striped-replicated-dispersed volume " + "is not supported"); + goto out; + case GF_CLUSTER_TYPE_STRIPE: + cli_err ("striped-dispersed volume is not " + "supported"); + goto out; + case GF_CLUSTER_TYPE_REPLICATE: + cli_err ("replicated-dispersed volume is not " + "supported"); + goto out; + } + + if (wordcount < (index+2)) { + ret = -1; + goto out; + } + redundancy_count = strtol (words[index+1], NULL, 0); + if (redundancy_count < 1) { + cli_err ("redundancy must be greater than 0"); + ret = -1; + goto out; + } + + index += 2; + } else { GF_ASSERT (!"opword mismatch"); ret = -1; @@ -359,8 +534,6 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options if (!trans_type) trans_type = gf_strdup ("tcp"); - sub_count = stripe_count * replica_count; - /* reset the count value now */ count = 1; @@ -389,6 +562,23 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options goto out; } + if (type == GF_CLUSTER_TYPE_DISPERSE) { + ret = cli_cmd_create_disperse_check(state, &disperse_count, + &redundancy_count, + brick_count); + if 
(!ret) + ret = dict_set_int32 (dict, "disperse-count", + disperse_count); + if (!ret) + ret = dict_set_int32 (dict, "redundancy-count", + redundancy_count); + if (ret) + goto out; + + sub_count = disperse_count; + } else + sub_count = stripe_count * replica_count; + if (brick_count % sub_count) { if (type == GF_CLUSTER_TYPE_STRIPE) cli_err ("number of bricks is not a multiple of " @@ -396,6 +586,9 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options else if (type == GF_CLUSTER_TYPE_REPLICATE) cli_err ("number of bricks is not a multiple of " "replica count"); + else if (type == GF_CLUSTER_TYPE_DISPERSE) + cli_err ("number of bricks is not a multiple of " + "disperse count"); else cli_err ("number of bricks given doesn't match " "required count"); @@ -404,7 +597,7 @@ cli_cmd_volume_create_parse (const char **words, int wordcount, dict_t **options goto out; } - /* Everything if parsed fine. start setting info in dict */ + /* Everything is parsed fine. start setting info in dict */ ret = dict_set_str (dict, "volname", volname); if (ret) goto out; diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c index b1b6c8275bc..43e696d56c5 100644 --- a/cli/src/cli-cmd-volume.c +++ b/cli/src/cli-cmd-volume.c @@ -362,7 +362,7 @@ cli_cmd_volume_create_cbk (struct cli_state *state, struct cli_cmd_word *word, if (!frame) goto out; - ret = cli_cmd_volume_create_parse (words, wordcount, &options); + ret = cli_cmd_volume_create_parse (state, words, wordcount, &options); if (ret) { cli_usage_out (word->pattern); @@ -376,32 +376,55 @@ cli_cmd_volume_create_cbk (struct cli_state *state, struct cli_cmd_word *word, goto out; } if ((type == GF_CLUSTER_TYPE_REPLICATE) || - (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE)) { - if ((ret = dict_get_str (options, "bricks", &brick_list)) != 0) { - gf_log ("cli", GF_LOG_ERROR, "Replica bricks check : " - "Could not retrieve bricks list"); + (type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) || + (type == 
GF_CLUSTER_TYPE_DISPERSE)) { + if ((ret = dict_get_str (options, "bricks", + &brick_list)) != 0) { + gf_log ("cli", GF_LOG_ERROR, "Bricks check : Could " + "not retrieve bricks " + "list"); goto out; } - if ((ret = dict_get_int32 (options, "count", &brick_count)) != 0) { - gf_log ("cli", GF_LOG_ERROR, "Replica bricks check : " - "Could not retrieve brick count"); + if ((ret = dict_get_int32 (options, "count", + &brick_count)) != 0) { + gf_log ("cli", GF_LOG_ERROR, "Bricks check : Could " + "not retrieve brick " + "count"); goto out; } - if ((ret = dict_get_int32 (options, "replica-count", &sub_count)) != 0) { - gf_log ("cli", GF_LOG_ERROR, "Replica bricks check : " - "Could not retrieve replica count"); - goto out; + + if (type != GF_CLUSTER_TYPE_DISPERSE) { + if ((ret = dict_get_int32 (options, "replica-count", + &sub_count)) != 0) { + gf_log ("cli", GF_LOG_ERROR, "Bricks check : " + "Could not retrieve " + "replica count"); + goto out; + } + gf_log ("cli", GF_LOG_INFO, "Replicate cluster type found." + " Checking brick order."); + } else { + ret = dict_get_int32 (options, "disperse-count", + &sub_count); + if (ret) { + gf_log ("cli", GF_LOG_ERROR, "Bricks check : " + "Could not retrieve " + "disperse count"); + goto out; + } + gf_log ("cli", GF_LOG_INFO, "Disperse cluster type found. " + "Checking brick order."); } - gf_log ("cli", GF_LOG_INFO, "Replicate cluster type found." 
- " Checking brick order."); - ret = cli_cmd_check_brick_order (state, brick_list, brick_count, sub_count); + ret = cli_cmd_check_brick_order (state, brick_list, + brick_count, sub_count); if (ret) { - gf_log("cli", GF_LOG_INFO, "Not creating volume because of bad brick order"); + gf_log("cli", GF_LOG_INFO, "Not creating volume " + "because of bad brick " + "order"); goto out; } } - ret = dict_get_str (options, "transport", &trans_type); if (ret) { gf_log("cli", GF_LOG_ERROR, "Unable to get transport type"); @@ -2328,6 +2351,7 @@ struct cli_cmd volume_cmds[] = { "list information of all volumes"}, { "volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] " + "[disperse [<COUNT>]] [redundancy <COUNT>] " "[transport <tcp|rdma|tcp,rdma>] <NEW-BRICK>" #ifdef HAVE_BD_XLATOR "?<vg_name>" diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index c077622c0f1..43db8358bcf 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -59,9 +59,11 @@ char *cli_vol_type_str[] = {"Distribute", "Stripe", "Replicate", "Striped-Replicate", + "Disperse", "Distributed-Stripe", "Distributed-Replicate", "Distributed-Striped-Replicate", + "Distributed-Disperse", }; char *cli_vol_status_str[] = {"Created", @@ -518,6 +520,8 @@ gf_cli_get_volume_cbk (struct rpc_req *req, struct iovec *iov, int32_t dist_count = 0; int32_t stripe_count = 0; int32_t replica_count = 0; + int32_t disperse_count = 0; + int32_t redundancy_count = 0; int32_t vol_type = 0; int32_t transport = 0; char *volume_id_str = NULL; @@ -671,6 +675,16 @@ xml_output: if (ret) goto out; + snprintf (key, 256, "volume%d.disperse_count", i); + ret = dict_get_int32 (dict, key, &disperse_count); + if (ret) + goto out; + + snprintf (key, 256, "volume%d.redundancy_count", i); + ret = dict_get_int32 (dict, key, &redundancy_count); + if (ret) + goto out; + snprintf (key, 256, "volume%d.transport", i); ret = dict_get_int32 (dict, key, &transport); if (ret) @@ -685,7 +699,7 @@ xml_output: // Distributed 
(stripe/replicate/stripe-replica) setups if ((type > 0) && ( dist_count < brick_count)) - vol_type = type + 3; + vol_type = type + 4; cli_out ("Volume Name: %s", volname); cli_out ("Type: %s", cli_vol_type_str[vol_type]); @@ -734,6 +748,11 @@ next: brick_count); } else if (type == GF_CLUSTER_TYPE_NONE) { cli_out ("Number of Bricks: %d", brick_count); + } else if (type == GF_CLUSTER_TYPE_DISPERSE) { + cli_out ("Number of Bricks: %d x (%d + %d) = %d", + (brick_count / dist_count), + disperse_count - redundancy_count, + redundancy_count, brick_count); } else { /* For both replicate and stripe, dist_count is good enough */ diff --git a/cli/src/cli-xml-output.c b/cli/src/cli-xml-output.c index b16c238f7fc..1bf4e874647 100644 --- a/cli/src/cli-xml-output.c +++ b/cli/src/cli-xml-output.c @@ -2528,6 +2528,8 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict) int dist_count = 0; int stripe_count = 0; int replica_count = 0; + int disperse_count = 0; + int redundancy_count = 0; int transport = 0; char *brick = NULL; char key[1024] = {0,}; @@ -2622,13 +2624,35 @@ cli_xml_output_vol_info (cli_local_t *local, dict_t *dict) XML_RET_CHECK_AND_GOTO (ret, out); memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.disperse_count", i); + ret = dict_get_int32 (dict, key, &disperse_count); + if (ret) + goto out; + ret = xmlTextWriterWriteFormatElement (local->writer, + (xmlChar *)"disperseCount", + "%d", disperse_count); + XML_RET_CHECK_AND_GOTO (ret, out); + + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "volume%d.redundancy_count", i); + ret = dict_get_int32 (dict, key, &redundancy_count); + if (ret) + goto out; + ret = xmlTextWriterWriteFormatElement (local->writer, + (xmlChar *)"redundancyCount", + "%d", redundancy_count); + XML_RET_CHECK_AND_GOTO (ret, out); + + memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "volume%d.type", i); ret = dict_get_int32 (dict, key, &type); if (ret) goto out; - /* For 
Distributed-(stripe,replicate,stipe-replicate) types */ + /* For Distributed-(stripe,replicate,stipe-replicate,disperse) + types + */ if ((type > 0) && (dist_count < brick_count)) - type += 3; + type += 4; ret = xmlTextWriterWriteFormatElement (local->writer, (xmlChar *)"type", "%d", type); diff --git a/cli/src/cli.h b/cli/src/cli.h index 69a7e82bf63..a1a78eca2bc 100644 --- a/cli/src/cli.h +++ b/cli/src/cli.h @@ -221,8 +221,8 @@ cli_submit_request (struct rpc_clnt *rpc, void *req, call_frame_t *frame, xlator_t *this, fop_cbk_fn_t cbkfn, xdrproc_t xdrproc); int32_t -cli_cmd_volume_create_parse (const char **words, int wordcount, - dict_t **options); +cli_cmd_volume_create_parse (struct cli_state *state, const char **words, + int wordcount, dict_t **options); int32_t cli_cmd_volume_reset_parse (const char **words, int wordcount, dict_t **opt); diff --git a/rpc/xdr/src/cli1-xdr.x b/rpc/xdr/src/cli1-xdr.x index 3c43e374d95..3a9841934cb 100644 --- a/rpc/xdr/src/cli1-xdr.x +++ b/rpc/xdr/src/cli1-xdr.x @@ -23,7 +23,8 @@ GF_CLUSTER_TYPE_NONE = 0, GF_CLUSTER_TYPE_STRIPE, GF_CLUSTER_TYPE_REPLICATE, - GF_CLUSTER_TYPE_STRIPE_REPLICATE + GF_CLUSTER_TYPE_STRIPE_REPLICATE, + GF_CLUSTER_TYPE_DISPERSE }; enum gf1_cli_replace_op { diff --git a/tests/basic/ec/ec-12-4.t b/tests/basic/ec/ec-12-4.t new file mode 100644 index 00000000000..9ab47018617 --- /dev/null +++ b/tests/basic/ec/ec-12-4.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=12 +REDUNDANCY=4 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=634 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-3-1.t b/tests/basic/ec/ec-3-1.t new file mode 100644 index 00000000000..5769c202289 --- /dev/null +++ b/tests/basic/ec/ec-3-1.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. 
$(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=3 +REDUNDANCY=1 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=238 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-4-1.t b/tests/basic/ec/ec-4-1.t new file mode 100644 index 00000000000..d34e1fb4e95 --- /dev/null +++ b/tests/basic/ec/ec-4-1.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=4 +REDUNDANCY=1 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=282 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-5-1.t b/tests/basic/ec/ec-5-1.t new file mode 100644 index 00000000000..61d1cb6ce48 --- /dev/null +++ b/tests/basic/ec/ec-5-1.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=5 +REDUNDANCY=1 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=326 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-5-2.t b/tests/basic/ec/ec-5-2.t new file mode 100644 index 00000000000..4dc1c186f02 --- /dev/null +++ b/tests/basic/ec/ec-5-2.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=5 +REDUNDANCY=2 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=326 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-6-2.t b/tests/basic/ec/ec-6-2.t new file mode 100644 index 00000000000..23ec84e60e9 --- /dev/null +++ b/tests/basic/ec/ec-6-2.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. 
$(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=6 +REDUNDANCY=2 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=370 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-7-3.t b/tests/basic/ec/ec-7-3.t new file mode 100644 index 00000000000..4ebba2a1de3 --- /dev/null +++ b/tests/basic/ec/ec-7-3.t @@ -0,0 +1,14 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks basic dispersed volume functionality and cli interface + +DISPERSE=7 +REDUNDANCY=3 + +# This must be equal to 44 * $DISPERSE + 106 +TESTS_EXPECTED_IN_LOOP=414 + +. $(dirname $0)/ec-common diff --git a/tests/basic/ec/ec-common b/tests/basic/ec/ec-common new file mode 100644 index 00000000000..95f53f250bc --- /dev/null +++ b/tests/basic/ec/ec-common @@ -0,0 +1,143 @@ + +SIZE_LIST="1048576 1000 12345 0" + +LAST_BRICK=$(($DISPERSE - 1)) + +function fragment_size +{ + local fragments=$(($DISPERSE - $REDUNDANCY)) + local block_size=$((128 * $fragments)) + local size=$(($1 + $block_size - 1)) + + echo $((( $size - ( $size ) % $block_size ) / $fragments)) +} + +cleanup + +tmp=`mktemp -d` +if [ ! 
-d $tmp ]; then + exit 1 +fi + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy $REDUNDANCY $H0:$B0/${V0}{0..$LAST_BRICK} +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +TEST dd if=/dev/urandom of=$tmp/small bs=1024 count=1 +TEST dd if=/dev/urandom of=$tmp/big bs=1024 count=4096 + +cs_small=$(sha1sum $tmp/small | awk '{ print $1 }') +cs_big=$(sha1sum $tmp/big | awk '{ print $1 }') +cp $tmp/small $tmp/small1 +for size in $SIZE_LIST; do + truncate -s $size $tmp/small1 + eval cs_small_truncate[$size]=$(sha1sum $tmp/small1 | awk '{ print $1 }') +done +cp $tmp/big $tmp/big1 +for size in $SIZE_LIST; do + truncate -s $size $tmp/big1 + eval cs_big_truncate[$size]=$(sha1sum $tmp/big1 | awk '{ print $1 }') +done + +TEST df -h +TEST stat $M0 + +for idx in `seq 0 $LAST_BRICK`; do + brick[$idx]=$(gf_get_gfid_backend_file_path $B0/$V0$idx) +done + +cd $M0 +EXPECT "2" echo $(ls -a1 | wc -l) +TEST mkdir dir1 +TEST [ -d dir1 ] +TEST touch file1 +TEST [ -f file1 ] + +for dir in . 
dir1; do + TEST cp $tmp/small $dir/small + TEST [ -f $dir/small ] + fsize=$(fragment_size 1024) + EXPECT "1024" stat -c "%s" $dir/small + for idx in `seq 0 $LAST_BRICK`; do + EXPECT "$fsize" stat -c "%s" ${brick[$idx]}/$dir/small + done + + EXPECT "$cs_small" echo $(sha1sum $dir/small | awk '{ print $1 }') + + TEST cp $tmp/big $dir/big + TEST [ -f $dir/big ] + fsize=$(fragment_size 4194304) + EXPECT "4194304" stat -c "%s" $dir/big + for idx in `seq 0 $LAST_BRICK`; do + EXPECT "$fsize" stat -c "%s" ${brick[$idx]}/$dir/big + done + + EXPECT "$cs_big" echo $(sha1sum $dir/big | awk '{ print $1 }') + + for idx in `seq 0 $LAST_BRICK`; do + TEST kill_brick $V0 $H0 $B0/$V0$idx + + EXPECT "1024" stat -c "%s" $dir/small + EXPECT "4194304" stat -c "%s" $dir/big + EXPECT "$cs_small" echo $(sha1sum $dir/small | awk '{ print $1 }') + EXPECT "$cs_big" echo $(sha1sum $dir/big | awk '{ print $1 }') + + cd + TEST umount $M0 + TEST $CLI volume stop $V0 force + TEST $CLI volume start $V0 + TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + cd $M0 + done + + for size in $SIZE_LIST; do + TEST truncate -s $size $dir/small + TEST [ -f $dir/small ] + fsize=$(fragment_size $size) + EXPECT "$size" stat -c "%s" $dir/small + for idx in `seq 0 $LAST_BRICK`; do + EXPECT "$fsize" stat -c "%s" ${brick[$idx]}/$dir/small + done + + EXPECT "${cs_small_truncate[$size]}" echo $(sha1sum $dir/small | awk '{ print $1 }') + + TEST truncate -s $size $dir/big + TEST [ -f $dir/big ] + EXPECT "$size" stat -c "%s" $dir/big + for idx in `seq 0 $LAST_BRICK`; do + EXPECT "$fsize" stat -c "%s" ${brick[$idx]}/$dir/big + done + + EXPECT "${cs_big_truncate[$size]}" echo $(sha1sum $dir/big | awk '{ print $1 }') + done + + TEST rm -f $dir/small + TEST [ ! -e $dir/small ] + for idx in `seq 0 $LAST_BRICK`; do + TEST [ ! -e ${brick[$idx]}/$dir/small ] + done + + TEST rm -f $dir/big + TEST [ ! -e $dir/big ] + for idx in `seq 0 $LAST_BRICK`; do + TEST [ ! 
-e ${brick[$idx]}/$dir/big ] + done +done + +TEST rmdir dir1 +TEST [ ! -e dir1 ] +for idx in `seq 0 $LAST_BRICK`; do + TEST [ ! -e ${brick[$idx]}/dir1 ] +done + +TEST rm -f file1 +TEST [ ! -e file1 ] +for idx in `seq 0 $LAST_BRICK`; do + TEST [ ! -e ${brick[$idx]}/file1 ] +done + +rm -rf $tmp + +cleanup diff --git a/tests/basic/ec/ec.t b/tests/basic/ec/ec.t new file mode 100644 index 00000000000..e81de0d97bd --- /dev/null +++ b/tests/basic/ec/ec.t @@ -0,0 +1,233 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +TEST_USER=test-ec-user +TEST_UID=27341 + +function my_getfattr { + getfattr --only-values -e text $* 2> /dev/null +} + +function get_rep_count { + v=$(my_getfattr -n trusted.nsr.rep-count $1) + #echo $v > /dev/tty + echo $v +} + +function create_file { + dd if=/dev/urandom of=$1 bs=4k count=$2 conv=sync 2> /dev/null +} + +function setup_perm_file { + mkdir $1/perm_dir || return 1 + chown ${TEST_USER} $1/perm_dir || return 1 + su ${TEST_USER} -c "touch $1/perm_dir/perm_file" || return 1 + return 0 +} + +# Functions to check repair for specific operation types. + +function check_create_write { + for b in $*; do + cmp $tmpdir/create-write $b/create-write || return 1 + done + return 0 +} + +function check_truncate { + truncate --size=8192 $tmpdir/truncate + for b in $*; do + cmp $tmpdir/truncate $b/truncate || return 1 + done + return 0 +} + +function check_hard_link { + for b in $*; do + inum1=$(ls -i $b/hard-link-1 | cut -d' ' -f1) + inum2=$(ls -i $b/hard-link-2 | cut -d' ' -f1) + [ "$inum1" = "$inum2" ] || return 1 + done + echo "Y" + return 0 +} + +function check_soft_link { + for b in $*; do + [ "$(readlink $b/soft-link)" = "soft-link-tgt" ] || return 1 + done + echo "Y" + return 0 +} + +function check_unlink { + for b in $*; do + [ ! 
-e $b/unlink ] || return 1 + done + echo "Y" + return 0 +} + +function check_mkdir { + for b in $*; do + [ -d $b/mkdir ] || return 1 + done + echo "Y" + return 0 +} + +function check_rmdir { + for b in $*; do + [ ! -e $b/rmdir ] || return 1 + done + echo "Y" + return 0 +} + +function check_setxattr { + for b in $*; do + v=$(my_getfattr -n user.foo $b/setxattr) + [ "$v" = "ash_nazg_durbatuluk" ] || return 1 + done + echo "Y" + return 0 +} + +function check_removexattr { + for b in $*; do + my_getfattr -n user.bar $b/removexattr 2> /dev/null + [ $? = 0 ] && return 1 + done + echo "Y" + return 0 +} + +function check_perm_file { + b1=$1 + shift 1 + ftext=$(stat -c "%u %g %a" $b1/perm_dir/perm_file) + #echo "first u/g/a = $ftext" > /dev/tty + for b in $*; do + btext=$(stat -c "%u %g %a" $b/perm_dir/perm_file) + #echo " next u/a/a = $btext" > /dev/tty + if [ x"$btext" != x"$ftext" ]; then + return 1 + fi + done + echo "Y" + return 0 +} + +cleanup + +TEST useradd -o -M -u ${TEST_UID} ${TEST_USER} +trap "userdel --force ${TEST_USER}" EXIT + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info + +TEST mkdir -p $B0/${V0}{0,1,2,3,4,5,6,7,8,9} +TEST $CLI volume create $V0 disperse 10 redundancy 2 $H0:$B0/${V0}{0,1,2,3,4,5,6,7,8,9} + +EXPECT "$V0" volinfo_field $V0 'Volume Name' +EXPECT 'Created' volinfo_field $V0 'Status' +EXPECT '10' brick_count $V0 + +TEST $CLI volume start $V0 +EXPECT 'Started' volinfo_field $V0 'Status' + +# Mount FUSE with caching disabled +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# Create local files for comparisons etc. +tmpdir=$(mktemp -d) +trap "rm -rf $tmpdir" EXIT +TEST create_file $tmpdir/create-write 10 +TEST create_file $tmpdir/truncate 10 + +# Prepare files and directories we'll need later. 
+TEST cp $tmpdir/truncate $M0/ +TEST touch $M0/hard-link-1 +TEST touch $M0/unlink +TEST mkdir $M0/rmdir +TEST touch $M0/setxattr +TEST touch $M0/removexattr +TEST setfattr -n user.bar -v "ash_nazg_gimbatul" $M0/removexattr + +# Kill a couple of bricks and allow some time for things to settle. +TEST kill_brick $V0 $H0 $B0/${V0}3 +TEST kill_brick $V0 $H0 $B0/${V0}8 +sleep 10 + +# Test create+write +TEST cp $tmpdir/create-write $M0/ +# Test truncate +TEST truncate --size=8192 $M0/truncate +# Test hard link +TEST ln $M0/hard-link-1 $M0/hard-link-2 +# Test soft link +TEST ln -s soft-link-tgt $M0/soft-link +# Test unlink +TEST rm $M0/unlink +# Test rmdir +TEST rmdir $M0/rmdir +# Test mkdir +TEST mkdir $M0/mkdir +# Test setxattr +TEST setfattr -n user.foo -v "ash_nazg_durbatuluk" $M0/setxattr +# Test removexattr +TEST setfattr -x user.bar $M0/removexattr +# Test uid/gid behavior +TEST setup_perm_file $M0 + +# Unmount/remount so that create/write and truncate don't see cached data. +TEST umount $M0 +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# Test create/write and truncate *before* the bricks are brought back. +TEST check_create_write $M0 +TEST check_truncate $M0 + +# Restart the bricks and allow repair to occur. +TEST $CLI volume start $V0 force +sleep 10 + +# Unmount/remount again, same reason as before. +TEST umount $M0 +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# Make sure everything is as it should be. Most tests check for consistency +# between the bricks and the front end. This is not valid for disperse, so we +# check the mountpoint state instead. + +TEST check_create_write $M0 +TEST check_truncate $M0 + +TEST stat $M0/hard-link-1 +TEST stat $M0/hard-link-2 +TEST stat $M0/soft-link +TEST ! stat $M0/unlink +TEST ! 
stat $M0/rmdir +TEST stat $M0/mkdir +TEST stat $M0/setxattr +TEST stat $M0/removexattr +TEST stat $M0/perm_dir +TEST stat $M0/perm_dir/perm_file + +EXPECT_WITHIN 5 "Y" check_hard_link $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_soft_link $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_unlink $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_rmdir $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_mkdir $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_setxattr $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_removexattr $B0/${V0}{0..9} +EXPECT_WITHIN 5 "Y" check_perm_file $B0/${V0}{0..9} + +rm -rf $tmpdir +userdel --force ${TEST_USER} + +cleanup + diff --git a/tests/basic/ec/self-heal.t b/tests/basic/ec/self-heal.t new file mode 100644 index 00000000000..99cfd9420aa --- /dev/null +++ b/tests/basic/ec/self-heal.t @@ -0,0 +1,123 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +# This test checks self-healing feature of dispersed volumes + +cleanup + +tmp=`mktemp -d` +if [ ! -d $tmp ]; then + exit 1 +fi + +TESTS_EXPECTED_IN_LOOP=85 + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 redundancy 2 $H0:$B0/${V0}{0..5} +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 + +TEST dd if=/dev/urandom of=$tmp/test bs=1024 count=1024 + +cs=$(sha1sum $tmp/test | awk '{ print $1 }') + +TEST df -h +TEST stat $M0 + +for idx in {0..5}; do + brick[$idx]=$(gf_get_gfid_backend_file_path $B0/$V0$idx) +done + +cd $M0 +TEST cp $tmp/test test +TEST chmod 644 test +EXPECT "-rw-r--r--" stat -c "%A" test + +for idx1 in {0..5}; do + TEST chmod 666 ${brick[$idx1]}/test + sleep 1 + EXPECT "-rw-r--r--" stat -c "%A" test + EXPECT_WITHIN 5 "-rw-r--r--" stat -c "%A" ${brick[$idx1]}/test +done + +for idx1 in {0..4}; do + for idx2 in `seq $(($idx1 + 1)) 5`; do + if [ $idx1 -ne $idx2 ]; then + TEST chmod 666 ${brick[$idx1]}/test + TEST chmod 600 ${brick[$idx2]}/test + sleep 1 + EXPECT "-rw-r--r--" stat -c 
"%A" test + EXPECT_WITHIN 5 "-rw-r--r--" stat -c "%A" ${brick[$idx1]}/test + EXPECT_WITHIN 5 "-rw-r--r--" stat -c "%A" ${brick[$idx2]}/test + fi + done +done + +TEST truncate -s 0 ${brick[0]}/test +TEST truncate -s 2097152 ${brick[1]}/test +TEST setfattr -n user.test -v "test1" ${brick[0]}/test +TEST setfattr -n user.test -v "test2" ${brick[1]}/test +TEST chmod 600 ${brick[0]}/test +TEST chmod 666 ${brick[1]}/test +sleep 1 + +EXPECT "1048576" stat -c "%s" test +TEST ! getfattr -n user.test test + +EXPECT_WITHIN 5 "262144" stat -c "%s" ${brick[0]}/test +EXPECT_WITHIN 5 "262144" stat -c "%s" ${brick[1]}/test +TEST ! getfattr -n user.test ${brick[0]}/test +TEST ! getfattr -n user.test ${brick[1]}/test +EXPECT "-rw-r--r--" stat -c "%A" ${brick[0]}/test +EXPECT "-rw-r--r--" stat -c "%A" ${brick[1]}/test + +TEST kill_brick $V0 $H0 $B0/${V0}0 +TEST kill_brick $V0 $H0 $B0/${V0}1 +TEST cp $tmp/test test2 +EXPECT "1048576" stat -c "%s" test2 +TEST chmod 777 test2 +EXPECT "-rwxrwxrwx" stat -c "%A" test2 + +TEST mkdir dir1 +TEST ls -al dir1 + +TEST ln -s test2 test3 +TEST [ -h test3 ] + +TEST ln test2 test4 +TEST [ -f test4 ] +EXPECT "2" stat -c "%h" test2 +EXPECT "2" stat -c "%h" test4 + +cd +TEST umount $M0 +TEST $CLI volume stop $V0 force +TEST $CLI volume start $V0 +TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +cd $M0 + +EXPECT "1048576" stat -c "%s" test2 +EXPECT "-rwxrwxrwx" stat -c "%A" test2 +EXPECT_WITHIN 5 "262144" stat -c "%s" ${brick[0]}/test2 +EXPECT_WITHIN 5 "262144" stat -c "%s" ${brick[1]}/test2 +EXPECT "-rwxrwxrwx" stat -c "%A" ${brick[0]}/test2 +EXPECT "-rwxrwxrwx" stat -c "%A" ${brick[1]}/test2 + +TEST ls -al dir1 +EXPECT_WITHIN 5 "1" eval "if [ -d ${brick[0]}/dir1 ]; then echo 1; fi" +EXPECT_WITHIN 5 "1" eval "if [ -d ${brick[1]}/dir1 ]; then echo 1; fi" + +TEST [ -h test3 ] +EXPECT_WITHIN 5 "1" eval "if [ -h ${brick[0]}/test3 ]; then echo 1; fi" +EXPECT_WITHIN 5 "1" eval "if [ -h ${brick[1]}/test3 ]; 
then echo 1; fi" + +EXPECT "2" stat -c "%h" test4 +EXPECT_WITHIN 5 "3" stat -c "%h" ${brick[0]}/test4 +EXPECT_WITHIN 5 "3" stat -c "%h" ${brick[1]}/test4 + +rm -rf $tmp + +cleanup diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c index 452df759ad4..089c7d637c9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c @@ -169,6 +169,12 @@ gd_addbr_validate_stripe_count (glusterd_volinfo_t *volinfo, int stripe_count, } } break; + case GF_CLUSTER_TYPE_DISPERSE: + snprintf (err_str, err_len, "Volume %s cannot be converted " + "from dispersed to striped-" + "dispersed", volinfo->volname); + gf_log(THIS->name, GF_LOG_ERROR, "%s", err_str); + goto out; } out: @@ -259,6 +265,12 @@ gd_addbr_validate_replica_count (glusterd_volinfo_t *volinfo, int replica_count, } } break; + case GF_CLUSTER_TYPE_DISPERSE: + snprintf (err_str, err_len, "Volume %s cannot be converted " + "from dispersed to replicated-" + "dispersed", volinfo->volname); + gf_log(THIS->name, GF_LOG_ERROR, "%s", err_str); + goto out; } out: return ret; @@ -276,6 +288,7 @@ gd_rmbr_validate_replica_count (glusterd_volinfo_t *volinfo, switch (volinfo->type) { case GF_CLUSTER_TYPE_NONE: case GF_CLUSTER_TYPE_STRIPE: + case GF_CLUSTER_TYPE_DISPERSE: snprintf (err_str, err_len, "replica count (%d) option given for non replicate " "volume %s", replica_count, volinfo->volname); @@ -737,6 +750,8 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req) strcpy (vol_type, "stripe"); } else if (volinfo->type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) { strcpy (vol_type, "stripe-replicate"); + } else if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) { + strcpy (vol_type, "disperse"); } else { strcpy (vol_type, "distribute"); } diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c index ed4bd60f88b..e10dc22b56b 100644 --- a/xlators/mgmt/glusterd/src/glusterd-handler.c +++ 
b/xlators/mgmt/glusterd/src/glusterd-handler.c @@ -398,6 +398,16 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo, if (ret) goto out; + snprintf (key, 256, "volume%d.disperse_count", count); + ret = dict_set_int32 (volumes, key, volinfo->disperse_count); + if (ret) + goto out; + + snprintf (key, 256, "volume%d.redundancy_count", count); + ret = dict_set_int32 (volumes, key, volinfo->redundancy_count); + if (ret) + goto out; + snprintf (key, 256, "volume%d.transport", count); ret = dict_set_int32 (volumes, key, volinfo->transport_type); if (ret) diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c index c31d8a8ad71..086a6550a72 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.c +++ b/xlators/mgmt/glusterd/src/glusterd-store.c @@ -844,6 +844,18 @@ glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo) if (ret) goto out; + snprintf (buf, sizeof (buf), "%d", volinfo->disperse_count); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT, + buf); + if (ret) + goto out; + + snprintf (buf, sizeof (buf), "%d", volinfo->redundancy_count); + ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT, + buf); + if (ret) + goto out; + snprintf (buf, sizeof (buf), "%d", volinfo->version); ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION, buf); if (ret) @@ -2618,6 +2630,12 @@ glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo) } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT, strlen (GLUSTERD_STORE_KEY_VOL_REPLICA_CNT))) { volinfo->replica_count = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT, + strlen (GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT))) { + volinfo->disperse_count = atoi (value); + } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT, + strlen (GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT))) { + volinfo->redundancy_count = atoi (value); } else if (!strncmp (key, 
GLUSTERD_STORE_KEY_VOL_TRANSPORT, strlen (GLUSTERD_STORE_KEY_VOL_TRANSPORT))) { volinfo->transport_type = atoi (value); @@ -2754,6 +2772,11 @@ glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo) GF_ASSERT (volinfo->replica_count > 0); break; + case GF_CLUSTER_TYPE_DISPERSE: + GF_ASSERT (volinfo->disperse_count > 0); + GF_ASSERT (volinfo->redundancy_count > 0); + break; + default: GF_ASSERT (0); break; diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h index 89cf24de789..fb7de7b1b10 100644 --- a/xlators/mgmt/glusterd/src/glusterd-store.h +++ b/xlators/mgmt/glusterd/src/glusterd-store.h @@ -44,6 +44,8 @@ typedef enum glusterd_store_ver_ac_{ #define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count" #define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count" #define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count" +#define GLUSTERD_STORE_KEY_VOL_DISPERSE_CNT "disperse_count" +#define GLUSTERD_STORE_KEY_VOL_REDUNDANCY_CNT "redundancy_count" #define GLUSTERD_STORE_KEY_VOL_BRICK "brick" #define GLUSTERD_STORE_KEY_VOL_VERSION "version" #define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type" diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index dc923b1eeb4..aff2356eb4f 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -548,6 +548,8 @@ glusterd_volinfo_dup (glusterd_volinfo_t *volinfo, new_volinfo->type = volinfo->type; new_volinfo->replica_count = volinfo->replica_count; new_volinfo->stripe_count = volinfo->stripe_count; + new_volinfo->disperse_count = volinfo->disperse_count; + new_volinfo->redundancy_count = volinfo->redundancy_count; new_volinfo->dist_leaf_count = volinfo->dist_leaf_count; new_volinfo->sub_count = volinfo->sub_count; new_volinfo->transport_type = volinfo->transport_type; @@ -2525,6 +2527,18 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo, goto out; memset (key, 0, sizeof (key)); + 
snprintf (key, sizeof (key), "%s%d.disperse_count", prefix, count); + ret = dict_set_int32 (dict, key, volinfo->disperse_count); + if (ret) + goto out; + + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.redundancy_count", prefix, count); + ret = dict_set_int32 (dict, key, volinfo->redundancy_count); + if (ret) + goto out; + + memset (key, 0, sizeof (key)); snprintf (key, sizeof (key), "%s%d.dist_count", prefix, count); ret = dict_set_int32 (dict, key, volinfo->dist_leaf_count); if (ret) @@ -4206,6 +4220,24 @@ glusterd_import_volinfo (dict_t *peer_data, int count, gf_log (THIS->name, GF_LOG_INFO, "peer is possibly old version"); + /* not having a 'disperse_count' key is not a error + (as peer may be of old version) */ + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.disperse_count", prefix, count); + ret = dict_get_int32 (peer_data, key, &new_volinfo->disperse_count); + if (ret) + gf_log (THIS->name, GF_LOG_INFO, + "peer is possibly old version"); + + /* not having a 'redundancy_count' key is not a error + (as peer may be of old version) */ + memset (key, 0, sizeof (key)); + snprintf (key, sizeof (key), "%s%d.redundancy_count", prefix, count); + ret = dict_get_int32 (peer_data, key, &new_volinfo->redundancy_count); + if (ret) + gf_log (THIS->name, GF_LOG_INFO, + "peer is possibly old version"); + /* not having a 'dist_count' key is not a error (as peer may be of old version) */ memset (key, 0, sizeof (key)); @@ -6932,6 +6964,9 @@ glusterd_get_dist_leaf_count (glusterd_volinfo_t *volinfo) int rcount = volinfo->replica_count; int scount = volinfo->stripe_count; + if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) + return volinfo->disperse_count; + return (rcount ? rcount : 1) * (scount ? 
scount : 1); } @@ -11694,6 +11729,13 @@ gd_update_volume_op_versions (glusterd_volinfo_t *volinfo) } } + if (volinfo->type == GF_CLUSTER_TYPE_DISPERSE) { + if (volinfo->op_version < GD_OP_VERSION_3_6_0) + volinfo->op_version = GD_OP_VERSION_3_6_0; + if (volinfo->client_op_version < GD_OP_VERSION_3_6_0) + volinfo->client_op_version = GD_OP_VERSION_3_6_0; + } + return; } @@ -12774,7 +12816,7 @@ glusterd_volume_quorum_calculate (glusterd_volinfo_t *volinfo, dict_t *dict, goto out; } - up_count = volinfo->replica_count - down_count; + up_count = volinfo->dist_leaf_count - down_count; if (quorum_type && !strcmp (quorum_type, "fixed")) { if (up_count >= quorum_count) { @@ -12782,7 +12824,8 @@ glusterd_volume_quorum_calculate (glusterd_volinfo_t *volinfo, dict_t *dict, goto out; } } else { - if (volinfo->replica_count % 2 == 0) { + if ((GF_CLUSTER_TYPE_DISPERSE != volinfo->type) && + (volinfo->dist_leaf_count % 2 == 0)) { if ((up_count > quorum_count) || ((up_count == quorum_count) && first_brick_on)) { quorum_met = _gf_true; @@ -12835,8 +12878,9 @@ glusterd_volume_quorum_check (glusterd_volinfo_t *volinfo, int64_t index, goto out; } - if (!glusterd_is_volume_replicate (volinfo) || - volinfo->replica_count < 3) { + if ((!glusterd_is_volume_replicate (volinfo) || + volinfo->replica_count < 3) && + (GF_CLUSTER_TYPE_DISPERSE != volinfo->type)) { for (i = 0; i < volinfo->brick_count ; i++) { /* for a pure distribute volume, and replica volume with replica count 2, quorum is not met if even @@ -12858,7 +12902,8 @@ glusterd_volume_quorum_check (glusterd_volinfo_t *volinfo, int64_t index, ret = 0; quorum_met = _gf_true; } else { - distribute_subvols = volinfo->brick_count / volinfo->replica_count; + distribute_subvols = volinfo->brick_count / + volinfo->dist_leaf_count; for (j = 0; j < distribute_subvols; j++) { // by default assume quorum is not met /* TODO: Handle distributed striped replicate volumes @@ -12867,11 +12912,11 @@ glusterd_volume_quorum_check (glusterd_volinfo_t 
*volinfo, int64_t index, */ ret = 1; quorum_met = _gf_false; - for (i = 0; i < volinfo->replica_count; i++) { + for (i = 0; i < volinfo->dist_leaf_count; i++) { snprintf (key, sizeof (key), "%s%"PRId64".brick%"PRId64".status", key_prefix, index, - (j * volinfo->replica_count) + i); + (j * volinfo->dist_leaf_count) + i); ret = dict_get_int32 (dict, key, &brick_online); if (ret || !brick_online) { if (i == 0) @@ -13043,6 +13088,9 @@ glusterd_snap_quorum_check_for_create (dict_t *dict, gf_boolean_t snap_volume, else quorum_count = volinfo->replica_count/2 + 1; + } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) { + quorum_count = volinfo->disperse_count - + volinfo->redundancy_count; } else { quorum_count = volinfo->brick_count; } @@ -13061,8 +13109,22 @@ glusterd_snap_quorum_check_for_create (dict_t *dict, gf_boolean_t snap_volume, if the quorum-type option is not set to auto, the behavior is set to the default behavior) */ - if (!ret) - quorum_count = tmp; + if (!ret) { + /* for dispersed volumes, only allow quorums + equal or larger than minimum functional + value. 
+ */ + if ((GF_CLUSTER_TYPE_DISPERSE != + volinfo->type) || + (tmp >= quorum_count)) { + quorum_count = tmp; + } else { + gf_log(this->name, GF_LOG_INFO, + "Ignoring small quorum-count " + "(%d) on dispersed volume", tmp); + quorum_type = NULL; + } + } else quorum_type = NULL; } diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index 6ab899a16cf..9701c6b939c 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -2684,10 +2684,14 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph, "%s-replicate-%d"}; char *stripe_args[] = {"cluster/stripe", "%s-stripe-%d"}; + char *disperse_args[] = {"cluster/disperse", + "%s-disperse-%d"}; + char option[32] = ""; int rclusters = 0; int clusters = 0; int dist_count = 0; int ret = -1; + xlator_t * ec = NULL; if (!volinfo->dist_leaf_count) goto out; @@ -2737,6 +2741,26 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph, if (clusters < 0) goto out; break; + case GF_CLUSTER_TYPE_DISPERSE: + clusters = volgen_graph_build_clusters (graph, volinfo, + disperse_args[0], + disperse_args[1], + volinfo->brick_count, + volinfo->disperse_count); + if (clusters < 0) + goto out; + + sprintf(option, "%d", volinfo->redundancy_count); + ec = first_of (graph); + while (clusters-- > 0) { + ret = xlator_set_option (ec, "redundancy", option); + if (ret) + goto out; + + ec = ec->next; + } + + break; default: gf_log ("", GF_LOG_ERROR, "volume inconsistency: " "unrecognized clustering type"); diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index 53beebe0555..f23a9eb96b7 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -1689,6 +1689,27 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) "replica count for volume %s", volname); goto out; } + } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) 
{ + ret = dict_get_int32 (dict, "disperse-count", + &volinfo->disperse_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "disperse count for volume %s", volname); + goto out; + } + ret = dict_get_int32 (dict, "redundancy-count", + &volinfo->redundancy_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get " + "redundancy count for volume %s", volname); + goto out; + } + if (priv->op_version < GD_OP_VERSION_3_6_0) { + gf_log (this->name, GF_LOG_ERROR, "Disperse volume " + "needs op-version 3.6.0 or higher"); + ret = -1; + goto out; + } } /* dist-leaf-count is the count of brick nodes for a given diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index a8ecb505a5b..ddbb2c81338 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -336,6 +336,8 @@ struct glusterd_volinfo_ { int sub_count; /* backward compatibility */ int stripe_count; int replica_count; + int disperse_count; + int redundancy_count; int subvol_count; /* Number of subvolumes in a distribute volume */ int dist_leaf_count; /* Number of bricks in one |