diff options
| -rw-r--r-- | tests/basic/afr/afr-read-hash-mode.t | 56 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 86 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-mem-types.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 39 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 6 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 14 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 5 | 
7 files changed, 175 insertions, 32 deletions
diff --git a/tests/basic/afr/afr-read-hash-mode.t b/tests/basic/afr/afr-read-hash-mode.t new file mode 100644 index 00000000000..eeff10d8ebd --- /dev/null +++ b/tests/basic/afr/afr-read-hash-mode.t @@ -0,0 +1,56 @@ +#!/bin/bash + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc + +cleanup + +function reads_brick_count { +        $CLI volume profile $V0 info incremental | grep -w READ | wc -l +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0..2} + +TEST $CLI volume set $V0 cluster.choose-local off +TEST $CLI volume set $V0 performance.quick-read off +TEST $CLI volume set $V0 performance.io-cache off +TEST $CLI volume set $V0 performance.write-behind off +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume set $V0 performance.read-ahead off +TEST $CLI volume start $V0 + +# Disable all caching +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 +TEST dd if=/dev/urandom of=$M0/FILE bs=1M count=8 +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +# TEST if the option gives the intended behavior. The way we perform this test +# is by performing reads from the mount and write to /dev/null. If the +# read-hash-mode is 3, then for a given file, more than 1 brick should serve the +# read-fops where as with the default read-hash-mode (i.e. 1), only 1 brick will. 
+ +# read-hash-mode=1 +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 +EXPECT "1" mount_get_option_value $M0 $V0-replicate-0 read-hash-mode +TEST $CLI volume profile $V0 start +TEST dd if=$M0/FILE of=/dev/null bs=1M +count=`reads_brick_count` +TEST [ $count -eq 1 ] +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +# read-hash-mode=3 +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 +TEST $CLI volume set $V0 cluster.read-hash-mode 3 +EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "3" mount_get_option_value $M0 $V0-replicate-0 read-hash-mode +TEST $CLI volume profile $V0 info clear +TEST dd if=$M0/FILE of=/dev/null bs=1M +count=`reads_brick_count` +TEST [ $count -eq 2 ] + +# Check that the arbiter did not serve any reads +arbiter_reads=$($CLI volume top $V0 read brick $H0:$B0/${V0}2|grep FILE|awk '{print $1}') +TEST [ -z $arbiter_reads ] + +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index bfd8c2e8c2c..685a349ac49 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1630,38 +1630,69 @@ out:          return ret;  } -  int -afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode) +afr_least_pending_reads_child (afr_private_t *priv)  { -        uuid_t gfid_copy = {0,}; -        pid_t pid; +        int i = 0; +        int child = 0; +        int64_t read_iter = -1; +        int64_t pending_read = -1; -        if (!hashmode) { -                return -1; +        pending_read = GF_ATOMIC_GET (priv->pending_reads[0]); +        for (i = 1; i < priv->child_count; i++) { +                if (AFR_IS_ARBITER_BRICK(priv, i)) +                        continue; +                read_iter =  GF_ATOMIC_GET(priv->pending_reads[i]); +                if (read_iter < pending_read) { +                        pending_read = read_iter; +                        child = i; +                }          } -     
   gf_uuid_copy (gfid_copy, args->gfid); +        return child; +} -        if ((hashmode > 1) && (args->ia_type != IA_IFDIR)) { -                /* -                 * Why getpid?  Because it's one of the cheapest calls -                 * available - faster than gethostname etc. - and returns a -                 * constant-length value that's sure to be shorter than a UUID. -                 * It's still very unlikely to be the same across clients, so -                 * it still provides good mixing.  We're not trying for -                 * perfection here.  All we need is a low probability that -                 * multiple clients won't converge on the same subvolume. -                 */ -                pid = getpid(); -                memcpy (gfid_copy, &pid, sizeof(pid)); +int +afr_hash_child (afr_read_subvol_args_t *args, afr_private_t *priv) +{ +        uuid_t gfid_copy = {0,}; +        pid_t pid; +        int child = -1; + +        gf_uuid_copy (gfid_copy, args->gfid); /* modes 1 and 2 hash the GFID */ +        switch (priv->hash_mode) { +        case 0: +                break; +        case 1: +                child = SuperFastHash((char *)gfid_copy, +                                      sizeof(gfid_copy)) % priv->child_count; +                break; +        case 2: +                if (args->ia_type != IA_IFDIR) { +                        /* +                         * Why getpid?  Because it's one of the cheapest calls +                         * available - faster than gethostname etc. - and +                         * returns a constant-length value that's sure to be +                         * shorter than a UUID. It's still very unlikely to be +                         * the same across clients, so it still provides good +                         * mixing.  We're not trying for perfection here. All we +                         * need is a low probability that multiple clients +                         * won't converge on the same subvolume. 
+                         */ +                        pid = getpid(); +                        memcpy (gfid_copy, &pid, sizeof(pid)); +                } +                child = SuperFastHash((char *)gfid_copy, +                                      sizeof(gfid_copy)) % priv->child_count; +                break; +        case 3: +                child = afr_least_pending_reads_child (priv); +                break;          } -        return SuperFastHash((char *)gfid_copy, -                             sizeof(gfid_copy)) % child_count; +        return child;  } -  int  afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,  				  unsigned char *readable, @@ -1686,8 +1717,7 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,          }  	/* second preference - use hashed mode */ -	read_subvol = afr_hash_child (&local_args, priv->child_count, -                                      priv->hash_mode); +        read_subvol = afr_hash_child (&local_args, priv);  	if (read_subvol >= 0 && readable[read_subvol])                  return read_subvol; @@ -4611,6 +4641,8 @@ afr_priv_dump (xlator_t *this)                  gf_proc_dump_write(key, "%d", priv->child_up[i]);                  sprintf (key, "pending_key[%d]", i);                  gf_proc_dump_write(key, "%s", priv->pending_key[i]); +                sprintf (key, "pending_reads[%d]", i); +                gf_proc_dump_write(key, "%"PRId64, GF_ATOMIC_GET(priv->pending_reads[i]));          }          gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal);          gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); @@ -4623,6 +4655,7 @@ afr_priv_dump (xlator_t *this)          gf_proc_dump_write("background-self-heal-count", "%d",                             priv->background_self_heal_count);          gf_proc_dump_write("healers", "%d", priv->healers); +        gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);          if (priv->quorum_count == 
AFR_QUORUM_AUTO) {                  gf_proc_dump_write ("quorum-type", "auto");          } else if (priv->quorum_count == 0) { @@ -5325,6 +5358,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)                  goto out;          } +        local->read_subvol = -1; +  	local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),  				   gf_afr_mt_reply_t);  	if (!local->replies) { @@ -5474,9 +5509,12 @@ afr_priv_destroy (afr_private_t *priv)                  for (i = 0; i < priv->child_count; i++)                          GF_FREE (priv->pending_key[i]);          } +        GF_FREE (priv->pending_reads); +        GF_FREE (priv->local);          GF_FREE (priv->pending_key);          GF_FREE (priv->children);          GF_FREE (priv->child_up); +        GF_FREE (priv->child_latency);          LOCK_DESTROY (&priv->lock);          GF_FREE (priv); diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index c7d6261b110..2e1117fc18c 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -47,6 +47,7 @@ enum gf_afr_mem_types_ {          gf_afr_mt_spb_status_t,          gf_afr_mt_empty_brick_t,          gf_afr_mt_child_latency_t, +        gf_afr_mt_atomic_t,      gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index f6c491b713e..a8a4090efd1 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -12,6 +12,39 @@  #include "afr-transaction.h"  #include "afr-messages.h" +void +afr_pending_read_increment (afr_private_t *priv, int child_index) +{ +        if (child_index < 0 || child_index >= priv->child_count) +                return; + +        GF_ATOMIC_INC(priv->pending_reads[child_index]); +} + +void +afr_pending_read_decrement (afr_private_t *priv, int child_index) +{ +        if (child_index < 0 || child_index >= priv->child_count) +     
           return; + +        GF_ATOMIC_DEC(priv->pending_reads[child_index]); +} + +void +afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol) +{ +        afr_local_t *local = NULL; +        afr_private_t *priv = NULL; + +        local = frame->local; +        priv = this->private; + +        afr_pending_read_decrement (priv, local->read_subvol); +        local->read_subvol = subvol; +        afr_pending_read_increment (priv, subvol); +        local->readfn (frame, this, subvol); +} +  int  afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)  { @@ -43,7 +76,7 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)  	   readable subvols. */  	if (subvol != -1)  		local->read_attempted[subvol] = 1; -	local->readfn (frame, this, subvol); +	afr_read_txn_wind (frame, this, subvol);  	return 0;  } @@ -89,7 +122,7 @@ readfn:          if (read_subvol == -1) {                  AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN (-1, -err);          } -	local->readfn (frame, this, read_subvol); +        afr_read_txn_wind (frame, this, read_subvol);  	return 0;  } @@ -246,7 +279,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,  	local->read_attempted[read_subvol] = 1;  read: -	local->readfn (frame, this, read_subvol); +	afr_read_txn_wind (frame, this, read_subvol);  	return 0; diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index a27e9a3c0b4..cb62c185938 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -37,6 +37,12 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,  int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); +void +afr_pending_read_increment (afr_private_t *priv, int child_index); + +void +afr_pending_read_decrement (afr_private_t *priv, int child_index); +  call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);  gf_boolean_t afr_has_quorum (unsigned char 
*subvols, xlator_t *this);  gf_boolean_t afr_needs_changelog_update (afr_local_t *local); diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index cfba5d5d3c9..22ce0a35ece 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -429,6 +429,9 @@ init (xlator_t *this)          }          GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out); +        priv->pending_reads = GF_CALLOC (sizeof(*priv->pending_reads), +                                         priv->child_count, gf_afr_mt_atomic_t); +          GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out);          priv->favorite_child = -1; @@ -703,18 +706,19 @@ struct volume_options options[] = {          { .key = {"read-hash-mode" },            .type = GF_OPTION_TYPE_INT,            .min = 0, -          .max = 2, +          .max = 3,            .default_value = "1",            .op_version = {2},            .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,            .tags = {"replicate"},            .description = "inode-read fops happen only on one of the bricks in "                           "replicate. AFR will prefer the one computed using " -                         "the method specified using this option" -                         "0 = first up server, " +                         "the method specified using this option.\n" +                         "0 = first readable child of AFR, starting from 1st child.\n"                           "1 = hash by GFID of file (all clients use " -                                                    "same subvolume), " -                         "2 = hash by GFID of file and client PID", +                                                    "same subvolume).\n" +                         "2 = hash by GFID of file and client PID.\n" +                         "3 = brick having the least outstanding read requests."          
},          { .key  = {"choose-local" },            .type = GF_OPTION_TYPE_BOOL, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index b2f3af136bd..129670517f3 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -113,6 +113,7 @@ typedef struct _afr_private {  	gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */          int read_child;               /* read-subvolume */          unsigned int hash_mode;       /* for when read_child is not set */ +        gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/          int favorite_child;  /* subvolume to be preferred in resolving                                           split-brain cases */ @@ -425,6 +426,8 @@ typedef struct _afr_local {  	unsigned char *readable;  	unsigned char *readable2; /*For rename transaction*/ +        int read_subvol; /* Current read subvolume */ +  	afr_inode_refresh_cbk_t refreshfn;  	/* @refreshinode: @@ -974,6 +977,8 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);                          __this = frame->this;                   \                          afr_handle_inconsistent_fop (frame, &__op_ret,\                                                       &__op_errno);\ +                        if (__local && __local->is_read_txn) \ +                                afr_pending_read_decrement (__this->private, __local->read_subvol); \                          frame->local = NULL;                    \                  }                                               \                                                                  \  | 
