diff options
| author | Ravishankar N <ravishankar@redhat.com> | 2015-04-15 22:53:21 +0530 | 
|---|---|---|
| committer | Krishnan Parthasarathi <kparthas@redhat.com> | 2015-04-27 22:40:31 -0700 | 
| commit | d4889b2cfd29e6ecc911d2b29d1f85d516a66eaf (patch) | |
| tree | a702c6323d43561d79d2236b8cb7087e5207b2b7 | |
| parent | 70a729e9751e45e266f7462443dcf2b6be3cecbe (diff) | |
arbiter: load arbiter xlator on every 3rd brick of a replica 3 AFR subvol
Logic for adding the 'glusterd_brickinfo->group' member and using it to
find the brick position has been taken from http://review.gluster.org/#/c/9919.
Thanks to Jeff Darcy for that.
This patch is part of the arbiter logic implementation for 3-way AFR,
details of which can be found at http://review.gluster.org/#/c/9656/
Change-Id: Idbfe4f29ee8e098e0102def8f38b32314316b188
BUG: 1199985
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: http://review.gluster.org/10257
Tested-by: NetBSD Build System
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com>
Tested-by: Krishnan Parthasarathi <kparthas@redhat.com>
| -rw-r--r-- | libglusterfs/src/list.h | 14 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 6 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 2 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-utils.c | 1 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.c | 86 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volgen.h | 3 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 4 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd.h | 9 | 
8 files changed, 111 insertions, 14 deletions
diff --git a/libglusterfs/src/list.h b/libglusterfs/src/list.h index a860275a91e..875594136a2 100644 --- a/libglusterfs/src/list.h +++ b/libglusterfs/src/list.h @@ -256,4 +256,18 @@ static inline void list_replace_init(struct list_head *old,  	     &pos->member != (head);                                    \  	     pos = n, n = list_entry(n->member.prev, typeof(*n), member)) +/* + * This list implementation has some advantages, but one disadvantage: you + * can't use NULL to check whether you're at the head or tail.  Thus, the + * address of the head has to be an argument for these macros. + */ + +#define list_next(ptr, head, type, member)      \ +        (((ptr)->member.next == head) ? NULL    \ +                                 : list_entry((ptr)->member.next, type, member)) + +#define list_prev(ptr, head, type, member)      \ +        (((ptr)->member.prev == head) ? NULL    \ +                                 : list_entry((ptr)->member.prev, type, member)) +  #endif /* _LLIST_H */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index f962fb6494e..21575fed2de 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -274,6 +274,8 @@ init (xlator_t *this)          priv->read_child = -1; +        GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out); +  	GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out);  	GF_OPTION_INIT ("metadata-splitbrain-forced-heal", @@ -794,5 +796,9 @@ struct volume_options options[] = {                           "attributes from the same subvol as long as it holds "                           " a good copy of the file/dir.",          }, +        { .key = {"arbiter-count"}, +          .type = GF_OPTION_TYPE_INT, +          .description = "subset of child_count. Has to be 0 or 1." 
+        },          { .key  = {NULL} },  }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index de000e765ea..f7bc6ea0f94 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -55,6 +55,8 @@ typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);  typedef struct _afr_private {          gf_lock_t lock;               /* to guard access to child_count, etc */          unsigned int child_count;     /* total number of children   */ +        unsigned int arbiter_count;   /*subset of child_count. +                                        Has to be 0 or 1.*/          xlator_t **children; diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c index e23d2a35fe8..2103fd62e03 100644 --- a/xlators/mgmt/glusterd/src/glusterd-utils.c +++ b/xlators/mgmt/glusterd/src/glusterd-utils.c @@ -6028,6 +6028,7 @@ glusterd_recreate_volfiles (glusterd_conf_t *conf)          int                      op_ret = 0;          GF_ASSERT (conf); +          cds_list_for_each_entry (volinfo, &conf->volumes, vol_list) {                  ret = generate_brick_volfiles (volinfo);                  if (ret) { diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c index faaf5d59a48..a149e9916df 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.c +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c @@ -1502,12 +1502,26 @@ brick_graph_add_arbiter (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,                           dict_t *set_dict, glusterd_brickinfo_t *brickinfo)  {          xlator_t *xl = NULL; +        glusterd_brickinfo_t  *next = NULL; +        glusterd_brickinfo_t  *last = NULL;          int ret = -1;          if (volinfo->arbiter_count != 1)                  return 0; -        /*TODO: Parse brickinfo and add the arbiter xlator only if brick is the -         * last brick (i.e. 
3rd brick) of the replcia pair.*/ + +        /* Find the last brick in the same group. */ +        last = brickinfo; +        for (;;) { +                next = list_next (last, &volinfo->bricks, +                                  glusterd_brickinfo_t, brick_list); +                if (!next || (next->group != brickinfo->group)) { +                        break; +                } +                last = next; +        } +        if (last != brickinfo) +                return 0; +          xl = volgen_graph_add (graph, "features/arbiter", volinfo->volname);          if (!xl)                  goto out; @@ -1571,6 +1585,22 @@ out:          return ret;  } +void +assign_brick_groups (glusterd_volinfo_t *volinfo) +{ +        glusterd_brickinfo_t    *brickinfo      = NULL; +        uint16_t                group_num       = 0; +        int                     in_group        = 0; + +        list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) { +                brickinfo->group = group_num; +                if (++in_group >= volinfo->replica_count) { +                        in_group = 0; +                        ++group_num; +                } +        } +} +  static int  brick_graph_add_changelog (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,                              dict_t *set_dict, glusterd_brickinfo_t *brickinfo) @@ -3087,6 +3117,43 @@ out:  }  static int +volgen_graph_build_afr_clusters (volgen_graph_t *graph, +                                 glusterd_volinfo_t *volinfo) +{ +        int             i                    = 0; +        int             ret                  = 0; +        int             clusters             = 0; +        char            *replicate_args[]    = {"cluster/replicate", +                                                "%s-replicate-%d"}; +        xlator_t        *afr                 = NULL; +        char            option[32]           = {0}; + +        clusters = volgen_link_bricks_from_list_tail (graph, volinfo, +        
                                        replicate_args[0], +                                                replicate_args[1], +                                                volinfo->brick_count, +                                                volinfo->replica_count); +        if (clusters < 0) +                goto out; + +        if (!volinfo->arbiter_count) +                goto out; + +        afr = first_of (graph); +        sprintf(option, "%d", volinfo->arbiter_count); +        for (i = 0; i < clusters; i++) { +                ret = xlator_set_option (afr, "arbiter-count", option); +                if (ret) { +                        clusters = -1; +                        goto out; +                } +                afr = afr->next; +        } +out: +        return clusters; +} + +static int  volume_volgen_graph_build_clusters (volgen_graph_t *graph,                                      glusterd_volinfo_t *volinfo,                                      gf_boolean_t is_quotad) @@ -3116,13 +3183,7 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,          /* All other cases, it will have one or the other cluster type */          switch (volinfo->type) {          case GF_CLUSTER_TYPE_REPLICATE: -                clusters = volgen_link_bricks_from_list_tail -                        (graph, volinfo, -                         replicate_args[0], -                         replicate_args[1], -                         volinfo->brick_count, -                         volinfo->replica_count); - +                clusters = volgen_graph_build_afr_clusters (graph, volinfo);                  if (clusters < 0)                          goto out;                  break; @@ -3146,11 +3207,7 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,                  /* Replicate after the clients, then stripe */                  if (volinfo->replica_count == 0)                          goto out; -                clusters = volgen_link_bricks_from_list_tail 
(graph, volinfo, -                                                        replicate_args[0], -                                                        replicate_args[1], -                                                        volinfo->brick_count, -                                                        volinfo->replica_count); +                clusters = volgen_graph_build_afr_clusters (graph, volinfo);                  if (clusters < 0)                          goto out; @@ -4473,6 +4530,7 @@ generate_brick_volfiles (glusterd_volinfo_t *volinfo)          if (ret == -1)                  return -1; +        assign_brick_groups (volinfo);          get_vol_tstamp_file (tstamp_file, volinfo);          if (ret) { diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h index 02f8df0cf7d..4575049ada9 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volgen.h +++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h @@ -218,6 +218,9 @@ glusterd_check_voloption_flags (char *key, int32_t flags);  gf_boolean_t  glusterd_is_valid_volfpath (char *volname, char *brick); +void +assign_brick_groups (glusterd_volinfo_t *volinfo); +  int  generate_brick_volfiles (glusterd_volinfo_t *volinfo); diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c index a42f08c1600..de3045ffde3 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c @@ -1996,6 +1996,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)                                   "replica count for volume %s", volname);                          goto out;                  } +                ret = dict_get_int32 (dict, "arbiter-count", +                                      &volinfo->arbiter_count);          } else if (GF_CLUSTER_TYPE_STRIPE == volinfo->type) {                  ret = dict_get_int32 (dict, "stripe-count",                                        
&volinfo->stripe_count); @@ -2019,6 +2021,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)                                  "replica count for volume %s", volname);                          goto out;                  } +                ret = dict_get_int32 (dict, "arbiter-count", +                                      &volinfo->arbiter_count);          } else if (GF_CLUSTER_TYPE_DISPERSE == volinfo->type) {                  ret = dict_get_int32 (dict, "disperse-count",                                        &volinfo->disperse_count); diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h index ff63cce2234..60c3ebdf9bb 100644 --- a/xlators/mgmt/glusterd/src/glusterd.h +++ b/xlators/mgmt/glusterd/src/glusterd.h @@ -202,6 +202,15 @@ struct glusterd_brickinfo {          char vg[PATH_MAX]; /* FIXME: Use max size for length of vg */          int     caps; /* Capability */          int32_t            snap_status; +        /* +         * The group is used to identify which bricks are part of the same +         * replica set during brick-volfile generation, so that NSR volfiles +         * can "cross-connect" the bricks to one another. It is also used by +         * AFR to load the arbiter xlator in the appropriate brick in case of +         * a replica 3 volume with arbiter enabled. +         */ +        uint16_t           group; +  };  typedef struct glusterd_brickinfo glusterd_brickinfo_t;  | 
