diff options
author | Dan Lambright <dlambrig@redhat.com> | 2015-02-18 14:49:50 -0500 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2015-03-18 04:47:41 -0700 |
commit | a216745e5db3fdb4fa8d625c971e70f8d0e34d23 (patch) | |
tree | c1165dbc612ec7121bd1734cb9bb006f9ac7c9d3 /libglusterfs | |
parent | 38ccaaf9d1a93c4fc6d733ee3bd5c73e5457bdab (diff) |
cluster/dht: Change the subvolume encoding in d_off to be a "global"
position in the graph rather than relative (local) to a particular
translator.
Encoding the volume in this way allows a single translator to manage
which brick is currently being scanned for directory entries. Using a
single translator minimizes allocated bits in the d_off. It also allows
multiple DHT translators in the same graph to have a common frame of
reference (the graph position) for which brick is being read. Multiple
DHT translators are needed for the Tiering feature.
The fix builds off a previous change (9332) which removed subvolume
encoding from AFR. The fix makes an equivalent change to the EC
translator.
More background can be found in fix 9332 and gluster-dev discussions [1].
DHT and AFR/EC are responsible (as before) for choosing which brick to
enumerate directory entries from over the readdir lifecycle.
The client translator receiving the readdir fop encodes the d_off. That
translator is referred to as the "leaf node" in the graph and corresponds
to the brick being scanned.
When DHT decodes the d_off, it translates the leaf node to a local
subvolume, which represents the next node in the graph leading to
the brick.
Tracking of leaf nodes is done in common utility functions. Leaf node
counts and positional information are updated on a graph switch.
[1] www.gluster.org/pipermail/gluster-devel/2015-January/043592.html
Change-Id: Iaf0ea86d7046b1ceadbad69d88707b243077ebc8
BUG: 1190734
Signed-off-by: Dan Lambright <dlambrig@redhat.com>
Reviewed-on: http://review.gluster.org/9688
Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-by: Krishnan Parthasarathi <kparthas@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Tested-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'libglusterfs')
-rw-r--r-- | libglusterfs/src/gf-dirent.c | 128 | ||||
-rw-r--r-- | libglusterfs/src/gf-dirent.h | 10 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs.h | 2 | ||||
-rw-r--r-- | libglusterfs/src/graph.c | 125 | ||||
-rw-r--r-- | libglusterfs/src/xlator.h | 7 |
5 files changed, 271 insertions, 1 deletions
diff --git a/libglusterfs/src/gf-dirent.c b/libglusterfs/src/gf-dirent.c index f6fd3ab54ee..b5f395afc36 100644 --- a/libglusterfs/src/gf-dirent.c +++ b/libglusterfs/src/gf-dirent.c @@ -21,6 +21,134 @@ #include "compat.h" #include "xlator.h" +#define ONE 1ULL +#define PRESENT_D_OFF_BITS 63 +#define BACKEND_D_OFF_BITS 63 +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define MASK (~0ULL) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) + +static uint64_t +bits_for (uint64_t num) +{ + uint64_t bits = 0, ctrl = 1; + + while (ctrl < num) { + ctrl *= 2; + bits++; + } + + return bits; +} + +int +gf_deitransform(xlator_t *this, + uint64_t offset) +{ + int cnt = 0; + int max = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; + + max = glusterfs_get_leaf_count(this->graph); + + if (max == 1) { + cnt = 0; + goto out; + } + + if (offset & TOP_BIT) { + /* HUGE d_off */ + max_bits = bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + cnt = offset & host_mask; + } else { + /* small d_off */ + cnt = offset % max; + } +out: + return cnt; +} + +uint64_t +gf_dirent_orig_offset(xlator_t *this, + uint64_t offset) +{ + int max = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t orig_offset; + + max = glusterfs_get_leaf_count(this->graph); + + if (max == 1) { + orig_offset = offset; + goto out; + } + + if (offset & TOP_BIT) { + /* HUGE d_off */ + max_bits = bits_for (max); + off_mask = (MASK << max_bits); + orig_offset = ((offset & ~TOP_BIT) & off_mask) << SHIFT_BITS; + } else { + /* small d_off */ + orig_offset = offset / max; + } +out: + return orig_offset; +} + +int +gf_itransform (xlator_t *this, uint64_t x, uint64_t *y_p, int client_id) +{ + int max = 0; + uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } + + if (!x) { + y = 0; + 
goto out; + } + + max = glusterfs_get_leaf_count(this->graph); + + if (max == 1) { + y = x; + goto out; + } + + max_bits = bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | client_id; + } else { + /* small d_off */ + y = ((x * max) + client_id); + } + +out: + if (y_p) + *y_p = y; + + return 0; +} + gf_dirent_t * gf_dirent_for_name (const char *name) { diff --git a/libglusterfs/src/gf-dirent.h b/libglusterfs/src/gf-dirent.h index 4c1ff0b1684..07c605f82b0 100644 --- a/libglusterfs/src/gf-dirent.h +++ b/libglusterfs/src/gf-dirent.h @@ -22,6 +22,16 @@ #define gf_dirent_size(name) (sizeof (gf_dirent_t) + strlen (name) + 1) +int +gf_deitransform(xlator_t *this, uint64_t y); + +int +gf_itransform (xlator_t *this, uint64_t x, uint64_t *y_p, int client_id); + +uint64_t +gf_dirent_orig_offset (xlator_t *this, uint64_t offset); + + struct _dir_entry_t { struct _dir_entry_t *next; char *name; diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index a810f3a81f0..791e6dc5fd8 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -452,6 +452,7 @@ struct _glusterfs_graph { int id; /* Used in logging */ int used; /* Should be set when fuse gets first CHILD_UP */ + uint32_t leaf_count; uint32_t volfile_checksum; }; typedef struct _glusterfs_graph glusterfs_graph_t; @@ -617,6 +618,7 @@ int glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx); int glusterfs_graph_destroy_residual (glusterfs_graph_t *graph); int glusterfs_graph_deactivate (glusterfs_graph_t *graph); int glusterfs_graph_destroy (glusterfs_graph_t *graph); +int glusterfs_get_leaf_count (glusterfs_graph_t *graph); int glusterfs_graph_activate (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx); glusterfs_graph_t *glusterfs_graph_construct (FILE *fp); glusterfs_graph_t *glusterfs_graph_new (); diff --git 
a/libglusterfs/src/graph.c b/libglusterfs/src/graph.c index b427740f10f..709ec3b3ce3 100644 --- a/libglusterfs/src/graph.c +++ b/libglusterfs/src/graph.c @@ -515,15 +515,138 @@ glusterfs_graph_prepare (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx) /* XXX: --xlator-option additions */ gf_add_cmdline_options (graph, &ctx->cmd_args); - return 0; } +static +xlator_t *glusterfs_root(glusterfs_graph_t *graph) +{ + return graph->first; +} + +static +int glusterfs_is_leaf(xlator_t *xl) +{ + int ret = 0; + + if (!xl->children) + ret = 1; + + return ret; +} + +static +uint32_t glusterfs_count_leaves(xlator_t *xl) +{ + int n = 0; + xlator_list_t *list = NULL; + + if (glusterfs_is_leaf(xl)) + n = 1; + else + for (list = xl->children; list; list = list->next) + n += glusterfs_count_leaves(list->xlator); + + return n; +} + +int glusterfs_get_leaf_count(glusterfs_graph_t *graph) +{ + return graph->leaf_count; +} + +static +int _glusterfs_leaf_position(xlator_t *tgt, int *id, xlator_t *xl) +{ + xlator_list_t *list = NULL; + int found = 0; + + if (xl == tgt) + found = 1; + else if (glusterfs_is_leaf(xl)) + *id += 1; + else + for (list = xl->children; !found && list; list = list->next) + found = _glusterfs_leaf_position(tgt, id, list->xlator); + + return found; +} + +int glusterfs_leaf_position(xlator_t *tgt) +{ + xlator_t *root = NULL; + int pos = 0; + + root = glusterfs_root(tgt->graph); + + if (!_glusterfs_leaf_position(tgt, &pos, root)) + pos = -1; + + return pos; +} + +static int +_glusterfs_reachable_leaves(xlator_t *base, xlator_t *xl, dict_t *leaves) +{ + xlator_list_t *list = NULL; + int err = 1; + int pos = 0; + char strpos[6]; + + if (glusterfs_is_leaf(xl)) { + pos = glusterfs_leaf_position(xl); + if (pos < 0) + goto out; + sprintf(strpos, "%d", pos); + + err = dict_set_static_ptr(leaves, strpos, base); + + } else { + for (err = 0, list = xl->children; + !err && list; + list = list->next) + err = _glusterfs_reachable_leaves(base, list->xlator, + leaves); + } + +out: + 
return err; +} + +/* + * This function determines which leaves are children (or grandchildren) + * of the given base. The base may have multiple sub volumes. Each sub + * volumes in turn may have sub volumes.. until the leaves are reached. + * Each leaf is numbered 1,2,3,...etc. + * + * The base translator calls this function to see which of *its* subvolumes + * it would forward an FOP to, to *get to* a particular leaf. + * That information is built into the "leaves" dictionary. + * key:destination leaf# -> value:base subvolume xlator. + */ + +int +glusterfs_reachable_leaves(xlator_t *base, dict_t *leaves) +{ + xlator_list_t *list = NULL; + int err = 0; + + for (list = base->children; !err && list; list = list->next) + err = _glusterfs_reachable_leaves(list->xlator, + list->xlator, leaves); + + return err; +} int glusterfs_graph_activate (glusterfs_graph_t *graph, glusterfs_ctx_t *ctx) { int ret = 0; + xlator_t *root = NULL; + + root = glusterfs_root(graph); + + graph->leaf_count = glusterfs_count_leaves(root); /* XXX: all xlator options validation */ ret = glusterfs_graph_validate_options (graph); diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h index 733f6cf47ab..5a71ceb3f31 100644 --- a/libglusterfs/src/xlator.h +++ b/libglusterfs/src/xlator.h @@ -978,4 +978,11 @@ glusterfs_volfile_reconfigure (int oldvollen, FILE *newvolfile_fp, int loc_touchup (loc_t *loc, const char *name); + +int +glusterfs_leaf_position(xlator_t *tgt); + +int +glusterfs_reachable_leaves(xlator_t *base, dict_t *leaves); + #endif /* _XLATOR_H */ |