diff options
author | Anand Avati <avati@redhat.com> | 2014-12-23 10:04:00 -0800 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2014-12-26 06:21:44 -0800 |
commit | 7926fe6f7df664bbe5e050a8e66240dd67155eec (patch) | |
tree | 6a63fc4b9804dcff58faf809a65dbff8b6f7c61c /xlators/cluster | |
parent | 79a5b2b991037cc1db5c71f7aa2a32ce712a22e9 (diff) |
afr: stop encoding subvolume id in readdir d_off
The purpose of encoding d_off in AFR is to indicate the
selected subvolume for the first readdir, and continue all
further readdirs of the session on the same subvolume. This is
required because, unlike files, dir d_offs are specific to the
backend and cannot be re-used on another subvolume. The d_off
transformation encodes the subvolume id and prevents such
invalid use of d_offs on other servers.
However, this approach could be quite wasteful of precious d_off
bit-space. Unlike DHT, where server id can change from entry to
entry and thus encoding the server id in the transformed d_off
is necessary, we could take a slightly relaxed approach in AFR.
The approach is to save, in the fd_ctx, the subvolume to which the
last readdir request was sent. This consumes constant space (i.e.,
no per-entry cache) and serves the purpose of avoiding d_off
"misuse" (i.e., using a d_off from one server on another).
The compromise here is the case of NFS resuming readdir from a
non-zero cookie after an extended delay (either the anonymous FD has
been reclaimed, or the server has restarted). In such cases a
subvolume is picked afresh. To make this fresh pick more
deterministic (i.e., to pick the same subvolume whenever possible,
even after reboots), the function afr_hash_child (used by
afr_read_subvol_select_by_policy) is modified to skip all dynamic
inputs (i.e., the PID) for the case of directories.
Change-Id: I46ad95feaeb21fb811b7e8d772866a646330c9d8
BUG: 1163161
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/9332
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 4 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-dir-read.c | 144 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 6 |
3 files changed, 23 insertions, 131 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 5d6737f72df..2cbd0ce4c90 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -741,7 +741,7 @@ afr_hash_child (inode_t *inode, int32_t child_count, int hashmode) uuid_copy (gfid_copy, inode->gfid); } - if (hashmode > 1) { + if (hashmode > 1 && inode->ia_type != IA_IFDIR) { /* * Why getpid? Because it's one of the cheapest calls * available - faster than gethostname etc. - and returns a @@ -2218,6 +2218,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } + fd_ctx->readdir_subvol = -1; + pthread_mutex_init (&fd_ctx->delay_lock, NULL); INIT_LIST_HEAD (&fd_ctx->eager_locked); diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 41f5e60032d..af6a1787593 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -124,132 +124,6 @@ out: } -#define BACKEND_D_OFF_BITS 63 -#define PRESENT_D_OFF_BITS 63 - -#define ONE 1ULL -#define MASK (~0ULL) -#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) -#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) - -#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) -#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) - -static uint64_t -afr_bits_for (uint64_t num) -{ - uint64_t bits = 0, ctrl = 1; - - while (ctrl < num) { - ctrl *= 2; - bits ++; - } - - return bits; -} - -int -afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p) -{ - afr_private_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t y = 0; - uint64_t hi_mask = 0; - uint64_t off_mask = 0; - int max_bits = 0; - - if (x == ((uint64_t) -1)) { - y = (uint64_t) -1; - goto out; - } - - conf = this->private; - if (!conf) - goto out; - - max = conf->child_count; - cnt = subvol; - - if (max == 1) { - y = x; - goto out; - } - - max_bits = afr_bits_for (max); - - hi_mask = ~(PRESENT_MASK >> (max_bits + 
1)); - - if (x & hi_mask) { - /* HUGE d_off */ - off_mask = MASK << max_bits; - y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; - } else { - /* small d_off */ - y = ((x * max) + cnt); - } - -out: - if (y_p) - *y_p = y; - - return 0; -} - - -int -afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p, - uint64_t *x_p) -{ - afr_private_t *conf = NULL; - int cnt = 0; - int max = 0; - uint64_t x = 0; - int subvol = 0; - int max_bits = 0; - uint64_t off_mask = 0; - uint64_t host_mask = 0; - - if (!this->private) - return -1; - - conf = this->private; - max = conf->child_count; - - if (max == 1) { - x = y; - cnt = 0; - goto out; - } - - if (y & TOP_BIT) { - /* HUGE d_off */ - max_bits = afr_bits_for (max); - off_mask = (MASK << max_bits); - host_mask = ~(off_mask); - - x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; - - cnt = y & host_mask; - } else { - /* small d_off */ - cnt = y % max; - x = y / max; - } - -out: - subvol = cnt; - - if (subvol_p) - *subvol_p = subvol; - - if (x_p) - *x_p = x; - - return 0; -} - - static void afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, gf_dirent_t *entries, fd_t *fd) @@ -273,7 +147,6 @@ afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, } list_del_init (&entry->list); - afr_itransform (THIS, subvol, entry->d_off, &entry->d_off); list_add_tail (&entry->list, &entries->list); if (entry->inode) { @@ -333,9 +206,11 @@ afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; priv = this->private; local = frame->local; + fd_ctx = afr_fd_ctx_get (local->fd, this); if (subvol == -1) { AFR_STACK_UNWIND (readdir, frame, local->op_ret, @@ -343,6 +218,8 @@ afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) return 0; } + fd_ctx->readdir_subvol = subvol; + if (local->op == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, (void *) (long) subvol, @@ -370,18 +247,27 @@ 
afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, afr_local_t *local = NULL; int32_t op_errno = 0; int subvol = -1; + afr_fd_ctx_t *fd_ctx = NULL; local = AFR_FRAME_INIT (frame, op_errno); if (!local) goto out; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + op_errno = EINVAL; + goto out; + } + local->op = whichop; local->fd = fd_ref (fd); local->cont.readdir.size = size; local->cont.readdir.offset = offset; local->xdata_req = (dict)? dict_ref (dict) : NULL; - if (offset == 0) { + subvol = fd_ctx->readdir_subvol; + + if (offset == 0 || subvol == -1) { /* First readdir has option of failing over and selecting an appropriate read subvolume */ afr_read_txn (frame, this, fd->inode, afr_readdir_wind, @@ -389,8 +275,6 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, } else { /* But continued readdirs MUST stick to the same subvolume without an option to failover */ - afr_deitransform (this, offset, &subvol, - (uint64_t *)&local->cont.readdir.offset); afr_readdir_wind (frame, this, subvol); } diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 7e138c54ec0..4044fd59d4e 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -301,6 +301,12 @@ typedef struct { /* list of frames currently in progress */ struct list_head eager_locked; + + /* the subvolume on which the latest sequence of readdirs (starting + at offset 0) has begun. Till the next readdir request with 0 offset + arrives, we continue to read off this subvol. + */ + int readdir_subvol; } afr_fd_ctx_t; |