summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
author Anand Avati <avati@redhat.com> 2014-12-23 10:04:00 -0800
committer Pranith Kumar Karampuri <pkarampu@redhat.com> 2014-12-26 06:21:44 -0800
commit 7926fe6f7df664bbe5e050a8e66240dd67155eec (patch)
tree 6a63fc4b9804dcff58faf809a65dbff8b6f7c61c /xlators/cluster
parent 79a5b2b991037cc1db5c71f7aa2a32ce712a22e9 (diff)
afr: stop encoding subvolume id in readdir d_off
The purpose of encoding d_off in AFR is to indicate the selected subvolume for the first readdir, and to continue all further readdirs of the session on the same subvolume. This is required because, unlike file offsets, directory d_offs are specific to the backend and cannot be re-used on another subvolume. The d_off transformation encodes the subvolume id and prevents such invalid use of d_offs on other servers. However, this approach could be quite wasteful of precious d_off bit-space. Unlike DHT, where the server id can change from entry to entry and thus encoding the server id in the transformed d_off is necessary, we could take a slightly relaxed approach in AFR. The approach is to save, in the fd_ctx, the subvolume to which the last readdir request was sent. This consumes constant space (i.e., no per-entry cache), and serves the purpose of avoiding d_off "misuse" (i.e., using a d_off from one server on another). The compromise here is NFS resuming readdir from a non-0 cookie after an extended delay (either the anonymous FD has been reclaimed, or the server has restarted). In such cases a subvolume is picked afresh. To make this fresh picking more deterministic (i.e., to pick the same subvolume whenever possible, even after reboots), the function afr_hash_child (used by afr_read_subvol_select_by_policy) is modified to skip all dynamic inputs (i.e., PID) for the case of directories. Change-Id: I46ad95feaeb21fb811b7e8d772866a646330c9d8 BUG: 1163161 Signed-off-by: Anand Avati <avati@redhat.com> Reviewed-on: http://review.gluster.org/9332 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- xlators/cluster/afr/src/afr-common.c 4
-rw-r--r-- xlators/cluster/afr/src/afr-dir-read.c 144
-rw-r--r-- xlators/cluster/afr/src/afr.h 6
3 files changed, 23 insertions, 131 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 5d6737f72df..2cbd0ce4c90 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -741,7 +741,7 @@ afr_hash_child (inode_t *inode, int32_t child_count, int hashmode)
uuid_copy (gfid_copy, inode->gfid);
}
- if (hashmode > 1) {
+ if (hashmode > 1 && inode->ia_type != IA_IFDIR) {
/*
* Why getpid? Because it's one of the cheapest calls
* available - faster than gethostname etc. - and returns a
@@ -2218,6 +2218,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto out;
}
+ fd_ctx->readdir_subvol = -1;
+
pthread_mutex_init (&fd_ctx->delay_lock, NULL);
INIT_LIST_HEAD (&fd_ctx->eager_locked);
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 41f5e60032d..af6a1787593 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -124,132 +124,6 @@ out:
}
-#define BACKEND_D_OFF_BITS 63
-#define PRESENT_D_OFF_BITS 63
-
-#define ONE 1ULL
-#define MASK (~0ULL)
-#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS))
-#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS))
-
-#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1))
-#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1)))
-
-static uint64_t
-afr_bits_for (uint64_t num)
-{
- uint64_t bits = 0, ctrl = 1;
-
- while (ctrl < num) {
- ctrl *= 2;
- bits ++;
- }
-
- return bits;
-}
-
-int
-afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p)
-{
- afr_private_t *conf = NULL;
- int cnt = 0;
- int max = 0;
- uint64_t y = 0;
- uint64_t hi_mask = 0;
- uint64_t off_mask = 0;
- int max_bits = 0;
-
- if (x == ((uint64_t) -1)) {
- y = (uint64_t) -1;
- goto out;
- }
-
- conf = this->private;
- if (!conf)
- goto out;
-
- max = conf->child_count;
- cnt = subvol;
-
- if (max == 1) {
- y = x;
- goto out;
- }
-
- max_bits = afr_bits_for (max);
-
- hi_mask = ~(PRESENT_MASK >> (max_bits + 1));
-
- if (x & hi_mask) {
- /* HUGE d_off */
- off_mask = MASK << max_bits;
- y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt;
- } else {
- /* small d_off */
- y = ((x * max) + cnt);
- }
-
-out:
- if (y_p)
- *y_p = y;
-
- return 0;
-}
-
-
-int
-afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p,
- uint64_t *x_p)
-{
- afr_private_t *conf = NULL;
- int cnt = 0;
- int max = 0;
- uint64_t x = 0;
- int subvol = 0;
- int max_bits = 0;
- uint64_t off_mask = 0;
- uint64_t host_mask = 0;
-
- if (!this->private)
- return -1;
-
- conf = this->private;
- max = conf->child_count;
-
- if (max == 1) {
- x = y;
- cnt = 0;
- goto out;
- }
-
- if (y & TOP_BIT) {
- /* HUGE d_off */
- max_bits = afr_bits_for (max);
- off_mask = (MASK << max_bits);
- host_mask = ~(off_mask);
-
- x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS;
-
- cnt = y & host_mask;
- } else {
- /* small d_off */
- cnt = y % max;
- x = y / max;
- }
-
-out:
- subvol = cnt;
-
- if (subvol_p)
- *subvol_p = subvol;
-
- if (x_p)
- *x_p = x;
-
- return 0;
-}
-
-
static void
afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol,
gf_dirent_t *entries, fd_t *fd)
@@ -273,7 +147,6 @@ afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol,
}
list_del_init (&entry->list);
- afr_itransform (THIS, subvol, entry->d_off, &entry->d_off);
list_add_tail (&entry->list, &entries->list);
if (entry->inode) {
@@ -333,9 +206,11 @@ afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
priv = this->private;
local = frame->local;
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
if (subvol == -1) {
AFR_STACK_UNWIND (readdir, frame, local->op_ret,
@@ -343,6 +218,8 @@ afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
return 0;
}
+ fd_ctx->readdir_subvol = subvol;
+
if (local->op == GF_FOP_READDIR)
STACK_WIND_COOKIE (frame, afr_readdir_cbk,
(void *) (long) subvol,
@@ -370,18 +247,27 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
afr_local_t *local = NULL;
int32_t op_errno = 0;
int subvol = -1;
+ afr_fd_ctx_t *fd_ctx = NULL;
local = AFR_FRAME_INIT (frame, op_errno);
if (!local)
goto out;
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
local->op = whichop;
local->fd = fd_ref (fd);
local->cont.readdir.size = size;
local->cont.readdir.offset = offset;
local->xdata_req = (dict)? dict_ref (dict) : NULL;
- if (offset == 0) {
+ subvol = fd_ctx->readdir_subvol;
+
+ if (offset == 0 || subvol == -1) {
/* First readdir has option of failing over and selecting
an appropriate read subvolume */
afr_read_txn (frame, this, fd->inode, afr_readdir_wind,
@@ -389,8 +275,6 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
} else {
/* But continued readdirs MUST stick to the same subvolume
without an option to failover */
- afr_deitransform (this, offset, &subvol,
- (uint64_t *)&local->cont.readdir.offset);
afr_readdir_wind (frame, this, subvol);
}
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 7e138c54ec0..4044fd59d4e 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -301,6 +301,12 @@ typedef struct {
/* list of frames currently in progress */
struct list_head eager_locked;
+
+ /* the subvolume on which the latest sequence of readdirs (starting
+ at offset 0) has begun. Till the next readdir request with 0 offset
+ arrives, we continue to read off this subvol.
+ */
+ int readdir_subvol;
} afr_fd_ctx_t;