summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Anand Avati <avati@redhat.com> 2014-12-23 10:04:00 -0800
committer: Raghavendra Bhat <raghavendra@redhat.com> 2015-03-03 23:38:41 -0800
commit: f396e475417aa52daf49e4564c67628cc8f0e598 (patch)
tree: 1624d242eb4b08820c1ef3a7052a9cb8625e8479
parent: b887c4ee9338215ce11aa350c97fcc6f133fcce7 (diff)
afr: stop encoding subvolume id in readdir d_off
Backport of http://review.gluster.org/9332

The purpose of encoding d_off in AFR is to indicate the selected subvolume for the first readdir, and continue all further readdirs of the session on the same subvolume. This is required because, unlike files, dir d_offs are specific to the backend and cannot be re-used on another subvolume. The d_off transformation encodes the subvolume id and prevents such invalid use of d_offs on other servers.

However, this approach could be quite wasteful of precious d_off bit-space. Unlike DHT, where the server id can change from entry to entry and thus encoding the server id in the transformed d_off is necessary, we could take a slightly relaxed approach in AFR.

The approach is to save the subvolume where the last readdir request was sent in the fd_ctx. This consumes constant space (i.e., no per-entry cache), and serves the purpose of avoiding d_off "misuse" (i.e., using a d_off from one server on another).

The compromise here is NFS resuming readdir from a non-0 cookie after an extended delay (either the anonymous FD has been reclaimed, or the server has restarted). In such cases a subvolume is picked freshly. To make this fresh picking more deterministic (i.e., to pick the same subvolume whenever possible, even after reboots), the function afr_hash_child (used by afr_read_subvol_select_by_policy) is modified to skip all dynamic inputs (i.e., PID) for the case of directories.

BUG: 1191537
Change-Id: I7e3bd8dfe346a9a8e428d7ddeada6cfb66e64e54
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.org/9638
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Raghavendra Bhat <raghavendra@redhat.com>
-rw-r--r--  xlators/cluster/afr/src/afr-common.c      4
-rw-r--r--  xlators/cluster/afr/src/afr-dir-read.c  144
-rw-r--r--  xlators/cluster/afr/src/afr.h             6
3 files changed, 23 insertions, 131 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index d8da5edccc5..69e13078652 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -739,7 +739,7 @@ afr_hash_child (inode_t *inode, int32_t child_count, int hashmode)
uuid_copy (gfid_copy, inode->gfid);
}
- if (hashmode > 1) {
+ if (hashmode > 1 && inode->ia_type != IA_IFDIR) {
/*
* Why getpid? Because it's one of the cheapest calls
* available - faster than gethostname etc. - and returns a
@@ -2264,6 +2264,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto out;
}
+ fd_ctx->readdir_subvol = -1;
+
pthread_mutex_init (&fd_ctx->delay_lock, NULL);
INIT_LIST_HEAD (&fd_ctx->eager_locked);
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 3b3d3093c5d..28bf89f2842 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -157,132 +157,6 @@ afr_validate_read_subvol (inode_t *inode, xlator_t *this, int par_read_subvol)
}
-#define BACKEND_D_OFF_BITS 63
-#define PRESENT_D_OFF_BITS 63
-
-#define ONE 1ULL
-#define MASK (~0ULL)
-#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS))
-#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS))
-
-#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1))
-#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1)))
-
-static uint64_t
-afr_bits_for (uint64_t num)
-{
- uint64_t bits = 0, ctrl = 1;
-
- while (ctrl < num) {
- ctrl *= 2;
- bits ++;
- }
-
- return bits;
-}
-
-int
-afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p)
-{
- afr_private_t *conf = NULL;
- int cnt = 0;
- int max = 0;
- uint64_t y = 0;
- uint64_t hi_mask = 0;
- uint64_t off_mask = 0;
- int max_bits = 0;
-
- if (x == ((uint64_t) -1)) {
- y = (uint64_t) -1;
- goto out;
- }
-
- conf = this->private;
- if (!conf)
- goto out;
-
- max = conf->child_count;
- cnt = subvol;
-
- if (max == 1) {
- y = x;
- goto out;
- }
-
- max_bits = afr_bits_for (max);
-
- hi_mask = ~(PRESENT_MASK >> (max_bits + 1));
-
- if (x & hi_mask) {
- /* HUGE d_off */
- off_mask = MASK << max_bits;
- y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt;
- } else {
- /* small d_off */
- y = ((x * max) + cnt);
- }
-
-out:
- if (y_p)
- *y_p = y;
-
- return 0;
-}
-
-
-int
-afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p,
- uint64_t *x_p)
-{
- afr_private_t *conf = NULL;
- int cnt = 0;
- int max = 0;
- uint64_t x = 0;
- int subvol = 0;
- int max_bits = 0;
- uint64_t off_mask = 0;
- uint64_t host_mask = 0;
-
- if (!this->private)
- return -1;
-
- conf = this->private;
- max = conf->child_count;
-
- if (max == 1) {
- x = y;
- cnt = 0;
- goto out;
- }
-
- if (y & TOP_BIT) {
- /* HUGE d_off */
- max_bits = afr_bits_for (max);
- off_mask = (MASK << max_bits);
- host_mask = ~(off_mask);
-
- x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS;
-
- cnt = y & host_mask;
- } else {
- /* small d_off */
- cnt = y % max;
- x = y / max;
- }
-
-out:
- subvol = cnt;
-
- if (subvol_p)
- *subvol_p = subvol;
-
- if (x_p)
- *x_p = x;
-
- return 0;
-}
-
-
static void
afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol,
gf_dirent_t *entries, fd_t *fd)
@@ -301,7 +175,6 @@ afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol,
}
list_del_init (&entry->list);
- afr_itransform (THIS, subvol, entry->d_off, &entry->d_off);
list_add_tail (&entry->list, &entries->list);
if (entry->inode) {
@@ -356,9 +229,11 @@ afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
priv = this->private;
local = frame->local;
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
if (subvol == -1) {
AFR_STACK_UNWIND (readdir, frame, local->op_ret,
@@ -366,6 +241,8 @@ afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
return 0;
}
+ fd_ctx->readdir_subvol = subvol;
+
if (local->op == GF_FOP_READDIR)
STACK_WIND_COOKIE (frame, afr_readdir_cbk,
(void *) (long) subvol,
@@ -393,18 +270,27 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
afr_local_t *local = NULL;
int32_t op_errno = 0;
int subvol = -1;
+ afr_fd_ctx_t *fd_ctx = NULL;
local = AFR_FRAME_INIT (frame, op_errno);
if (!local)
goto out;
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
local->op = whichop;
local->fd = fd_ref (fd);
local->cont.readdir.size = size;
local->cont.readdir.offset = offset;
local->xdata_req = (dict)? dict_ref (dict) : NULL;
- if (offset == 0) {
+ subvol = fd_ctx->readdir_subvol;
+
+ if (offset == 0 || subvol == -1) {
/* First readdir has option of failing over and selecting
an appropriate read subvolume */
afr_read_txn (frame, this, fd->inode, afr_readdir_wind,
@@ -412,8 +298,6 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
} else {
/* But continued readdirs MUST stick to the same subvolume
without an option to failover */
- afr_deitransform (this, offset, &subvol,
- (uint64_t *)&local->cont.readdir.offset);
afr_readdir_wind (frame, this, subvol);
}
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 7e138c54ec0..4044fd59d4e 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -301,6 +301,12 @@ typedef struct {
/* list of frames currently in progress */
struct list_head eager_locked;
+
+ /* the subvolume on which the latest sequence of readdirs (starting
+ at offset 0) has begun. Till the next readdir request with 0 offset
+ arrives, we continue to read off this subvol.
+ */
+ int readdir_subvol;
} afr_fd_ctx_t;