diff options
Diffstat (limited to 'xlators/cluster/dht/src/dht-helper.c')
| -rw-r--r-- | xlators/cluster/dht/src/dht-helper.c | 461 |
1 files changed, 341 insertions, 120 deletions
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index ff540013e..311a48112 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #ifndef _CONFIG_H @@ -27,6 +18,28 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) @@ -49,6 +62,43 @@ dht_frame_return (call_frame_t *frame) } +static uint64_t +dht_bits_for (uint64_t num) +{ + uint64_t bits = 0, ctrl = 1; + + while (ctrl < num) { + ctrl *= 2; + bits ++; + } + + return bits; +} + +/* + * A slightly "updated" version of the algorithm described in the commit log + * is used here. + * + * The only enhancement is that: + * + * - The number of bits used by the backend filesystem for HUGE d_off which + * is described as 63, and + * - The number of bits used by the d_off presented by the transformation + * upwards which is described as 64, are both made "configurable." + */ + + +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 + +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) + +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) + int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) { @@ -56,6 +106,9 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) int cnt = 0; int max = 0; uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; if (x == ((uint64_t) -1)) { y = (uint64_t) -1; @@ -69,7 +122,23 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) max = conf->subvolume_cnt; cnt = dht_subvol_cnt (this, subvol); - y = ((x * max) + cnt); + if (max == 1) { + y = x; + goto out; + } + + max_bits = dht_bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); + } out: if (y_p) @@ -89,7 +158,7 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, int ret = 0; /* not found */ /* Why do other tasks if first required 'char' itself is not there */ - if (!loc->name || !strchr (loc->name, '@')) + if (!new_loc || !loc || !loc->name || !strchr (loc->name, '@')) goto out; trav = this->children; @@ -129,10 +198,8 @@ dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc, out: if (!ret) { /* !success */ - if (new_path) - GF_FREE (new_path); - if (new_name) - GF_FREE (new_name); + GF_FREE (new_path); + GF_FREE (new_name); } return ret; } @@ -146,16 +213,38 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, int max = 0; uint64_t x = 0; xlator_t *subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; if (!this->private) - goto out; + return -1; conf = this->private; max = conf->subvolume_cnt; - cnt = y % max; - x = y / max; + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = dht_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } +out: subvol = conf->subvolumes[cnt]; if (subvol_p) @@ -164,7 +253,6 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, if (x_p) *x_p = x; -out: return 0; } @@ -215,16 +303,11 @@ dht_local_wipe (xlator_t *this, dht_local_t *local) local->selfheal.layout = NULL; } - if (local->newpath) { - GF_FREE (local->newpath); - } + GF_FREE (local->newpath); - if (local->key) { - GF_FREE (local->key); - } + GF_FREE (local->key); - if (local->rebalance.vector) - GF_FREE (local->rebalance.vector); + GF_FREE (local->rebalance.vector); if (local->rebalance.iobref) iobref_unref (local->rebalance.iobref); @@ -279,20 +362,6 @@ out: return local; } - -char * -basestr (const char *str) -{ - char *basestr = NULL; - - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; - - return basestr; -} - - xlator_t * dht_first_up_subvol (xlator_t *this) { @@ -444,7 +513,36 @@ out: return next; } +/* This func wraps around, if prev is actually the last subvol. + */ +xlator_t * +dht_subvol_next_available (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + + conf = this->private; + if (!conf) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } + +out: + return next; +} int dht_subvol_cnt (xlator_t *this, xlator_t *subvol) { @@ -636,20 +734,36 @@ dht_migration_complete_check_task (void *data) call_frame_t *frame = NULL; loc_t tmp_loc = {0,}; char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; this = THIS; frame = data; local = frame->local; + conf = this->private; src_node = local->cached_subvol; - /* getxattr on cached_subvol for 'linkto' value */ - if (!local->loc.inode) + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + + if (!local->loc.inode) { ret = syncop_fgetxattr (src_node, local->fd, &dict, - DHT_LINKFILE_KEY); - else + conf->link_xattr_name); + } else { + SYNCTASK_SETID (0, 0); ret = syncop_getxattr (src_node, &local->loc, &dict, - DHT_LINKFILE_KEY); + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } if (!ret) dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); @@ -700,10 +814,7 @@ dht_migration_complete_check_task (void *data) /* update inode ctx (the layout) */ dht_layout_unref (this, local->layout); - if (!local->loc.inode) - ret = dht_layout_preset (this, dst_node, local->fd->inode); - else - ret = dht_layout_preset (this, dst_node, local->loc.inode); + ret = dht_layout_preset (this, dst_node, inode); if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "%s: could not set preset layout for subvol %s", @@ -721,10 +832,7 @@ dht_migration_complete_check_task (void *data) goto out; } - if (!local->loc.inode) - ret = dht_layout_set (this, local->fd->inode, layout); - else - ret = dht_layout_set (this, local->loc.inode, layout); + ret = dht_layout_set (this, inode, layout); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set the new layout", @@ -735,43 +843,46 @@ dht_migration_complete_check_task (void *data) local->cached_subvol = dst_node; ret = 0; - if (!local->fd) + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) goto out; - /* once we detect the migration complete, the fd-ctx is no more - required.. delete the ctx, and do one extra 'fd_unref' for open fd */ - ret = fd_ctx_del (local->fd, this, NULL); - if (!ret) { - fd_unref (local->fd); - ret = 0; + if (list_empty (&inode->fd_list)) goto out; - } - /* if 'local->fd' (ie, fd based operation), send a 'open()' on - destination if not already done */ - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; - ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - if (path) - GF_FREE (path); + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + GF_FREE (path); + + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + if (open_failed) { + ret = -1; goto out; } - - /* need this unref for the fd on src_node */ - fd_unref (local->fd); ret = 0; out: @@ -782,11 +893,8 @@ int dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame) { int ret = -1; - dht_conf_t *conf = NULL; - - conf = this->private; - ret = synctask_new (conf->env, dht_migration_complete_check_task, + ret = synctask_new (this->ctx->env, dht_migration_complete_check_task, dht_migration_complete_check_done, frame, frame); return ret; @@ -818,20 +926,34 @@ dht_rebalance_inprogress_task (void *data) char *path = NULL; struct iatt stbuf = {0,}; loc_t tmp_loc = {0,}; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; this = THIS; frame = data; local = frame->local; + conf = this->private; src_node = local->cached_subvol; - /* getxattr on cached_subvol for 'linkto' value */ - if (local->loc.inode) + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + if (local->loc.inode) { + SYNCTASK_SETID (0, 0); ret = syncop_getxattr (src_node, &local->loc, &dict, - DHT_LINKFILE_KEY); - else + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } else { ret = syncop_fgetxattr (src_node, local->fd, &dict, - DHT_LINKFILE_KEY); + conf->link_xattr_name); + } if (ret) { gf_log (this->name, GF_LOG_ERROR, @@ -873,34 +995,46 @@ dht_rebalance_inprogress_task (void *data) ret = 0; - if (!local->fd) - goto out; + if (list_empty (&inode->fd_list)) + goto done; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID (0, 0); + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - if (path) - GF_FREE (path); + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; goto out; } - ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node); +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set fd-ctx target file at %s", + "%s: failed to set inode-ctx target file at %s", local->loc.path, dst_node->name); goto out; } @@ -915,12 +1049,99 @@ dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame) { int ret = -1; - dht_conf_t *conf = NULL; - conf = this->private; - - ret = synctask_new (conf->env, dht_rebalance_inprogress_task, + ret = synctask_new (this->ctx->env, dht_rebalance_inprogress_task, dht_inprogress_check_done, frame, frame); return ret; } + +int +dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this, + dht_layout_t *layout_int) +{ + dht_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = dht_inode_ctx_get (inode, this, &ctx); + if (!ret && ctx) { + ctx->layout = layout_int; + } else { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return ret; + ctx->layout = layout_int; + } + + ret = dht_inode_ctx_set (inode, this, ctx); + + return ret; +} + +int +dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, + int32_t post) +{ + dht_inode_ctx_t *ctx = NULL; + dht_stat_time_t *time = 0; + int ret = -1; + + GF_VALIDATE_OR_GOTO (this->name, stat, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = dht_inode_ctx_get (inode, this, &ctx); + + if (ret) { + ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t); + if (!ctx) + return -1; + } + + time = &ctx->time; + + DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, + stat->ia_mtime, stat->ia_mtime_nsec, inode, post); + DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, + stat->ia_ctime, stat->ia_ctime_nsec, inode, post); + DHT_UPDATE_TIME(time->atime, time->atime_nsec, + stat->ia_atime, stat->ia_atime_nsec, inode, post); + + ret = dht_inode_ctx_set (inode, this, ctx); +out: + return 0; +} + +int +dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = inode_ctx_get (inode, this, &ctx_int); + + if (ret) + return ret; + + if (ctx) + *ctx = (dht_inode_ctx_t *) ctx_int; +out: + return ret; +} + +int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx) +{ + int ret = -1; + uint64_t ctx_int = 0; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO (this->name, ctx, out); + + ctx_int = (long)ctx; + ret = inode_ctx_set (inode, this, &ctx_int); +out: + return ret; +} |
