summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/dht/src/dht-helper.c
diff options
context:
space:
mode:
authorAmar Tumballi <amar@gluster.com>2011-09-07 12:53:09 +0530
committerVijay Bellur <vijay@gluster.com>2011-09-12 23:57:38 -0700
commita07bb18c8adeb8597f62095c5d1361c5bad01f09 (patch)
tree55bf7ef630793cfd11fc15e84d859cbfb2184b01 /xlators/cluster/dht/src/dht-helper.c
parent09eeaf4e68c225b8e5ccc0a9b4f10f8c4748e205 (diff)
distribute rebalance: handle the open file migration
Complexity involved: To migrate a file with open fd, we have to notify the other client process which has the open fd, and make sure the write()s happening on that fd is properly synced to the migrated file. Once the migration is complete, the client process which has open-fd should get notified and it should start performing all the operations on the new subvolume, instead of earlier cached volume. How to solve the notification part: We can overload the 'postbuf' attribute in the _cbk() function to understand if a file is 'under-migration' or 'migration-complete' state. (This will be something similar to deciding whether a file is DHT-linkfile by its 'mode'). Overall change includes below mentioned major changes: 1. dht_linkfile is decided by only 2 factors (mode(01000), xattr(trusted.glusterfs.dht.linkto)), instead of earlier 3 factors (size==0) 2. in linkfile self-heal part (in 'dht_lookup_everywhere_cbk()'), don't delete a linkfile if there is a open-fd on it. It means, there may be a migration in progress. 3. if a file's revalidate fails with ENOENT, it may be due to file migration, and hence need a lookup_everywhere() 4. There will be 2 phases of file-migration. -> Phase 1: Migration in progress * The source data file will have SGID and STICKY bit set in its mode. * The source data file will have a 'linkto' xattr pointing the destination. * Destination file will have mode set to '01000', and 'linkto' xattr set to itself. -> Phase 2: File migration Complete * The source data file will have mode '01000', and will be 'truncated' to size 0. * The destination file will have inherited mode from the source. (without sgid and sticky bit) and its 'linkto' attribute will be removed. 4. Changes in distribute to work smoothly with a file which is in migration / got migrated. The 'fops' are divided into 3 categories, inode-read, inode-write and others. inode-read fops need to handle only 'phase 2' notification, where as, the inode-write fops need to handle both 'phase 1' and phase2. The inode-write operations will be done on source file, and if any of 'file-migration' procedures are detected in _cbk(), then the operations should be performed on the destination too. when a phase-2 is detected, then the inode-ctx itself should be changed to represent a new layout. With these changes, the open file migration will work smoothly with multiple clients. Change-Id: I512408463814e650f34c62ed009bf2101d016fd6 BUG: 3071 Reviewed-on: http://review.gluster.com/209 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vijay@gluster.com>
Diffstat (limited to 'xlators/cluster/dht/src/dht-helper.c')
-rw-r--r--xlators/cluster/dht/src/dht-helper.c416
1 files changed, 410 insertions, 6 deletions
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 44ed9c682..99abe023b 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -224,27 +224,60 @@ dht_local_wipe (xlator_t *this, dht_local_t *local)
GF_FREE (local->key);
}
+ if (local->rebalance.vector)
+ GF_FREE (local->rebalance.vector);
+
+ if (local->rebalance.iobref)
+ iobref_unref (local->rebalance.iobref);
+
GF_FREE (local);
}
dht_local_t *
-dht_local_init (call_frame_t *frame)
+dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop)
{
dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ int ret = 0;
/* TODO: use mem-pool */
- local = GF_CALLOC (1, sizeof (*local),
- gf_dht_mt_dht_local_t);
-
+ local = GF_CALLOC (1, sizeof (*local), gf_dht_mt_dht_local_t);
if (!local)
- return NULL;
+ goto out;
+
+ if (loc) {
+ ret = loc_copy (&local->loc, loc);
+ if (ret)
+ goto out;
- local->op_ret = -1;
+ inode = loc->inode;
+ }
+
+ if (fd) {
+ local->fd = fd_ref (fd);
+ if (!inode)
+ inode = fd->inode;
+ }
+
+ local->op_ret = -1;
local->op_errno = EUCLEAN;
+ local->fop = fop;
+
+ if (inode) {
+ local->layout = dht_layout_get (frame->this, inode);
+ local->cached_subvol = dht_subvol_get_cached (frame->this,
+ inode);
+ }
frame->local = local;
+out:
+ if (ret) {
+ if (local)
+ GF_FREE (local);
+ local = NULL;
+ }
return local;
}
@@ -496,3 +529,374 @@ err:
loc_wipe (child);
return -1;
}
+
+
+
+int
+dht_init_subvolumes (xlator_t *this, dht_conf_t *conf)
+{
+ xlator_list_t *subvols = NULL;
+ int cnt = 0;
+
+ if (!conf)
+ return -1;
+
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ cnt++;
+
+ conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *),
+ gf_dht_mt_xlator_t);
+ if (!conf->subvolumes) {
+ return -1;
+ }
+ conf->subvolume_cnt = cnt;
+
+ cnt = 0;
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ conf->subvolumes[cnt++] = subvols->xlator;
+
+ conf->subvolume_status = GF_CALLOC (cnt, sizeof (char),
+ gf_dht_mt_char);
+ if (!conf->subvolume_status) {
+ return -1;
+ }
+
+ conf->last_event = GF_CALLOC (cnt, sizeof (int),
+ gf_dht_mt_char);
+ if (!conf->last_event) {
+ return -1;
+ }
+
+ conf->subvol_up_time = GF_CALLOC (cnt, sizeof (time_t),
+ gf_dht_mt_subvol_time);
+ if (!conf->subvol_up_time) {
+ return -1;
+ }
+
+ conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
+ gf_dht_mt_dht_du_t);
+ if (!conf->du_stats) {
+ return -1;
+ }
+
+ return 0;
+}
+
+
+
+
+static int
+dht_migration_complete_check_done (int op_ret, call_frame_t *frame, void *data)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->rebalance.target_op_fn (THIS, frame, op_ret);
+
+ return 0;
+}
+
+
+int
+dht_migration_complete_check_task (void *data)
+{
+ int ret = -1;
+ xlator_t *src_node = NULL;
+ xlator_t *dst_node = NULL;
+ dht_local_t *local = NULL;
+ dict_t *dict = NULL;
+ dht_layout_t *layout = NULL;
+ struct iatt stbuf = {0,};
+ xlator_t *this = NULL;
+ call_frame_t *frame = NULL;
+ loc_t tmp_loc = {0,};
+ char *path = NULL;
+
+ this = THIS;
+ frame = data;
+ local = frame->local;
+
+ src_node = local->cached_subvol;
+
+ /* getxattr on cached_subvol for 'linkto' value */
+ if (!local->loc.inode)
+ ret = syncop_fgetxattr (src_node, local->fd, &dict,
+ DHT_LINKFILE_KEY);
+ else
+ ret = syncop_getxattr (src_node, &local->loc, &dict,
+ DHT_LINKFILE_KEY);
+
+ if (!ret)
+ dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
+
+ if (ret) {
+ if ((errno != ENOENT) || (!local->loc.inode)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to get the 'linkto' xattr %s",
+ local->loc.path, strerror (errno));
+ goto out;
+ }
+ /* Need to do lookup on hashed subvol, then get the file */
+ ret = syncop_lookup (this, &local->loc, NULL, &stbuf, NULL,
+ NULL);
+ if (ret)
+ goto out;
+ dst_node = dht_subvol_get_cached (this, local->loc.inode);
+ }
+
+ if (!dst_node) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to get the destination node",
+ local->loc.path);
+ ret = -1;
+ goto out;
+ }
+
+ /* lookup on dst */
+ if (local->loc.inode) {
+ ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL, NULL);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to lookup the file on %s",
+ local->loc.path, dst_node->name);
+ goto out;
+ }
+
+ if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: gfid different on the target file on %s",
+ local->loc.path, dst_node->name);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* update inode ctx (the layout) */
+ dht_layout_unref (this, local->layout);
+
+ if (!local->loc.inode)
+ ret = dht_layout_preset (this, dst_node, local->fd->inode);
+ else
+ ret = dht_layout_preset (this, dst_node, local->loc.inode);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "%s: could not set preset layout for subvol %s",
+ local->loc.path, dst_node->name);
+ ret = -1;
+ goto out;
+ }
+
+ layout = dht_layout_for_subvol (this, dst_node);
+ if (!layout) {
+ gf_log (this->name, GF_LOG_INFO,
+ "%s: no pre-set layout for subvolume %s",
+ local->loc.path, dst_node ? dst_node->name : "<nil>");
+ ret = -1;
+ goto out;
+ }
+
+ if (!local->loc.inode)
+ ret = dht_layout_set (this, local->fd->inode, layout);
+ else
+ ret = dht_layout_set (this, local->loc.inode, layout);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to set the new layout",
+ local->loc.path);
+ goto out;
+ }
+
+ local->cached_subvol = dst_node;
+ ret = 0;
+
+ if (!local->fd)
+ goto out;
+
+ /* once we detect the migration complete, the fd-ctx is no more
+ required.. delete the ctx, and do one extra 'fd_unref' for open fd */
+ ret = fd_ctx_del (local->fd, this, NULL);
+ if (!ret) {
+ fd_unref (local->fd);
+ ret = 0;
+ goto out;
+ }
+
+ /* if 'local->fd' (ie, fd based operation), send a 'open()' on
+ destination if not already done */
+ if (local->loc.inode) {
+ ret = syncop_open (dst_node, &local->loc,
+ local->fd->flags, local->fd);
+ } else {
+ tmp_loc.inode = local->fd->inode;
+ inode_path (local->fd->inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ ret = syncop_open (dst_node, &tmp_loc,
+ local->fd->flags, local->fd);
+ if (path)
+ GF_FREE (path);
+
+ }
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to send open() on target file at %s",
+ local->loc.path, dst_node->name);
+ goto out;
+ }
+
+ /* need this unref for the fd on src_node */
+ fd_unref (local->fd);
+ ret = 0;
+out:
+
+ return ret;
+}
+
+int
+dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ ret = synctask_new (conf->env, dht_migration_complete_check_task,
+ dht_migration_complete_check_done,
+ frame, frame);
+ return ret;
+}
+
+/* During 'in-progress' state, both nodes should have the file */
+static int
+dht_inprogress_check_done (int op_ret, call_frame_t *sync_frame, void *data)
+{
+ dht_local_t *local = NULL;
+
+ local = sync_frame->local;
+
+ local->rebalance.target_op_fn (THIS, sync_frame, op_ret);
+
+ return 0;
+}
+
+static int
+dht_rebalance_inprogress_task (void *data)
+{
+ int ret = -1;
+ xlator_t *src_node = NULL;
+ xlator_t *dst_node = NULL;
+ dht_local_t *local = NULL;
+ dict_t *dict = NULL;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ char *path = NULL;
+ struct iatt stbuf = {0,};
+ loc_t tmp_loc = {0,};
+
+ this = THIS;
+ frame = data;
+ local = frame->local;
+
+ src_node = local->cached_subvol;
+
+ /* getxattr on cached_subvol for 'linkto' value */
+ if (local->loc.inode)
+ ret = syncop_getxattr (src_node, &local->loc, &dict,
+ DHT_LINKFILE_KEY);
+ else
+ ret = syncop_fgetxattr (src_node, local->fd, &dict,
+ DHT_LINKFILE_KEY);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to get the 'linkto' xattr %s",
+ local->loc.path, strerror (errno));
+ goto out;
+ }
+
+ dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
+ if (!dst_node) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to get the 'linkto' xattr from dict",
+ local->loc.path);
+ ret = -1;
+ goto out;
+ }
+
+ local->rebalance.target_node = dst_node;
+
+ if (local->loc.inode) {
+ /* lookup on dst */
+ ret = syncop_lookup (dst_node, &local->loc, NULL,
+ &stbuf, NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to lookup the file on %s",
+ local->loc.path, dst_node->name);
+ goto out;
+ }
+
+ if (uuid_compare (stbuf.ia_gfid, local->loc.inode->gfid)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: gfid different on the target file on %s",
+ local->loc.path, dst_node->name);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = 0;
+
+ if (!local->fd)
+ goto out;
+
+ if (local->loc.inode) {
+ ret = syncop_open (dst_node, &local->loc,
+ local->fd->flags, local->fd);
+ } else {
+ tmp_loc.inode = local->fd->inode;
+ inode_path (local->fd->inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ ret = syncop_open (dst_node, &tmp_loc,
+ local->fd->flags, local->fd);
+ if (path)
+ GF_FREE (path);
+ }
+
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to send open() on target file at %s",
+ local->loc.path, dst_node->name);
+ goto out;
+ }
+
+ ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: failed to set fd-ctx target file at %s",
+ local->loc.path, dst_node->name);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame)
+{
+
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ ret = synctask_new (conf->env, dht_rebalance_inprogress_task,
+ dht_inprogress_check_done,
+ frame, frame);
+ return ret;
+}