diff options
author | Susant Palai <spalai@redhat.com> | 2017-04-11 17:27:17 +0530 |
---|---|---|
committer | Shyamsundar Ranganathan <srangana@redhat.com> | 2017-08-11 12:18:11 +0000 |
commit | c90bffe08466baba369a23964a68b9ac59eb0394 (patch) | |
tree | d2e8730263449b444bc64939278523d9941567e7 | |
parent | bfc241ab7d0fbb2c9202c8f88a2d543cb4605f80 (diff) |
cluster/dht: Make rebalance honor min-free-disk
test: Manual
created files of size 1K on 2 brick(of size 1GB) setup .
added a brick of size 16GB.
set min-free-disk to 12GB(so that first two bricks won't receive any files).
removed one of the 1st brick of size 1GB.
Logs from test:
[2017-04-12 08:52:08.196484] W [MSGID: 0] [dht-rebalance.c:895:__dht_check_free_space]
0-test1-dht: Write will cross min-free-disk for file - /tile32 on subvol - test1-client-1.
Looking for new subvol.
[2017-04-12 08:52:08.196904] I [MSGID: 0] [dht-rebalance.c:925:__dht_check_free_space]
0-test1-dht: new target found - test1-client-2 for file - /tile32
- Post migration we have two files. The new destination (/brick/1) has the data file
[root@vm1 ~]# ll /brick/1/tile32
-rw-r--r--. 2 root root 0 Apr 12 14:22 /brick/1/tile32
- On the old target the linkto file is there with linkto xattr pointing to /brick/1
[root@vm1 ~]# ll /tmp/2/tile32
---------T. 2 root root 1000 Apr 12 14:22 /tmp/2/tile32
[root@vm1 ~]# getfattr -m . -de text /tmp/2/tile32
getfattr: Removing leading '/' from absolute path names
security.selinux="unconfined_u:object_r:user_tmp_t:s0"
trusted.gfid="����:Aс�#�/'b2"
trusted.glusterfs.dht.linkto="test1-client-2"
Marking ./tests/features/worm_sh.t as bad test.
Reason being, this patch failed on master branch as well and it has nothing
to do with rebalance/remove-brick.
> BUG: 1441508
> Change-Id: I90bae251cda3d957a49cdceda90cd08311a392fb
> Signed-off-by: Susant Palai <spalai@redhat.com>
> Reviewed-on: https://review.gluster.org/17034
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
> Reviewed-by: Amar Tumballi <amarts@redhat.com>
> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> Signed-off-by: Susant Palai <spalai@redhat.com>
Change-Id: I90bae251cda3d957a49cdceda90cd08311a392fb
BUG: 1473132
Signed-off-by: Susant Palai <spalai@redhat.com>
Reviewed-on: https://review.gluster.org/17831
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com>
-rwxr-xr-x | tests/bugs/distribute/bug-1161311.t | 25 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 5 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-diskusage.c | 31 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 214 | ||||
-rw-r--r-- | xlators/storage/posix/src/posix.c | 2 |
5 files changed, 258 insertions, 19 deletions
diff --git a/tests/bugs/distribute/bug-1161311.t b/tests/bugs/distribute/bug-1161311.t index c5a7f041ac8..64fd17193e9 100755 --- a/tests/bugs/distribute/bug-1161311.t +++ b/tests/bugs/distribute/bug-1161311.t @@ -15,6 +15,27 @@ . $(dirname $0)/../../include.rc . $(dirname $0)/../../volume.rc +cleanup +TEST truncate -s 10GB $B0/brick1 +TEST truncate -s 10GB $B0/brick2 +TEST truncate -s 10GB $B0/brick3 + +TEST LO1=`SETUP_LOOP $B0/brick1` +TEST MKFS_LOOP $LO1 + +TEST LO2=`SETUP_LOOP $B0/brick2` +TEST MKFS_LOOP $LO2 + +TEST LO3=`SETUP_LOOP $B0/brick3` +TEST MKFS_LOOP $LO3 + +TEST mkdir -p $B0/${V0}1 $B0/${V0}2 $B0/${V0}3 + + +TEST MOUNT_LOOP $LO1 $B0/${V0}1 +TEST MOUNT_LOOP $LO2 $B0/${V0}2 +TEST MOUNT_LOOP $LO3 $B0/${V0}3 + checksticky () { i=0; while [ ! -k $1 ]; do @@ -31,7 +52,6 @@ checksticky () { return 0 } -cleanup; TEST glusterd TEST pidof glusterd @@ -125,5 +145,6 @@ TEST ln ./dir1/FILE7 ./FILE7 cd / linkcountsrc=$(stat -c %h $M0/dir1/FILE1) TEST [[ $linkcountsrc == 14 ]] - +UMOUNT_LOOP ${B0}/${V0}{1..3} +rm -f ${B0}/brick{1..3} cleanup; diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 6b60c5e522f..56918707449 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -316,6 +316,9 @@ struct dht_du { uint64_t avail_space; uint32_t log; uint32_t chunks; + uint32_t total_blocks; + uint32_t avail_blocks; + uint32_t frsize; /*fragment size*/ }; typedef struct dht_du dht_du_t; @@ -1160,7 +1163,7 @@ gf_boolean_t dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); xlator_t * dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, - dht_layout_t *layout); + dht_layout_t *layout, uint64_t filesize); xlator_t * dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, dht_layout_t *layout); diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 06957434c58..13698a9616d 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -81,7 +81,11 @@ dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this, conf->du_stats[i].avail_space = bytes; conf->du_stats[i].avail_inodes = percent_inodes; conf->du_stats[i].chunks = chunks; - gf_msg_debug (this->name, 0, + conf->du_stats[i].total_blocks = statvfs->f_blocks; + conf->du_stats[i].avail_blocks = statvfs->f_bavail; + conf->du_stats[i].frsize = statvfs->f_frsize; + + gf_msg_debug (this->name, 0, "subvolume '%s': avail_percent " "is: %.2f and avail_space " "is: %" PRIu64" and avail_inodes" @@ -312,7 +316,7 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, LOCK (&conf->subvolume_lock); { avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, - layout); + layout, 0); if(!avail_subvol) { avail_subvol = dht_subvol_maxspace_nonzeroinode(this, @@ -373,12 +377,17 @@ out: /*Get subvolume which has both space and inodes more than the min criteria*/ xlator_t * dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, - dht_layout_t *layout) + dht_layout_t *layout, uint64_t filesize) { int i = 0; double max = 0; double max_inodes = 0; int ignore_subvol = 0; + uint64_t total_blocks = 0; + uint64_t avail_blocks = 0; + uint64_t frsize = 0; + double post_availspace = 0; + double post_percent = 0; xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; @@ -401,6 +410,9 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, max = conf->du_stats[i].avail_percent; max_inodes = conf->du_stats[i].avail_inodes; avail_subvol = conf->subvolumes[i]; + total_blocks = conf->du_stats[i].total_blocks; + avail_blocks = conf->du_stats[i].avail_blocks; + frsize = conf->du_stats[i].frsize; } } @@ -416,6 +428,19 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, } } + if (avail_subvol) { + if (conf->disk_unit == 'p') { + post_availspace = (avail_blocks * frsize) - filesize; + post_percent = (post_availspace * 100) / (total_blocks * frsize); + if (post_percent < conf->min_free_disk) + avail_subvol = NULL; + } + if (conf->disk_unit != 'p') { + if ((max - filesize) < conf->min_free_disk) + avail_subvol = NULL; + } + } + return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 5d3426679ee..de7b5146c56 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -707,12 +707,29 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc "%s: failed to set xattr on %s (%s)", loc->path, to->name, strerror (-ret)); + + /* TODO: Need to add a detailed comment about why we moved away from + ftruncate. + ret = syncop_ftruncate (to, fd, stbuf->ia_size, NULL, NULL); if (ret < 0) gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, "ftruncate failed for %s on %s (%s)", loc->path, to->name, strerror (-ret)); + */ + + /* Fallocate does not work for size 0, hence the check. Anyway we don't + * need to care about min-free-disk for 0 byte size file */ + if (stbuf->ia_size > 0) { + ret = syncop_fallocate (to, fd, 0, 0, stbuf->ia_size, NULL, + NULL); + if (ret < 0) + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "fallocate failed for %s on %s (%s)", + loc->path, to->name, strerror (-ret)); + } ret = syncop_fsetattr (to, fd, stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), @@ -743,16 +760,19 @@ out: static int __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, - struct iatt *stbuf, int flag) + struct iatt *stbuf, int flag, dht_conf_t *conf, + gf_boolean_t *target_changed, xlator_t **new_subvol) { struct statvfs src_statfs = {0,}; struct statvfs dst_statfs = {0,}; int ret = -1; xlator_t *this = NULL; dict_t *xdata = NULL; - + dht_layout_t *layout = NULL; uint64_t src_statfs_blocks = 1; uint64_t dst_statfs_blocks = 1; + double post_availspace = 0; + double post_percent = 0; this = THIS; @@ -794,6 +814,10 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, goto out; } + gf_msg_debug (this->name, 0, "min_free_disk - %f , block available - %lu ," + " block size - %lu ", conf->min_free_disk, dst_statfs.f_bavail, + dst_statfs.f_bsize); + /* if force option is given, do not check for space @ dst. * Check only if space is avail for the file */ if (flag != GF_DHT_MIGRATE_DATA) @@ -832,18 +856,66 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, goto out; } } + + check_avail_space: - if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { - gf_msg (this->name, GF_LOG_ERROR, 0, - DHT_MSG_MIGRATE_FILE_FAILED, - "data movement attempted from node (%s) to node (%s) " - "which does not have required free space for (%s)", - from->name, to->name, loc->path); + + if (conf->disk_unit == 'p' && dst_statfs.f_blocks) { + post_availspace = (dst_statfs.f_bavail * dst_statfs.f_frsize) - stbuf->ia_size; + post_percent = (post_availspace * 100) / (dst_statfs.f_blocks * dst_statfs.f_frsize); + if (post_percent < conf->min_free_disk) { + gf_msg (this->name, GF_LOG_WARNING, 0, 0, + "Write will cross min-free-disk for " + "file - %s on subvol - %s. Looking " + "for new subvol", loc->path, to->name); + + goto find_new_subvol; + } else { + ret = 0; + goto out; + } + } + + if (conf->disk_unit != 'p' && + ((dst_statfs.f_bavail * dst_statfs.f_frsize) - stbuf->ia_size) < conf->min_free_disk) { + gf_msg (this->name, GF_LOG_WARNING, 0, 0, "Write will cross " + "min-free-disk for file - %s on subvol - %s. Looking " + "for new subvol", loc->path, to->name); + + goto find_new_subvol; + } else { + ret = 0; + goto out; + } + + +find_new_subvol: + layout = dht_layout_get (this, loc->parent); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, "Layout is NULL"); ret = -1; goto out; } + *new_subvol = dht_subvol_with_free_space_inodes (this, to, + layout, stbuf->ia_size); + if (!(*new_subvol)) { + gf_msg (this->name, GF_LOG_WARNING, 0, + DHT_MSG_SUBVOL_INSUFF_SPACE, "Could not find any subvol" + " with space accomodating the file. Consider adding " + "bricks"); + + *target_changed = _gf_false; + ret = -1; + goto out; + } else { + gf_msg (this->name, GF_LOG_INFO, 0, 0, "new target found - %s" + " for file - %s", (*new_subvol)->name, loc->path); + *target_changed = _gf_true; + ret = 0; + goto out; + } + ret = 0; out: if (xdata) @@ -1307,6 +1379,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, lock_migration_info_t locklist; dict_t *meta_dict = NULL; gf_boolean_t meta_locked = _gf_false; + gf_boolean_t target_changed = _gf_false; + xlator_t *new_target = NULL; + xlator_t *old_target = NULL; defrag = conf->defrag; if (!defrag) @@ -1416,12 +1491,57 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, /* create the destination, with required modes/xattr */ ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, &dst_fd, xattr); - if (ret) + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, 0, "Create dst failed" + " on - %s for file - %s", to->name, loc->path); goto out; + } clean_dst = _gf_true; - ret = __dht_check_free_space (to, from, loc, &stbuf, flag); + ret = __dht_check_free_space (to, from, loc, &stbuf, flag, conf, + &target_changed, &new_target); + if (target_changed) { + /* Can't handle for hardlinks. Marking this as failure */ + if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_SUBVOL_INSUFF_SPACE, "Exiting migration for" + " file - %s. flag - %d, stbuf.ia_nlink - %d", + loc->path, flag, stbuf.ia_nlink); + ret = -1; + goto out; + } + + + ret = syncop_ftruncate (to, dst_fd, 0, NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", + loc->path, to->name, strerror (-ret)); + ret = -1; + } + + syncop_close (dst_fd); + + old_target = to; + to = new_target; + + /* if the file migration is successful to this new target, then + * update the xattr on the old destination to point the new + * destination. We need to do update this only post migration + * as in case of failure the linkto needs to point to the source + * subvol */ + ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, + &dst_fd, xattr); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Create dst failed" + " on - %s for file - %s", to->name, loc->path); + goto out; + } else { + gf_msg (this->name, GF_LOG_INFO, 0, 0, "destination for file " + "- %s is changed to - %s", loc->path, to->name); + } + } if (ret) { goto out; @@ -1651,6 +1771,36 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, ret = -1; } + if (target_changed) { + if (!dict) { + dict = dict_new (); + if (!dict) { + ret = -1; + goto out; + } + } else { + dict_del (dict, conf->link_xattr_name); + dict_del (dict, GLUSTERFS_POSIXLK_COUNT); + ret = dict_set_str (dict, conf->link_xattr_name, to->name); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set xattr in dict for %s (linkto:%s)", + loc->path, to->name); + ret = -1; + goto out; + } + + ret = syncop_setxattr (old_target, loc, dict, 0, NULL, NULL); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "failed to set xattr on %s in %s (%s)", + loc->path, old_target->name, strerror (-ret)); + ret = -1; + goto out; + } + } + } clean_dst = _gf_false; @@ -2151,6 +2301,8 @@ gf_defrag_migrate_single_file (void *opaque) inode_t *inode = NULL; xlator_t *hashed_subvol = NULL; xlator_t *cached_subvol = NULL; + call_frame_t *statfs_frame = NULL; + xlator_t *old_THIS = NULL; rebal_entry = (struct dht_container *)opaque; if (!rebal_entry) { @@ -2256,6 +2408,20 @@ gf_defrag_migrate_single_file (void *opaque) /* use the inode returned by inode_link */ entry_loc.inode = inode; + old_THIS = THIS; + THIS = this; + statfs_frame = create_frame (this, this->ctx->pool); + if (!statfs_frame) { + gf_msg (this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "Insufficient memory. Frame creation failed"); + ret = -1; + goto out; + } + + /* async statfs information for honoring min-free-disk */ + dht_get_du_info (statfs_frame, this, loc); + THIS = old_THIS; + ret = syncop_setxattr (this, &entry_loc, migrate_data, 0, NULL, NULL); if (ret < 0) { op_errno = -ret; @@ -2334,6 +2500,10 @@ gf_defrag_migrate_single_file (void *opaque) } out: + if (statfs_frame) { + STACK_DESTROY (statfs_frame->root); + } + loc_wipe (&entry_loc); return ret; @@ -2594,7 +2764,7 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container, continue; } - /*Build Container Structure */ + /*Build Container Structure */ tmp_container = GF_CALLOC (1, sizeof(struct dht_container), gf_dht_mt_container_t); @@ -3748,6 +3918,8 @@ gf_defrag_start_crawl (void *data) pthread_t *tid = NULL; gf_boolean_t is_tier_detach = _gf_false; int j = 0; + call_frame_t *statfs_frame = NULL; + xlator_t *old_THIS = NULL; this = data; if (!this) @@ -3784,6 +3956,21 @@ gf_defrag_start_crawl (void *data) goto out; } + old_THIS = THIS; + THIS = this; + + statfs_frame = create_frame (this, this->ctx->pool); + if (!statfs_frame) { + gf_msg (this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM, + "Insufficient memory. Frame creation failed"); + ret = -1; + goto out; + } + + /* async statfs update for honoring min-free-disk */ + dht_get_du_info (statfs_frame, this, &loc); + THIS = old_THIS; + fix_layout = dict_new (); if (!fix_layout) { ret = -1; @@ -4052,6 +4239,9 @@ out: if (migrate_data) dict_unref (migrate_data); + if (statfs_frame) { + STACK_DESTROY (statfs_frame->root); + } exit: return ret; } diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index 47a477e4470..5f3f3388046 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -718,7 +718,7 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } - if (dict_get (xdata, GLUSTERFS_WRITE_UPDATE_ATOMIC)) { + if (xdata && dict_get (xdata, GLUSTERFS_WRITE_UPDATE_ATOMIC)) { locked = _gf_true; pthread_mutex_lock (&ctx->write_atomic_lock); } |