diff options
author | Susant Palai <spalai@redhat.com> | 2017-04-17 13:00:54 +0530 |
---|---|---|
committer | Raghavendra G <rgowdapp@redhat.com> | 2017-04-19 01:35:14 -0400 |
commit | b350bcd6a3db2e92d1baa47a5ec02efd09f76f16 (patch) | |
tree | af5f3a473131a7978bbda2bdd74582904d0d0ab2 | |
parent | e3d9018f2ddc28548c0aa18960a3a524521c9ad7 (diff) |
cluster/dht: Skip file migration if the subvol that meets min-free-disk
criteria happens to be the same subvol containing data-file
Rebalance need to figure out a new subvol in case the hashed subvol
does not have enough space. In the process of figuring out the new subvol,
we need to ignore the source subvol, otherwise it will lead to data loss.
Test: Manual
Ran the following
sizeof /tmp/1: 1.5GB
sizeof /brick/1: 16GB
sizeof /tmp/2: 1.5GB
<start>
glusterd; gluster v create test1 vm1:/brick/1 vm1:/tmp/1;
gluster v start test1;
mount -t glusterfs vm1:test1 /mnt;
for i in {1..2000}
do
dd if=/dev/zero of=/mnt/file$i bs=1KB count=1 &> /dev/null;
done
gluster v add-brick test1 vm1:/tmp/2
gluster v set test1 min-free-disk 12GB
gluster v remove-brick test1 vm1:/tmp/1 star
<end>
file count and data were intact.
Change-Id: Ib8fc8467a3d48a7c12958824c4f0b88e160b86c1
BUG: 1441508
Signed-off-by: Susant Palai <spalai@redhat.com>
Reviewed-on: https://review.gluster.org/17064
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
-rw-r--r-- | xlators/cluster/dht/src/dht-common.h | 2 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-diskusage.c | 19 | ||||
-rw-r--r-- | xlators/cluster/dht/src/dht-rebalance.c | 96 |
3 files changed, 92 insertions, 25 deletions
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 4555152db54..0e082e35c57 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -1153,7 +1153,7 @@ dht_dir_has_layout (dict_t *xattr, char *name); gf_boolean_t dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); xlator_t * -dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, +dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, xlator_t *ignore, dht_layout_t *layout, uint64_t filesize); xlator_t * dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 13698a9616d..05592154e30 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -315,7 +315,7 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, LOCK (&conf->subvolume_lock); { - avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL, layout, 0); if(!avail_subvol) { @@ -340,8 +340,8 @@ out: } static inline -int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, - dht_layout_t *layout) +int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, xlator_t *ignore, + dht_layout_t *layout) { int ret = -1; int i = 0; @@ -349,6 +349,13 @@ int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, if (!this || !layout) goto out; + /* this check is meant for rebalance process. The source of the file + * should be ignored for space check */ + if (this == ignore) { + goto out; + } + + /* check if subvol has layout errors, before selecting it */ for (i = 0; i < layout->cnt; i++) { if (!strcmp (layout->list[i].xlator->name, this->name) && @@ -376,7 +383,7 @@ out: /*Get subvolume which has both space and inodes more than the min criteria*/ xlator_t * -dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, xlator_t *ignore, dht_layout_t *layout, uint64_t filesize) { int i = 0; @@ -398,7 +405,7 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, /* check if subvol has layout errors and also it is not a * decommissioned brick, before selecting it */ ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], - layout); + ignore, layout); if (ignore_subvol) continue; @@ -463,7 +470,7 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, /* check if subvol has layout errors and also it is not a * decommissioned brick, before selecting it*/ - ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], + ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], NULL, layout); if (ignore_subvol) continue; diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index d963cfb2b8f..c1174225138 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -719,27 +719,30 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc loc->path, to->name, strerror (-ret)); */ + ret = syncop_fsetattr (to, fd, stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL, NULL, NULL); + if (ret < 0) + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "chown failed for %s on %s (%s)", + loc->path, to->name, strerror (-ret)); + /* Fallocate does not work for size 0, hence the check. Anyway we don't * need to care about min-free-disk for 0 byte size file */ if (stbuf->ia_size > 0) { ret = syncop_fallocate (to, fd, 0, 0, stbuf->ia_size, NULL, NULL); - if (ret < 0) + if (ret < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, "fallocate failed for %s on %s (%s)", loc->path, to->name, strerror (-ret)); + ret = -1; + goto out; + } } - ret = syncop_fsetattr (to, fd, stbuf, - (GF_SET_ATTR_UID | GF_SET_ATTR_GID), - NULL, NULL, NULL, NULL); - if (ret < 0) - gf_msg (this->name, GF_LOG_ERROR, 0, - DHT_MSG_MIGRATE_FILE_FAILED, - "chown failed for %s on %s (%s)", - loc->path, to->name, strerror (-ret)); - /* success */ ret = 0; @@ -761,7 +764,8 @@ out: static int __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, int flag, dht_conf_t *conf, - gf_boolean_t *target_changed, xlator_t **new_subvol) + gf_boolean_t *target_changed, xlator_t **new_subvol, + gf_boolean_t *ignore_failure) { struct statvfs src_statfs = {0,}; struct statvfs dst_statfs = {0,}; @@ -773,6 +777,7 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, uint64_t dst_statfs_blocks = 1; double post_availspace = 0; double post_percent = 0; + int i = 0; this = THIS; @@ -897,13 +902,27 @@ find_new_subvol: goto out; } - *new_subvol = dht_subvol_with_free_space_inodes (this, to, - layout, stbuf->ia_size); - if (!(*new_subvol)) { + *new_subvol = dht_subvol_with_free_space_inodes (this, to, from, layout, + stbuf->ia_size); + if ((!(*new_subvol)) || (*new_subvol == from)) { gf_msg (this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE, "Could not find any subvol" - " with space accomodating the file. Consider adding " - "bricks"); + " with space accomodating the file - %s. Consider adding " + "bricks", loc->path); + + /* For remove-brick case if the source is not one of the + * removed-brick, do not mark the error as failure */ + if (conf->decommission_subvols_cnt) { + *ignore_failure = _gf_true; + for (i = 0; i < conf->decommission_subvols_cnt; i++) { + if (conf->decommissioned_bricks[i] == from) { + *ignore_failure = _gf_false; + break; + } + } + } else { + *ignore_failure = _gf_false; + } *target_changed = _gf_false; ret = -1; @@ -1382,6 +1401,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, gf_boolean_t target_changed = _gf_false; xlator_t *new_target = NULL; xlator_t *old_target = NULL; + fd_t *linkto_fd = NULL; + gf_boolean_t ignore_failure = _gf_false; defrag = conf->defrag; if (!defrag) @@ -1500,7 +1521,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, clean_dst = _gf_true; ret = __dht_check_free_space (to, from, loc, &stbuf, flag, conf, - &target_changed, &new_target); + &target_changed, &new_target, &ignore_failure); if (target_changed) { /* Can't handle for hardlinks. Marking this as failure */ if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) { @@ -1544,6 +1565,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } if (ret) { + if (ignore_failure) + ret = 0; + goto out; } @@ -1791,13 +1815,47 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } ret = syncop_setxattr (old_target, loc, dict, 0, NULL, NULL); - if (ret) { + if (ret && -ret != ESTALE && -ret != ENOENT) { gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, "failed to set xattr on %s in %s (%s)", loc->path, old_target->name, strerror (-ret)); ret = -1; goto out; + } else if (-ret == ESTALE || -ret == ENOENT) { + /* The failure ESTALE indicates that the linkto + * file on the hashed subvol might have been deleted. + * In this case will create a linkto file with new target + * as linkto xattr value*/ + linkto_fd = fd_create (loc->inode, DHT_REBALANCE_PID); + if (!linkto_fd) { + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "%s: fd create failed (%s)", + loc->path, strerror (errno)); + ret = -1; + goto out; + } + ret = syncop_create (old_target, loc, O_RDWR, + DHT_LINKFILE_MODE, linkto_fd, + NULL, dict, NULL); + if (ret != 0 && -ret != EEXIST && -ret != ESTALE) { + ret = -1; + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "failed to create linkto file on %s in %s (%s)", + loc->path, old_target->name, strerror (-ret)); + goto out; + } else if (ret == 0) { + ret = syncop_fsetattr (old_target, linkto_fd, &stbuf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID), + NULL, NULL, NULL, NULL); + if (ret < 0) + gf_msg (this->name, GF_LOG_ERROR, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "chown failed for %s on %s (%s)", + loc->path, old_target->name, strerror (-ret)); + } } } } @@ -2043,6 +2101,8 @@ out: syncop_close (dst_fd); if (src_fd) syncop_close (src_fd); + if (linkto_fd) + syncop_close (linkto_fd); loc_wipe (&tmp_loc); |