diff options
| author | Kevin Vigor <kvigor@fb.com> | 2016-02-09 16:28:05 -0800 |
|---|---|---|
| committer | Shreyas Siravara <sshreyas@fb.com> | 2016-12-19 11:06:05 -0800 |
| commit | 8b92a807576a2d29647e967a269bdd2d4faca167 (patch) | |
| tree | 4de6f3b828a6f771bd91c607c982ed37da0a20a2 | |
| parent | d83f9f467e47275e03d1fa979eed19960c9ef3bf (diff) | |
storage/posix: Add free space limits to bricks
Summary:
- Add a configurable minimum free space for bricks, using the new
options storage.min-free-disk (analogous to cluster.min-free-disk,
and using the same units: either a percentage or an
absolute number of bytes) and storage.freespace-check-interval
(how frequently to check free space, in seconds).
- This is a cherry-pick of D2920210 to 3.8
Signed-off-by: Shreyas Siravara <sshreyas@fb.com>
Change-Id: I4b87e421aad023e49b5972c6e61539670a818411
Reviewed-on: http://review.gluster.org/16176
Tested-by: Shreyas Siravara <sshreyas@fb.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Kevin Vigor <kvigor@fb.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
| -rw-r--r-- | tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t | 22 | ||||
| -rwxr-xr-x | tests/bugs/glusterd/bug-859927.t | 8 | ||||
| -rwxr-xr-x | tests/features/brick-min-free-space.t | 113 | ||||
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-volume-set.c | 8 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix-aio.c | 5 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 128 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.h | 12 |
7 files changed, 280 insertions, 16 deletions
diff --git a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t index 9fc7ac3b845..3bc80ab9dab 100644 --- a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t +++ b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t @@ -1,6 +1,6 @@ #!/bin/bash -## Test case for cluster.min-free-disk option validation. +## Test case for cluster.cluster.min-free-disk option validation. . $(dirname $0)/../../include.rc @@ -17,21 +17,21 @@ TEST $CLI volume create $V0 $H0:$B0/brick1 $H0:$B0/brick2 TEST $CLI volume start $V0 ## Setting invalid value for option cluster.min-free-disk should fail -TEST ! $CLI volume set $V0 min-free-disk "" -TEST ! $CLI volume set $V0 min-free-disk 143.!/12 -TEST ! $CLI volume set $V0 min-free-disk 123% -TEST ! $CLI volume set $V0 min-free-disk 194.34% +TEST ! $CLI volume set $V0 cluster.min-free-disk "" +TEST ! $CLI volume set $V0 cluster.min-free-disk 143.!/12 +TEST ! $CLI volume set $V0 cluster.min-free-disk 123% +TEST ! $CLI volume set $V0 cluster.min-free-disk 194.34% ## Setting fractional value as a size (unit is byte) for option ## cluster.min-free-disk should fail -TEST ! $CLI volume set $V0 min-free-disk 199.051 -TEST ! $CLI volume set $V0 min-free-disk 111.999 +TEST ! $CLI volume set $V0 cluster.min-free-disk 199.051 +TEST ! 
$CLI volume set $V0 cluster.min-free-disk 111.999 ## Setting valid value for option cluster.min-free-disk should pass -TEST $CLI volume set $V0 min-free-disk 12% -TEST $CLI volume set $V0 min-free-disk 56.7% -TEST $CLI volume set $V0 min-free-disk 120 -TEST $CLI volume set $V0 min-free-disk 369.0000 +TEST $CLI volume set $V0 cluster.min-free-disk 12% +TEST $CLI volume set $V0 cluster.min-free-disk 56.7% +TEST $CLI volume set $V0 cluster.min-free-disk 120 +TEST $CLI volume set $V0 cluster.min-free-disk 369.0000 cleanup; diff --git a/tests/bugs/glusterd/bug-859927.t b/tests/bugs/glusterd/bug-859927.t index c30d2b852d4..1b9ca18c08a 100755 --- a/tests/bugs/glusterd/bug-859927.t +++ b/tests/bugs/glusterd/bug-859927.t @@ -44,12 +44,12 @@ TEST ! $CLI volume set $V0 min-free-inodes " " TEST $CLI volume set $V0 min-free-inodes 60% EXPECT "60%" volume_option $V0 cluster.min-free-inodes -TEST ! $CLI volume set $V0 min-free-disk "" -TEST ! $CLI volume set $V0 min-free-disk " " -TEST $CLI volume set $V0 min-free-disk 60% +TEST ! $CLI volume set $V0 cluster.min-free-disk "" +TEST ! $CLI volume set $V0 cluster.min-free-disk " " +TEST $CLI volume set $V0 cluster.min-free-disk 60% EXPECT "60%" volume_option $V0 cluster.min-free-disk -TEST $CLI volume set $V0 min-free-disk 120 +TEST $CLI volume set $V0 cluster.min-free-disk 120 EXPECT "120" volume_option $V0 cluster.min-free-disk TEST ! $CLI volume set $V0 frame-timeout "" diff --git a/tests/features/brick-min-free-space.t b/tests/features/brick-min-free-space.t new file mode 100755 index 00000000000..4372998681f --- /dev/null +++ b/tests/features/brick-min-free-space.t @@ -0,0 +1,113 @@ +#!/bin/bash +# +# Test storage.min-free-disk option works. +# + +. $(dirname $0)/../include.rc +. 
$(dirname $0)/../volume.rc + +cleanup; + +TEST glusterd + +TEST truncate -s 16M $B0/brick0 +TEST LOOPDEV=$(losetup --find --show $B0/brick0) +TEST mkfs.xfs $LOOPDEV + +mkdir -p $B0/$V0 + +TEST mount -t xfs $LOOPDEV $B0/$V0 + +########### +# AIO on # +########### + +TEST $CLI volume create $V0 $H0:$B0/$V0 +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 readdir-ahead on +TEST $CLI vol set $V0 storage.linux-aio on + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +# Filesystem has ~12MB capacity after XFS and glusterfs overhead. +# A 16MB write should blow up. +TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct +TEST rm $M0/test + +# But we should be able to write 10MB +TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct + +# Now enable limit and set to at least 8MB free space +TEST $CLI volume set $V0 storage.freespace-check-interval 1 +TEST $CLI volume set $V0 storage.min-free-disk 8388608 + +# Now even a tiny write ought fail. +TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct +TEST rm $M0/test1 + +# Repeat using percent syntax. +TEST $CLI volume set $V0 storage.min-free-disk 33% + +TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct +TEST rm $M0/test1 + +# Disable limit. +TEST $CLI volume set $V0 storage.freespace-check-interval 0 + +# Now we can write again. +TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct + +TEST rm $M0/test1 +TEST rm $M0/test + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +############ +# AIO off # +############ + +TEST $CLI volume create $V0 $H0:$B0/$V0 +TEST $CLI volume start $V0 +TEST $CLI volume set $V0 readdir-ahead on +TEST $CLI vol set $V0 storage.linux-aio off + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +# Filesystem has ~12MB capacity after XFS and glusterfs overhead. +# A 16MB write should blow up. +TEST ! 
dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct +TEST rm $M0/test + +# But we should be able to write 10MB +TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct + +# Now enable limit and set to at least 8MB free space +TEST $CLI volume set $V0 storage.freespace-check-interval 1 +TEST $CLI volume set $V0 storage.min-free-disk 8388608 + +# Now even a tiny write ought fail. +TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct +TEST rm $M0/test1 + +# Repeat using percent syntax. +TEST $CLI volume set $V0 storage.min-free-disk 33% + +TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct +TEST rm $M0/test1 + +# Disable limit. +TEST $CLI volume set $V0 storage.freespace-check-interval 0 + +# Now we can write again. +TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct + +TEST rm $M0/test1 +TEST rm $M0/test + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0; +TEST $CLI volume stop $V0 +TEST $CLI volume delete $V0 + +cleanup; diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c index f4dd9fcde71..89e63144fad 100644 --- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c +++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c @@ -2517,6 +2517,14 @@ struct volopt_map_entry glusterd_volopt_map[] = { .voltype = "storage/posix", .op_version = GD_OP_VERSION_3_6_0, }, + { .key = "storage.min-free-disk", + .voltype = "storage/posix", + .op_version = 2, + }, + { .key = "storage.freespace-check-interval", + .voltype = "storage/posix", + .op_version = 2, + }, { .key = "storage.bd-aio", .voltype = "storage/bd", .op_version = 3 diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c index d8ef5f7b73f..636108affbb 100644 --- a/xlators/storage/posix/src/posix-aio.c +++ b/xlators/storage/posix/src/posix-aio.c @@ -331,6 +331,11 @@ posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; + if (!posix_write_ok (this, priv)) { + 
op_errno = ENOSPC; + goto err; + } + ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno); if (ret < 0) { gf_msg (this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL, diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index cecf5dcb66d..c40a087ec46 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -658,6 +658,81 @@ out: return 0; } +static gf_boolean_t freespace_ok (xlator_t *this, const struct statvfs *stats, + double min_free_disk, + gf_boolean_t previously_ok) +{ + gf_boolean_t currently_ok; + + if (min_free_disk < 100.0) { + double free_percent = 100.0 * stats->f_bavail / stats->f_blocks; + + currently_ok = + free_percent >= min_free_disk ? _gf_true : _gf_false; + if (previously_ok && !currently_ok) { + gf_log (this->name, GF_LOG_WARNING, + "min-free-disk limit exceeded: free percent " + "%f%% < %f%%. Writes disabled.", + free_percent, min_free_disk); + } + } else { + double free_bytes = stats->f_bavail * stats->f_frsize; + + currently_ok = + free_bytes >= min_free_disk ? _gf_true : _gf_false; + if (previously_ok && !currently_ok) { + gf_log (this->name, GF_LOG_WARNING, + "min-free-disk limit exceeded: free bytes %f " + "< %f. Writes disabled.", + free_bytes, min_free_disk); + } + } + + if (currently_ok && !previously_ok) { + gf_log (this->name, GF_LOG_INFO, "Free space has risen above " + "min-free-disk limit, writes " + "re-enabled."); + } + + return currently_ok; +} + +gf_boolean_t +posix_write_ok (xlator_t *this, struct posix_private *priv) +{ + /* Check if there is sufficient free space to allow writes. + * + * This is called in the write path, so performance matters. We + * periodically sample free space by calling statvfs(). + * freespace_check_lock is used to ensure only one process at a + * time makes the call; if the lock is contended, the previous + * status (reflected in freespace_check_passed) is used while + * the process that holds the mutex updates the current status. 
+ */ + if (!priv->freespace_check_interval) { + return _gf_true; + } + + if (!pthread_mutex_trylock (&priv->freespace_check_lock)) { + struct timespec now; + + clock_gettime (CLOCK_MONOTONIC, &now); + if (now.tv_sec >= priv->freespace_check_last.tv_sec + + priv->freespace_check_interval) { + sys_statvfs (priv->base_path, &priv->freespace_stats); + priv->freespace_check_last.tv_sec = now.tv_sec; + + priv->freespace_check_passed = freespace_ok ( + this, &priv->freespace_stats, priv->min_free_disk, + priv->freespace_check_passed); + } + + pthread_mutex_unlock (&priv->freespace_check_lock); + } + + return priv->freespace_check_passed; +} + static int32_t posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, off_t offset, size_t len, @@ -667,6 +742,7 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t op_errno = 0; struct posix_fd *pfd = NULL; gf_boolean_t locked = _gf_false; + struct posix_private *priv = this->private; DECLARE_OLD_FS_ID_VAR; @@ -675,6 +751,12 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (priv, out); + + if (!posix_write_ok (this, priv)) { + ret = -ENOSPC; + goto out; + } ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno); if (ret < 0) { @@ -3307,6 +3389,12 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, VALIDATE_OR_GOTO (priv, out); + if (!posix_write_ok (this, priv)) { + op_errno = ENOSPC; + op_ret = -1; + goto out; + } + ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno); if (ret < 0) { gf_msg (this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, @@ -6671,6 +6759,16 @@ struct posix_private *priv = NULL; options, uint32, out); posix_spawn_health_check_thread (this); + pthread_mutex_lock (&priv->freespace_check_lock); + { + GF_OPTION_RECONF ("freespace-check-interval", + priv->freespace_check_interval, + options, uint32, out); + GF_OPTION_RECONF 
("min-free-disk", priv->min_free_disk, options, + percent_or_size, out); + } + pthread_mutex_unlock (&priv->freespace_check_lock); + ret = 0; out: return ret; @@ -7285,6 +7383,19 @@ init (xlator_t *this) GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec, uint32, out); + + GF_OPTION_INIT ("freespace-check-interval", + _private->freespace_check_interval, uint32, out); + + GF_OPTION_INIT ("min-free-disk", _private->min_free_disk, + percent_or_size, out); + + pthread_mutex_init (&_private->freespace_check_lock, NULL); + sys_statvfs (_private->base_path, &_private->freespace_stats); + clock_gettime (CLOCK_MONOTONIC, &_private->freespace_check_last); + _private->freespace_check_passed = freespace_ok ( + this, &_private->freespace_stats, _private->min_free_disk, + _gf_true); out: return ret; } @@ -7462,5 +7573,22 @@ struct volume_options options[] = { "\t- Strip: Will strip the user namespace before setting. The raw filesystem will work in OS X.\n" }, #endif + { .key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "2%", + .description = "Minimum percentage/size of disk space, after which we" + "start failing writes with ENOSPC." + }, + { + .key = {"freespace-check-interval"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .default_value = "5", + .validate = GF_OPT_VALIDATE_MIN, + .description = "Interval in seconds between freespace measurements " + "used for the min-free-disk determination. " + "Set to 0 to disable." + }, + { .key = {NULL} } }; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h index 87f91e57747..ef4bc66ecbc 100644 --- a/xlators/storage/posix/src/posix.h +++ b/xlators/storage/posix/src/posix.h @@ -174,7 +174,14 @@ struct posix_private { XATTR_BOTH, } xattr_user_namespace; #endif - + /* freespace_check_lock protects access to following three fields. 
*/ + pthread_mutex_t freespace_check_lock; + struct timespec freespace_check_last; + struct statvfs freespace_stats; + double min_free_disk; + /* mutex protection ends. */ + uint32_t freespace_check_interval; + gf_boolean_t freespace_check_passed; }; typedef struct { @@ -263,6 +270,9 @@ posix_get_ancestry (xlator_t *this, inode_t *leaf_inode, void posix_gfid_unset (xlator_t *this, dict_t *xdata); +gf_boolean_t +posix_write_ok (xlator_t *this, struct posix_private *priv); + int posix_pacl_set (const char *path, const char *key, const char *acl_s); |
