summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKevin Vigor <kvigor@fb.com>2016-02-09 16:28:05 -0800
committerShreyas Siravara <sshreyas@fb.com>2016-12-19 11:06:05 -0800
commit8b92a807576a2d29647e967a269bdd2d4faca167 (patch)
tree4de6f3b828a6f771bd91c607c982ed37da0a20a2
parentd83f9f467e47275e03d1fa979eed19960c9ef3bf (diff)
storage/posix: Add free space limits to bricks
Summary: - Add a configurable minimum free space for bricks, using the new options storage.min-free-disk (analogous to cluster.min-free-disk, and using the same units: either a percentage or an absolute number of bytes) and storage.freespace-check-interval (how frequently to check free space, in seconds). - This is a cherry-pick of D2920210 to 3.8 Signed-off-by: Shreyas Siravara <sshreyas@fb.com> Change-Id: I4b87e421aad023e49b5972c6e61539670a818411 Reviewed-on: http://review.gluster.org/16176 Tested-by: Shreyas Siravara <sshreyas@fb.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> Smoke: Gluster Build System <jenkins@build.gluster.org> Reviewed-by: Kevin Vigor <kvigor@fb.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
-rw-r--r--tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t22
-rwxr-xr-xtests/bugs/glusterd/bug-859927.t8
-rwxr-xr-xtests/features/brick-min-free-space.t113
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c8
-rw-r--r--xlators/storage/posix/src/posix-aio.c5
-rw-r--r--xlators/storage/posix/src/posix.c128
-rw-r--r--xlators/storage/posix/src/posix.h12
7 files changed, 280 insertions, 16 deletions
diff --git a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
index 9fc7ac3b845..3bc80ab9dab 100644
--- a/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
+++ b/tests/bugs/glusterd/bug-1163108-min-free-disk-option-validation.t
@@ -1,6 +1,6 @@
#!/bin/bash
-## Test case for cluster.min-free-disk option validation.
+## Test case for cluster.min-free-disk option validation.
. $(dirname $0)/../../include.rc
@@ -17,21 +17,21 @@ TEST $CLI volume create $V0 $H0:$B0/brick1 $H0:$B0/brick2
TEST $CLI volume start $V0
## Setting invalid value for option cluster.min-free-disk should fail
-TEST ! $CLI volume set $V0 min-free-disk ""
-TEST ! $CLI volume set $V0 min-free-disk 143.!/12
-TEST ! $CLI volume set $V0 min-free-disk 123%
-TEST ! $CLI volume set $V0 min-free-disk 194.34%
+TEST ! $CLI volume set $V0 cluster.min-free-disk ""
+TEST ! $CLI volume set $V0 cluster.min-free-disk 143.!/12
+TEST ! $CLI volume set $V0 cluster.min-free-disk 123%
+TEST ! $CLI volume set $V0 cluster.min-free-disk 194.34%
## Setting fractional value as a size (unit is byte) for option
## cluster.min-free-disk should fail
-TEST ! $CLI volume set $V0 min-free-disk 199.051
-TEST ! $CLI volume set $V0 min-free-disk 111.999
+TEST ! $CLI volume set $V0 cluster.min-free-disk 199.051
+TEST ! $CLI volume set $V0 cluster.min-free-disk 111.999
## Setting valid value for option cluster.min-free-disk should pass
-TEST $CLI volume set $V0 min-free-disk 12%
-TEST $CLI volume set $V0 min-free-disk 56.7%
-TEST $CLI volume set $V0 min-free-disk 120
-TEST $CLI volume set $V0 min-free-disk 369.0000
+TEST $CLI volume set $V0 cluster.min-free-disk 12%
+TEST $CLI volume set $V0 cluster.min-free-disk 56.7%
+TEST $CLI volume set $V0 cluster.min-free-disk 120
+TEST $CLI volume set $V0 cluster.min-free-disk 369.0000
cleanup;
diff --git a/tests/bugs/glusterd/bug-859927.t b/tests/bugs/glusterd/bug-859927.t
index c30d2b852d4..1b9ca18c08a 100755
--- a/tests/bugs/glusterd/bug-859927.t
+++ b/tests/bugs/glusterd/bug-859927.t
@@ -44,12 +44,12 @@ TEST ! $CLI volume set $V0 min-free-inodes " "
TEST $CLI volume set $V0 min-free-inodes 60%
EXPECT "60%" volume_option $V0 cluster.min-free-inodes
-TEST ! $CLI volume set $V0 min-free-disk ""
-TEST ! $CLI volume set $V0 min-free-disk " "
-TEST $CLI volume set $V0 min-free-disk 60%
+TEST ! $CLI volume set $V0 cluster.min-free-disk ""
+TEST ! $CLI volume set $V0 cluster.min-free-disk " "
+TEST $CLI volume set $V0 cluster.min-free-disk 60%
EXPECT "60%" volume_option $V0 cluster.min-free-disk
-TEST $CLI volume set $V0 min-free-disk 120
+TEST $CLI volume set $V0 cluster.min-free-disk 120
EXPECT "120" volume_option $V0 cluster.min-free-disk
TEST ! $CLI volume set $V0 frame-timeout ""
diff --git a/tests/features/brick-min-free-space.t b/tests/features/brick-min-free-space.t
new file mode 100755
index 00000000000..4372998681f
--- /dev/null
+++ b/tests/features/brick-min-free-space.t
@@ -0,0 +1,113 @@
+#!/bin/bash
+#
+# Test storage.min-free-disk option works.
+#
+# A small XFS filesystem on a loop device is used as the brick so that the
+# free-space limit can be reached with only a few megabytes of writes. The
+# same sequence of checks is run twice: once with storage.linux-aio on and
+# once with it off.
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+
+# Back the brick with a 16MB file exposed as a loop device and formatted XFS.
+TEST truncate -s 16M $B0/brick0
+TEST LOOPDEV=$(losetup --find --show $B0/brick0)
+TEST mkfs.xfs $LOOPDEV
+
+mkdir -p $B0/$V0
+
+TEST mount -t xfs $LOOPDEV $B0/$V0
+
+###########
+# AIO on #
+###########
+
+TEST $CLI volume create $V0 $H0:$B0/$V0
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 readdir-ahead on
+TEST $CLI vol set $V0 storage.linux-aio on
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# Filesystem has ~12MB capacity after XFS and glusterfs overhead.
+# A 16MB write should blow up.
+TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct
+TEST rm $M0/test
+
+# But we should be able to write 10MB
+TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct
+
+# Now enable limit and set to at least 8MB free space
+# (8388608 bytes; free space is re-sampled every 1 second).
+TEST $CLI volume set $V0 storage.freespace-check-interval 1
+TEST $CLI volume set $V0 storage.min-free-disk 8388608
+
+# Now even a tiny write ought to fail.
+TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct
+TEST rm $M0/test1
+
+# Repeat using percent syntax.
+TEST $CLI volume set $V0 storage.min-free-disk 33%
+
+TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+TEST rm $M0/test1
+
+# Disable limit (an interval of 0 turns the free-space check off).
+TEST $CLI volume set $V0 storage.freespace-check-interval 0
+
+# Now we can write again.
+TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+
+TEST rm $M0/test1
+TEST rm $M0/test
+
+# Tear the volume down so it can be recreated for the second pass.
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+############
+# AIO off #
+############
+
+TEST $CLI volume create $V0 $H0:$B0/$V0
+TEST $CLI volume start $V0
+TEST $CLI volume set $V0 readdir-ahead on
+TEST $CLI vol set $V0 storage.linux-aio off
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
+
+# Filesystem has ~12MB capacity after XFS and glusterfs overhead.
+# A 16MB write should blow up.
+TEST ! dd if=/dev/zero of=$M0/test bs=1M count=16 oflag=direct
+TEST rm $M0/test
+
+# But we should be able to write 10MB
+TEST dd if=/dev/zero of=$M0/test bs=1M count=10 oflag=direct
+
+# Now enable limit and set to at least 8MB free space
+# (8388608 bytes; free space is re-sampled every 1 second).
+TEST $CLI volume set $V0 storage.freespace-check-interval 1
+TEST $CLI volume set $V0 storage.min-free-disk 8388608
+
+# Now even a tiny write ought to fail.
+TEST ! dd if=/dev/zero of=$M0/test1 bs=1M count=1 oflag=direct
+TEST rm $M0/test1
+
+# Repeat using percent syntax.
+TEST $CLI volume set $V0 storage.min-free-disk 33%
+
+TEST ! dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+TEST rm $M0/test1
+
+# Disable limit (an interval of 0 turns the free-space check off).
+TEST $CLI volume set $V0 storage.freespace-check-interval 0
+
+# Now we can write again.
+TEST dd if=/dev/zero of=$M0/test1 bs=4K count=1 oflag=direct
+
+TEST rm $M0/test1
+TEST rm $M0/test
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index f4dd9fcde71..89e63144fad 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -2517,6 +2517,14 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.voltype = "storage/posix",
.op_version = GD_OP_VERSION_3_6_0,
},
+ { .key = "storage.min-free-disk",
+ .voltype = "storage/posix",
+ .op_version = 2,
+ },
+ { .key = "storage.freespace-check-interval",
+ .voltype = "storage/posix",
+ .op_version = 2,
+ },
{ .key = "storage.bd-aio",
.voltype = "storage/bd",
.op_version = 3
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c
index d8ef5f7b73f..636108affbb 100644
--- a/xlators/storage/posix/src/posix-aio.c
+++ b/xlators/storage/posix/src/posix-aio.c
@@ -331,6 +331,11 @@ posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
priv = this->private;
+ if (!posix_write_ok (this, priv)) {
+ op_errno = ENOSPC;
+ goto err;
+ }
+
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
gf_msg (this->name, GF_LOG_WARNING, op_errno, P_MSG_PFD_NULL,
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index cecf5dcb66d..c40a087ec46 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -658,6 +658,81 @@ out:
return 0;
}
+/*
+ * Decide whether a filesystem sample satisfies the min-free-disk limit.
+ *
+ * @this:          xlator; used only for the log prefix.
+ * @stats:         statvfs() sample of the brick's filesystem.
+ * @min_free_disk: the limit. Values below 100.0 are interpreted as a
+ *                 percentage of f_blocks; values of 100.0 or more are
+ *                 interpreted as a byte count.
+ * @previously_ok: outcome of the preceding check. It does not influence
+ *                 the result; it only lets the function log once per state
+ *                 transition instead of on every sample.
+ *
+ * Returns _gf_true when the available space is at or above the limit,
+ * _gf_false otherwise.
+ */
+static gf_boolean_t freespace_ok (xlator_t *this, const struct statvfs *stats,
+                                  double min_free_disk,
+                                  gf_boolean_t previously_ok)
+{
+        gf_boolean_t currently_ok;
+
+        if (min_free_disk < 100.0) {
+                /* A sample with f_blocks == 0 would otherwise produce NaN,
+                 * which compares false against every limit (even 0%) and
+                 * shows up as "nan" in the log. Treat it as 0% free. */
+                double free_percent = 0.0;
+
+                if (stats->f_blocks != 0) {
+                        free_percent = 100.0 * (double) stats->f_bavail /
+                                       (double) stats->f_blocks;
+                }
+
+                currently_ok =
+                        free_percent >= min_free_disk ? _gf_true : _gf_false;
+                if (previously_ok && !currently_ok) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "min-free-disk limit exceeded: free percent "
+                                "%f%% < %f%%. Writes disabled.",
+                                free_percent, min_free_disk);
+                }
+        } else {
+                /* Convert before multiplying: the product of two integer
+                 * statvfs fields can wrap where those types are 32 bits
+                 * wide, under-reporting the free space. */
+                double free_bytes = (double) stats->f_bavail *
+                                    (double) stats->f_frsize;
+
+                currently_ok =
+                        free_bytes >= min_free_disk ? _gf_true : _gf_false;
+                if (previously_ok && !currently_ok) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "min-free-disk limit exceeded: free bytes %f "
+                                "< %f. Writes disabled.",
+                                free_bytes, min_free_disk);
+                }
+        }
+
+        if (currently_ok && !previously_ok) {
+                gf_log (this->name, GF_LOG_INFO, "Free space has risen above "
+                                                 "min-free-disk limit, writes "
+                                                 "re-enabled.");
+        }
+
+        return currently_ok;
+}
+
+/*
+ * Report whether the brick currently has enough free space to accept
+ * writes.
+ *
+ * @this: xlator; used for logging.
+ * @priv: posix private state holding the limit, the check interval and
+ *        the cached result.
+ *
+ * Returns _gf_true when the check is disabled (interval of 0) or when the
+ * most recent free-space evaluation passed; _gf_false otherwise.
+ */
+gf_boolean_t
+posix_write_ok (xlator_t *this, struct posix_private *priv)
+{
+        /* This is called in the write path, so performance matters. Free
+         * space is sampled with statvfs() at most once per
+         * freespace_check_interval seconds. freespace_check_lock ensures
+         * only one thread at a time takes the sample; if the lock is
+         * contended, the previous status (freespace_check_passed) is used
+         * while the thread holding the mutex refreshes it.
+         */
+        if (!priv->freespace_check_interval) {
+                return _gf_true;
+        }
+
+        if (!pthread_mutex_trylock (&priv->freespace_check_lock)) {
+                struct timespec now;
+
+                clock_gettime (CLOCK_MONOTONIC, &now);
+                if (now.tv_sec >= priv->freespace_check_last.tv_sec +
+                                  priv->freespace_check_interval) {
+                        struct statvfs sample;
+
+                        /* Advance the timestamp even on failure so a
+                         * persistently failing statvfs() is retried once
+                         * per interval rather than on every write. */
+                        priv->freespace_check_last.tv_sec = now.tv_sec;
+
+                        if (sys_statvfs (priv->base_path, &sample) == 0) {
+                                priv->freespace_stats = sample;
+                                priv->freespace_check_passed = freespace_ok (
+                                        this, &priv->freespace_stats,
+                                        priv->min_free_disk,
+                                        priv->freespace_check_passed);
+                        } else {
+                                /* Do not evaluate a sample that was never
+                                 * filled in; keep the last known status. */
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "statvfs on %s failed (errno %d); "
+                                        "keeping previous free space status.",
+                                        priv->base_path, errno);
+                        }
+                }
+
+                pthread_mutex_unlock (&priv->freespace_check_lock);
+        }
+
+        /* Read without the lock on purpose: writers that lost the trylock
+         * race use the cached result instead of blocking. */
+        return priv->freespace_check_passed;
+}
+
static int32_t
posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t flags, off_t offset, size_t len,
@@ -667,6 +742,7 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t op_errno = 0;
struct posix_fd *pfd = NULL;
gf_boolean_t locked = _gf_false;
+ struct posix_private *priv = this->private;
DECLARE_OLD_FS_ID_VAR;
@@ -675,6 +751,12 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (priv, out);
+
+ if (!posix_write_ok (this, priv)) {
+ ret = -ENOSPC;
+ goto out;
+ }
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
@@ -3307,6 +3389,12 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
VALIDATE_OR_GOTO (priv, out);
+ if (!posix_write_ok (this, priv)) {
+ op_errno = ENOSPC;
+ op_ret = -1;
+ goto out;
+ }
+
ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
if (ret < 0) {
gf_msg (this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL,
@@ -6671,6 +6759,16 @@ struct posix_private *priv = NULL;
options, uint32, out);
posix_spawn_health_check_thread (this);
+ pthread_mutex_lock (&priv->freespace_check_lock);
+ {
+ GF_OPTION_RECONF ("freespace-check-interval",
+ priv->freespace_check_interval,
+ options, uint32, out);
+ GF_OPTION_RECONF ("min-free-disk", priv->min_free_disk, options,
+ percent_or_size, out);
+ }
+ pthread_mutex_unlock (&priv->freespace_check_lock);
+
ret = 0;
out:
return ret;
@@ -7285,6 +7383,19 @@ init (xlator_t *this)
GF_OPTION_INIT ("batch-fsync-delay-usec", _private->batch_fsync_delay_usec,
uint32, out);
+
+ GF_OPTION_INIT ("freespace-check-interval",
+ _private->freespace_check_interval, uint32, out);
+
+ GF_OPTION_INIT ("min-free-disk", _private->min_free_disk,
+ percent_or_size, out);
+
+ pthread_mutex_init (&_private->freespace_check_lock, NULL);
+ sys_statvfs (_private->base_path, &_private->freespace_stats);
+ clock_gettime (CLOCK_MONOTONIC, &_private->freespace_check_last);
+ _private->freespace_check_passed = freespace_ok (
+ this, &_private->freespace_stats, _private->min_free_disk,
+ _gf_true);
out:
return ret;
}
@@ -7462,5 +7573,22 @@ struct volume_options options[] = {
"\t- Strip: Will strip the user namespace before setting. The raw filesystem will work in OS X.\n"
},
#endif
+        /* Free-space floor for the brick: a percentage or an absolute size. */
+        { .key = {"min-free-disk"},
+          .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+          .default_value = "2%",
+          .description = "Minimum percentage/size of disk space, after which we "
+                         "start failing writes with ENOSPC."
+        },
+ {
+ .key = {"freespace-check-interval"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .default_value = "5",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Interval in seconds between freespace measurements "
+ "used for the min-free-disk determination. "
+ "Set to 0 to disable."
+ },
+
{ .key = {NULL} }
};
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index 87f91e57747..ef4bc66ecbc 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -174,7 +174,14 @@ struct posix_private {
XATTR_BOTH,
} xattr_user_namespace;
#endif
-
+ /* freespace_check_lock protects access to following three fields. */
+ pthread_mutex_t freespace_check_lock;
+ struct timespec freespace_check_last;
+ struct statvfs freespace_stats;
+ double min_free_disk;
+ /* mutex protection ends. */
+ uint32_t freespace_check_interval;
+ gf_boolean_t freespace_check_passed;
};
typedef struct {
@@ -263,6 +270,9 @@ posix_get_ancestry (xlator_t *this, inode_t *leaf_inode,
void
posix_gfid_unset (xlator_t *this, dict_t *xdata);
+/* Returns _gf_true when the brick's free-space check is disabled or its
+ * most recent evaluation passed, _gf_false otherwise. Write paths call this
+ * first and fail with ENOSPC on _gf_false. */
+gf_boolean_t
+posix_write_ok (xlator_t *this, struct posix_private *priv);
+
int
posix_pacl_set (const char *path, const char *key, const char *acl_s);